alpcentaur
/
basabuuka_prototyp


								# cython: infer_types=True

								# cython: profile=True

								# coding: utf8

								from __future__ import unicode_literals


								import numpy

								cimport numpy as np

								import cytoolz

								from collections import OrderedDict

								import ujson


								from .util import msgpack

								from .util import msgpack_numpy


								from thinc.api import chain

								from thinc.v2v import Affine, SELU, Softmax

								from thinc.t2v import Pooling, max_pool, mean_pool

								from thinc.neural.util import to_categorical, copy_array

								from thinc.neural._classes.difference import Siamese, CauchySimilarity


								from .tokens.doc cimport Doc

								from .syntax.nn_parser cimport Parser

								from .syntax import nonproj

								from .syntax.ner cimport BiluoPushDown

								from .syntax.arc_eager cimport ArcEager

								from .morphology cimport Morphology

								from .vocab cimport Vocab

								from .syntax import nonproj

								from .compat import json_dumps


								from .attrs import POS

								from .parts_of_speech import X

								from ._ml import Tok2Vec, build_text_classifier, build_tagger_model

								from ._ml import link_vectors_to_models, zero_init, flatten

								from ._ml import create_default_optimizer

								from .errors import Errors, TempErrors

								from . import util


								class SentenceSegmenter(object):
								    """Pipeline hook that installs a custom sentence boundary strategy.

								    The component does not need a dependency parse. It stores a `strategy`
								    callable and registers it as the `sents` user hook of every `Doc` it is
								    called with. A strategy is a generator function that receives a `Doc`
								    and yields one `Span` per sentence. To switch strategies, pass one on
								    initialization or assign a new one to the `.strategy` attribute.
								    """
								    name = 'sbd'

								    def __init__(self, vocab, strategy=None):
								        """Store the vocab and pick the segmentation strategy.

								        vocab: Stored on the instance as `self.vocab`.
								        strategy: A generator function, or None / 'on_punct' to select the
								            built-in `split_on_punct` strategy.
								        """
								        self.vocab = vocab
								        use_builtin = strategy is None or strategy == 'on_punct'
								        self.strategy = self.split_on_punct if use_builtin else strategy

								    def __call__(self, doc):
								        """Register the strategy as the doc's `sents` hook and return the doc."""
								        doc.user_hooks['sents'] = self.strategy
								        return doc

								    @staticmethod
								    def split_on_punct(doc):
								        """Yield slices of `doc`, splitting after '.', '!' or '?'.

								        A boundary is only emitted at the first non-punctuation token that
								        follows a terminator, so runs of trailing punctuation stay attached
								        to the sentence they close.
								        """
								        terminators = ('.', '!', '?')
								        sent_start = 0
								        boundary_pending = False
								        for token in doc:
								            if boundary_pending and not token.is_punct:
								                yield doc[sent_start:token.i]
								                sent_start = token.i
								                boundary_pending = False
								                continue
								            if token.text in terminators:
								                boundary_pending = True
								        n_tokens = len(doc)
								        # Emit whatever is left after the last boundary.
								        if sent_start < n_tokens:
								            yield doc[sent_start:n_tokens]


								def merge_noun_chunks(doc):
								    """Collapse every noun chunk of a parsed document into one token.

								    doc (Doc): The Doc object.
								    RETURNS (Doc): The same Doc object. It is returned unchanged when
								        `doc.is_parsed` is false.
								    """
								    if not doc.is_parsed:
								        return doc
								    # Collect the merge arguments up front, before any merge call touches
								    # the document.
								    pending = []
								    for chunk in doc.noun_chunks:
								        pending.append((chunk.start_char, chunk.end_char,
								                        chunk.root.tag, chunk.root.dep))
								    for start_char, end_char, root_tag, root_dep in pending:
								        doc.merge(start_char, end_char, tag=root_tag, dep=root_dep)
								    return doc


								def merge_entities(doc):
								    """Collapse every entity span of a document into one token.

								    doc (Doc): The Doc object.
								    RETURNS (Doc): The same Doc object with its entities merged.
								    """
								    # Collect the merge arguments up front, before any merge call touches
								    # the document.
								    pending = []
								    for ent in doc.ents:
								        pending.append((ent.start_char, ent.end_char, ent.root.tag,
								                        ent.root.dep, ent.label))
								    for start_char, end_char, root_tag, root_dep, label in pending:
								        doc.merge(start_char, end_char, tag=root_tag, dep=root_dep,
								                  ent_type=label)
								    return doc


								class Pipe(object):
								    """This class is not instantiated directly. Components inherit from it, and
								    it defines the interface that components should follow to function as
								    components in a spaCy analysis pipeline.

								    Subclasses are expected to provide `self.vocab`, `self.model` and
								    `self.cfg`, which the training and serialization helpers below read.
								    """
								    name = None

								    @classmethod
								    def Model(cls, *shape, **kwargs):
								        """Initialize a model for the pipe."""
								        raise NotImplementedError

								    def __init__(self, vocab, model=True, **cfg):
								        """Create a new pipe instance."""
								        raise NotImplementedError

								    def __call__(self, doc):
								        """Apply the pipe to one document. The document is
								        modified in-place, and returned.

								        Both __call__ and pipe should delegate to the `predict()`
								        and `set_annotations()` methods.
								        """
								        scores, tensors = self.predict([doc])
								        self.set_annotations([doc], scores, tensors=tensors)
								        return doc

								    def pipe(self, stream, batch_size=128, n_threads=-1):
								        """Apply the pipe to a stream of documents.

								        Both __call__ and pipe should delegate to the `predict()`
								        and `set_annotations()` methods.

								        stream (iterable): The documents to process.
								        batch_size (int): Number of documents per `predict()` call.
								        n_threads (int): Not used by this implementation.
								        YIELDS: The processed documents, in input order.
								        """
								        for docs in cytoolz.partition_all(batch_size, stream):
								            docs = list(docs)
								            scores, tensors = self.predict(docs)
								            # The keyword has to match the `tensors` parameter declared by
								            # set_annotations(), exactly as __call__ passes it.
								            self.set_annotations(docs, scores, tensors=tensors)
								            yield from docs

								    def predict(self, docs):
								        """Apply the pipeline's model to a batch of docs, without
								        modifying them.
								        """
								        raise NotImplementedError

								    def set_annotations(self, docs, scores, tensors=None):
								        """Modify a batch of documents, using pre-computed scores."""
								        raise NotImplementedError

								    def update(self, docs, golds, drop=0., sgd=None, losses=None):
								        """Learn from a batch of documents and gold-standard information,
								        updating the pipe's model.

								        Delegates to predict() and get_loss().
								        """
								        raise NotImplementedError

								    def get_loss(self, docs, golds, scores):
								        """Find the loss and gradient of loss for the batch of
								        documents and their predicted scores."""
								        raise NotImplementedError

								    def add_label(self, label):
								        """Add an output label, to be predicted by the model.

								        It's possible to extend pre-trained models with new labels,
								        but care should be taken to avoid the "catastrophic forgetting"
								        problem.
								        """
								        raise NotImplementedError

								    def create_optimizer(self):
								        """Build the default optimizer for this pipe's model, configured
								        from the optional 'optimizer' entry of `self.cfg`."""
								        return create_default_optimizer(self.model.ops,
								                                        **self.cfg.get('optimizer', {}))

								    def begin_training(self, gold_tuples=tuple(), pipeline=None, sgd=None,
								                       **kwargs):
								        """Initialize the pipe for training, using data examples if available.
								        If no model has been initialized yet, the model is added.

								        RETURNS: The optimizer passed in as `sgd`, or a newly created one.
								        """
								        if self.model is True:
								            self.model = self.Model(**self.cfg)
								        link_vectors_to_models(self.vocab)
								        if sgd is None:
								            sgd = self.create_optimizer()
								        return sgd

								    def use_params(self, params):
								        """Modify the pipe's model, to use the given parameter values.

								        This is a plain generator: the parameters are swapped in when it is
								        advanced to its single `yield`.
								        """
								        with self.model.use_params(params):
								            yield

								    def to_bytes(self, **exclude):
								        """Serialize the pipe to a bytestring."""
								        serialize = OrderedDict()
								        serialize['cfg'] = lambda: json_dumps(self.cfg)
								        if self.model in (True, False, None):
								            # No model allocated yet: store the placeholder value itself.
								            serialize['model'] = lambda: self.model
								        else:
								            serialize['model'] = self.model.to_bytes
								        serialize['vocab'] = self.vocab.to_bytes
								        return util.to_bytes(serialize, exclude)

								    def from_bytes(self, bytes_data, **exclude):
								        """Load the pipe from a bytestring."""
								        def load_model(b):
								            # TODO: Remove this once we don't have to handle previous models
								            if self.cfg.get('pretrained_dims') and 'pretrained_vectors' not in self.cfg:
								                self.cfg['pretrained_vectors'] = self.vocab.vectors.name
								            if self.model is True:
								                self.model = self.Model(**self.cfg)
								            self.model.from_bytes(b)

								        # 'cfg' is listed first because load_model() reads self.cfg.
								        deserialize = OrderedDict((
								            ('cfg', lambda b: self.cfg.update(ujson.loads(b))),
								            ('vocab', lambda b: self.vocab.from_bytes(b)),
								            ('model', load_model),
								        ))
								        util.from_bytes(bytes_data, deserialize, exclude)
								        return self

								    def to_disk(self, path, **exclude):
								        """Serialize the pipe to disk."""
								        def save_cfg(p):
								            # Context managers make sure the file handles are closed.
								            with p.open('w') as file_:
								                file_.write(json_dumps(self.cfg))

								        def save_model(p):
								            with p.open('wb') as file_:
								                file_.write(self.model.to_bytes())

								        serialize = OrderedDict()
								        serialize['cfg'] = save_cfg
								        serialize['vocab'] = lambda p: self.vocab.to_disk(p)
								        if self.model not in (None, True, False):
								            serialize['model'] = save_model
								        util.to_disk(path, serialize, exclude)

								    def from_disk(self, path, **exclude):
								        """Load the pipe from disk."""
								        def load_model(p):
								            # TODO: Remove this once we don't have to handle previous models
								            if self.cfg.get('pretrained_dims') and 'pretrained_vectors' not in self.cfg:
								                self.cfg['pretrained_vectors'] = self.vocab.vectors.name
								            if self.model is True:
								                self.model = self.Model(**self.cfg)
								            with p.open('rb') as file_:
								                self.model.from_bytes(file_.read())

								        # 'cfg' is listed first because load_model() reads self.cfg.
								        deserialize = OrderedDict((
								            ('cfg', lambda p: self.cfg.update(_load_cfg(p))),
								            ('vocab', lambda p: self.vocab.from_disk(p)),
								            ('model', load_model),
								        ))
								        util.from_disk(path, deserialize, exclude)
								        return self


								def _load_cfg(path):
								    """Parse the JSON file at `path`; return an empty dict if it is missing."""
								    if not path.exists():
								        return {}
								    with path.open() as file_:
								        return ujson.load(file_)


								class Tensorizer(Pipe):
								    """Assign position-sensitive vectors to tokens, using a CNN or RNN."""
								    name = 'tensorizer'

								    @classmethod
								    def Model(cls, output_size=300, input_size=384, **cfg):
								        """Create a new statistical model for the class.

								        The model chains two SELU layers and a zero-initialised Affine
								        layer.

								        output_size (int): First argument of every layer; presumably the
								            output width (TODO confirm against the thinc layer signatures).
								        input_size (int): Second argument of the first SELU layer;
								            presumably the input width.
								        **cfg: Config parameters. Not used here.
								        RETURNS (Model): A `thinc.neural.Model` or similar instance.
								        """
								        model = chain(
								                    SELU(output_size, input_size),
								                    SELU(output_size, output_size),
								                    zero_init(Affine(output_size, output_size)))
								        return model

								    def __init__(self, vocab, model=True, **cfg):
								        """Construct a new statistical model. Weights are not allocated on
								        initialisation.

								        vocab (Vocab): A `Vocab` instance. The model must share the same
								            `Vocab` instance with the `Doc` objects it will process.
								        model (Model): A `Model` instance or `True` allocate one later.
								        **cfg: Config parameters.

								        EXAMPLE:
								            >>> from spacy.pipeline import Tensorizer
								            >>> tensorizer = Tensorizer(nlp.vocab)
								            >>> tensorizer.model = tensorizer.Model()
								        """
								        self.vocab = vocab
								        self.model = model
								        # Filled by begin_training() with the tok2vec models of the
								        # other pipeline components.
								        self.input_models = []
								        self.cfg = dict(cfg)
								        self.cfg.setdefault('cnn_maxout_pieces', 3)

								    def __call__(self, doc):
								        """Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM
								        model. Vectors are set to the `Doc.tensor` attribute.

								        doc (Doc): The document to add vectors to.
								        RETURNS (Doc): The same document.
								        """
								        tokvecses = self.predict([doc])
								        self.set_annotations([doc], tokvecses)
								        return doc

								    def pipe(self, stream, batch_size=128, n_threads=-1):
								        """Process `Doc` objects as a stream.

								        stream (iterator): A sequence of `Doc` objects to process.
								        batch_size (int): Number of `Doc` objects to group.
								        n_threads (int): Number of threads. Not used by this implementation.
								        YIELDS (iterator): A sequence of `Doc` objects, in order of input.
								        """
								        for docs in cytoolz.partition_all(batch_size, stream):
								            docs = list(docs)
								            tensors = self.predict(docs)
								            self.set_annotations(docs, tensors)
								            yield from docs

								    def predict(self, docs):
								        """Run the model over the existing tensors of a batch of documents.

								        docs (iterable): A sequence of `Doc` objects.
								        RETURNS (object): Vector representations for each token in the
								            docs, split back into one entry per document.
								        """
								        inputs = self.model.ops.flatten([doc.tensor for doc in docs])
								        outputs = self.model(inputs)
								        return self.model.ops.unflatten(outputs, [len(d) for d in docs])

								    def set_annotations(self, docs, tensors):
								        """Set the tensor attribute for a batch of documents.

								        docs (iterable): A sequence of `Doc` objects.
								        tensors (object): Vector representation for each token in the docs.
								        RAISES (ValueError): If a tensor's first dimension does not equal
								            the length of its document.
								        """
								        for doc, tensor in zip(docs, tensors):
								            if tensor.shape[0] != len(doc):
								                raise ValueError(Errors.E076.format(rows=tensor.shape[0], words=len(doc)))
								            doc.tensor = tensor

								    def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
								        """Update the model.

								        docs (iterable): A batch of `Doc` objects. A single `Doc` is
								            wrapped in a list.
								        golds (iterable): A batch of `GoldParse` objects.
								        state: Not used by this implementation.
								        drop (float): The dropout rate.
								        sgd (callable): An optimizer.
								        losses (dict): Optional. The loss is added under `self.name`.
								        RETURNS: The loss computed by `get_loss()`.
								        """
								        if isinstance(docs, Doc):
								            docs = [docs]
								        inputs = []
								        bp_inputs = []
								        for tok2vec in self.input_models:
								            tensor, bp_tensor = tok2vec.begin_update(docs, drop=drop)
								            inputs.append(tensor)
								            bp_inputs.append(bp_tensor)
								        # Join the outputs of all input models column-wise.
								        inputs = self.model.ops.xp.hstack(inputs)
								        scores, bp_scores = self.model.begin_update(inputs, drop=drop)
								        loss, d_scores = self.get_loss(docs, golds, scores)
								        d_inputs = bp_scores(d_scores, sgd=sgd)
								        # Split the gradient into one part per input model and
								        # backpropagate each part.
								        d_inputs = self.model.ops.xp.split(d_inputs, len(self.input_models), axis=1)
								        for d_input, bp_input in zip(d_inputs, bp_inputs):
								            bp_input(d_input, sgd=sgd)
								        if losses is not None:
								            losses.setdefault(self.name, 0.)
								            losses[self.name] += loss
								        return loss

								    def get_loss(self, docs, golds, prediction):
								        """Compare the prediction against the tokens' `vector` attributes.

								        docs (iterable): A batch of `Doc` objects.
								        golds (iterable): Not used by this implementation.
								        prediction: The model output for the batch.
								        RETURNS (tuple): `(loss, d_scores)`, where `d_scores` is the
								            difference to the stacked token vectors divided by the number
								            of rows, and `loss` is the sum of its squares.
								        """
								        target = []
								        for doc in docs:
								            vectors = self.model.ops.xp.vstack([w.vector for w in doc])
								            target.append(vectors)
								        target = self.model.ops.xp.vstack(target)
								        d_scores = (prediction - target) / prediction.shape[0]
								        loss = (d_scores**2).sum()
								        return loss, d_scores

								    def begin_training(self, gold_tuples=tuple(), pipeline=None, sgd=None,
								                        **kwargs):
								        """Allocate models, pre-process training data and acquire an
								        optimizer.

								        gold_tuples (iterable): Gold-standard training data.
								        pipeline (list): The pipeline the model is part of, as
								            `(name, component)` pairs. May be None.
								        sgd (callable): An optimizer. One is created if None.
								        RETURNS: The optimizer.
								        """
								        # `pipeline` defaults to None, which is not iterable.
								        if pipeline is not None:
								            for name, model in pipeline:
								                if getattr(model, 'tok2vec', None):
								                    self.input_models.append(model.tok2vec)
								        if self.model is True:
								            self.cfg['input_size'] = 384
								            self.cfg['output_size'] = 300
								            self.model = self.Model(**self.cfg)
								        link_vectors_to_models(self.vocab)
								        if sgd is None:
								            sgd = self.create_optimizer()
								        return sgd


								class Tagger(Pipe):
								    """Pipeline component that assigns tag IDs to tokens.

								    The available tags are the tag names of `vocab.morphology`.
								    """
								    name = 'tagger'

								    def __init__(self, vocab, model=True, **cfg):
								        """Store the vocab, model and config. No model is allocated here.

								        vocab (Vocab): The vocabulary whose morphology holds the tag map.
								        model: A model instance, or True to allocate one later.
								        **cfg: Config parameters, stored sorted by key.
								        """
								        self.vocab = vocab
								        self.model = model
								        self.cfg = OrderedDict(sorted(cfg.items()))
								        self.cfg.setdefault('cnn_maxout_pieces', 2)

								    @property
								    def labels(self):
								        """The tag names of the vocab's morphology."""
								        return self.vocab.morphology.tag_names

								    @property
								    def tok2vec(self):
								        """The model's tok2vec layer chained with `flatten`, or None while
								        no model has been allocated."""
								        if self.model in (None, True, False):
								            return None
								        else:
								            return chain(self.model.tok2vec, flatten)

								    def __call__(self, doc):
								        """Tag one document in place and return it."""
								        tags, tokvecs = self.predict([doc])
								        self.set_annotations([doc], tags, tensors=tokvecs)
								        return doc

								    def pipe(self, stream, batch_size=128, n_threads=-1):
								        """Tag a stream of documents in batches of `batch_size`.

								        n_threads is not used by this implementation.
								        YIELDS: The processed documents, in input order.
								        """
								        for docs in cytoolz.partition_all(batch_size, stream):
								            docs = list(docs)
								            tag_ids, tokvecs = self.predict(docs)
								            self.set_annotations(docs, tag_ids, tensors=tokvecs)
								            yield from docs

								    def predict(self, docs):
								        """Score a batch of documents without modifying them.

								        RETURNS (tuple): `(guesses, tokvecs)`: one array of highest-scoring
								            tag IDs per document, and the tok2vec output.
								        """
								        tokvecs = self.model.tok2vec(docs)
								        scores = self.model.softmax(tokvecs)
								        guesses = []
								        for doc_scores in scores:
								            doc_guesses = doc_scores.argmax(axis=1)
								            # Anything that is not a numpy array is converted with
								            # .get() (presumably a GPU array; TODO confirm).
								            if not isinstance(doc_guesses, numpy.ndarray):
								                doc_guesses = doc_guesses.get()
								            guesses.append(doc_guesses)
								        return guesses, tokvecs

								    def set_annotations(self, docs, batch_tag_ids, tensors=None):
								        """Write predicted tags to the tokens and mark the docs as tagged.

								        docs: A batch of `Doc` objects, or a single `Doc`.
								        batch_tag_ids: One sequence of tag IDs per document.
								        tensors: Optional per-document tensors that are appended to each
								            doc with `Doc.extend_tensor`.
								        """
								        if isinstance(docs, Doc):
								            docs = [docs]
								        cdef Doc doc
								        cdef int idx = 0
								        cdef Vocab vocab = self.vocab
								        for i, doc in enumerate(docs):
								            doc_tag_ids = batch_tag_ids[i]
								            if hasattr(doc_tag_ids, 'get'):
								                doc_tag_ids = doc_tag_ids.get()
								            for j, tag_id in enumerate(doc_tag_ids):
								                # Don't clobber preset POS tags
								                if doc.c[j].tag == 0 and doc.c[j].pos == 0:
								                    # Don't clobber preset lemmas
								                    lemma = doc.c[j].lemma
								                    vocab.morphology.assign_tag_id(&doc.c[j], tag_id)
								                    if lemma != 0 and lemma != doc.c[j].lex.orth:
								                        doc.c[j].lemma = lemma
								                idx += 1
								            if tensors is not None:
								                if isinstance(doc.tensor, numpy.ndarray) \
								                and not isinstance(tensors[i], numpy.ndarray):
								                    doc.extend_tensor(tensors[i].get())
								                else:
								                    doc.extend_tensor(tensors[i])
								            doc.is_tagged = True

								    def update(self, docs, golds, drop=0., sgd=None, losses=None):
								        """Run one training step on a batch.

								        If `losses` is given, the loss is added under `self.name`.
								        """
								        if losses is not None and self.name not in losses:
								            losses[self.name] = 0.

								        tag_scores, bp_tag_scores = self.model.begin_update(docs, drop=drop)
								        loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)
								        bp_tag_scores(d_tag_scores, sgd=sgd)

								        if losses is not None:
								            losses[self.name] += loss

								    def get_loss(self, docs, golds, scores):
								        """Compute the loss and gradient for a batch of predicted scores.

								        RETURNS (tuple): `(loss, d_scores)`: the sum of squared gradient
								            entries as a float, and the gradient split per document.
								        """
								        scores = self.model.ops.flatten(scores)
								        tag_index = {tag: i for i, tag in enumerate(self.labels)}
								        cdef int idx = 0
								        correct = numpy.zeros((scores.shape[0],), dtype='i')
								        guesses = scores.argmax(axis=1)
								        for gold in golds:
								            for tag in gold.tags:
								                if tag is None:
								                    # No gold tag: use the model's own guess as the target.
								                    correct[idx] = guesses[idx]
								                else:
								                    correct[idx] = tag_index[tag]
								                idx += 1
								        correct = self.model.ops.xp.array(correct, dtype='i')
								        d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
								        d_scores /= d_scores.shape[0]
								        loss = (d_scores**2).sum()
								        d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
								        return float(loss), d_scores

								    def begin_training(self, gold_tuples=tuple(), pipeline=None, sgd=None,
								                       **kwargs):
								        """Rebuild the tag map from the training data, allocate the model if
								        needed and return an optimizer.

								        Tags seen in `gold_tuples` keep their entry from the current tag
								        map; tags missing from it are mapped to `{POS: X}`.
								        """
								        orig_tag_map = dict(self.vocab.morphology.tag_map)
								        new_tag_map = OrderedDict()
								        for raw_text, annots_brackets in gold_tuples:
								            for annots, brackets in annots_brackets:
								                ids, words, tags, heads, deps, ents = annots
								                for tag in tags:
								                    if tag in orig_tag_map:
								                        new_tag_map[tag] = orig_tag_map[tag]
								                    else:
								                        new_tag_map[tag] = {POS: X}
								        cdef Vocab vocab = self.vocab
								        if new_tag_map:
								            vocab.morphology = Morphology(vocab.strings, new_tag_map,
								                                          vocab.morphology.lemmatizer,
								                                          exc=vocab.morphology.exc)
								        self.cfg['pretrained_vectors'] = kwargs.get('pretrained_vectors')
								        if self.model is True:
								            self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
								        link_vectors_to_models(self.vocab)
								        if sgd is None:
								            sgd = self.create_optimizer()
								        return sgd

								    @classmethod
								    def Model(cls, n_tags, **cfg):
								        """Build the tagger model for `n_tags` output classes.

								        RAISES (ValueError): If `pretrained_dims` is set in the config
								            without `pretrained_vectors`.
								        """
								        if cfg.get('pretrained_dims') and not cfg.get('pretrained_vectors'):
								            raise ValueError(TempErrors.T008)
								        return build_tagger_model(n_tags, **cfg)

								    def add_label(self, label, values=None):
								        """Add a tag to the vocab's tag map.

								        label: The tag name.
								        values (dict): The tag map entry; defaults to `{POS: "X"}`.
								        RETURNS (int): 0 if the label already exists, otherwise 1.
								        RAISES (ValueError): If a model has already been allocated.
								        """
								        if label in self.labels:
								            return 0
								        if self.model not in (True, False, None):
								            # Here's how the model resizing will work, once the
								            # neuron-to-tag mapping is no longer controlled by
								            # the Morphology class, which sorts the tag names.
								            # The sorting makes adding labels difficult.
								            # smaller = self.model._layers[-1]
								            # larger = Softmax(len(self.labels)+1, smaller.nI)
								            # copy_array(larger.W[:smaller.nO], smaller.W)
								            # copy_array(larger.b[:smaller.nO], smaller.b)
								            # self.model._layers[-1] = larger
								            raise ValueError(TempErrors.T003)
								        tag_map = dict(self.vocab.morphology.tag_map)
								        if values is None:
								            values = {POS: "X"}
								        tag_map[label] = values
								        self.vocab.morphology = Morphology(
								            self.vocab.strings, tag_map=tag_map,
								            lemmatizer=self.vocab.morphology.lemmatizer,
								            exc=self.vocab.morphology.exc)
								        return 1

								    def use_params(self, params):
								        """Use the given parameter values in the model.

								        This is a plain generator: the parameters are swapped in when it is
								        advanced to its single `yield`.
								        """
								        with self.model.use_params(params):
								            yield

								    def to_bytes(self, **exclude):
								        """Serialize the model, vocab, config and tag map to a bytestring."""
								        serialize = OrderedDict()
								        if self.model in (None, True, False):
								            # No model allocated yet: store the placeholder value itself.
								            serialize['model'] = lambda: self.model
								        else:
								            serialize['model'] = self.model.to_bytes
								        serialize['vocab'] = self.vocab.to_bytes
								        serialize['cfg'] = lambda: ujson.dumps(self.cfg)
								        tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items()))
								        serialize['tag_map'] = lambda: msgpack.dumps(
								            tag_map, use_bin_type=True, encoding='utf8')
								        return util.to_bytes(serialize, exclude)

								    def from_bytes(self, bytes_data, **exclude):
								        """Load the tagger from a bytestring and return `self`."""
								        def load_model(b):
								            # TODO: Remove this once we don't have to handle previous models
								            if self.cfg.get('pretrained_dims') and 'pretrained_vectors' not in self.cfg:
								                self.cfg['pretrained_vectors'] = self.vocab.vectors.name

								            if self.model is True:
								                # NOTE(review): this value is never used; the call is kept
								                # because env_opt's side effects are not visible here.
								                token_vector_width = util.env_opt(
								                    'token_vector_width',
								                    self.cfg.get('token_vector_width', 128))
								                self.model = self.Model(self.vocab.morphology.n_tags,
								                                        **self.cfg)
								            self.model.from_bytes(b)

								        def load_tag_map(b):
								            tag_map = msgpack.loads(b, encoding='utf8')
								            self.vocab.morphology = Morphology(
								                self.vocab.strings, tag_map=tag_map,
								                lemmatizer=self.vocab.morphology.lemmatizer,
								                exc=self.vocab.morphology.exc)

								        # The model comes last: load_model() reads the restored config
								        # and the number of tags of the restored morphology.
								        deserialize = OrderedDict((
								            ('vocab', lambda b: self.vocab.from_bytes(b)),
								            ('tag_map', load_tag_map),
								            ('cfg', lambda b: self.cfg.update(ujson.loads(b))),
								            ('model', lambda b: load_model(b)),
								        ))
								        util.from_bytes(bytes_data, deserialize, exclude)
								        return self

								    def to_disk(self, path, **exclude):
								        """Serialize the vocab, tag map, model and config to disk."""
								        tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items()))

								        # Each writer uses a context manager so its file handle is
								        # closed, matching the readers in from_disk().
								        def save_tag_map(p):
								            with p.open('wb') as file_:
								                file_.write(msgpack.dumps(
								                    tag_map, use_bin_type=True, encoding='utf8'))

								        def save_model(p):
								            with p.open('wb') as file_:
								                file_.write(self.model.to_bytes())

								        def save_cfg(p):
								            with p.open('w') as file_:
								                file_.write(json_dumps(self.cfg))

								        serialize = OrderedDict((
								            ('vocab', lambda p: self.vocab.to_disk(p)),
								            ('tag_map', save_tag_map),
								            ('model', save_model),
								            ('cfg', save_cfg)
								        ))
								        util.to_disk(path, serialize, exclude)

								    def from_disk(self, path, **exclude):
								        """Load the tagger from disk and return `self`."""
								        def load_model(p):
								            # TODO: Remove this once we don't have to handle previous models
								            if self.cfg.get('pretrained_dims') and 'pretrained_vectors' not in self.cfg:
								                self.cfg['pretrained_vectors'] = self.vocab.vectors.name
								            if self.model is True:
								                self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
								            with p.open('rb') as file_:
								                self.model.from_bytes(file_.read())

								        def load_tag_map(p):
								            with p.open('rb') as file_:
								                tag_map = msgpack.loads(file_.read(), encoding='utf8')
								            self.vocab.morphology = Morphology(
								                self.vocab.strings, tag_map=tag_map,
								                lemmatizer=self.vocab.morphology.lemmatizer,
								                exc=self.vocab.morphology.exc)

								        # The model comes last: load_model() reads the restored config
								        # and the number of tags of the restored morphology.
								        deserialize = OrderedDict((
								            ('cfg', lambda p: self.cfg.update(_load_cfg(p))),
								            ('vocab', lambda p: self.vocab.from_disk(p)),
								            ('tag_map', load_tag_map),
								            ('model', load_model),
								        ))
								        util.from_disk(path, deserialize, exclude)
								        return self


								class MultitaskObjective(Tagger):

								    """Experimental: Assist training of a parser or tagger, by training a

								    side-objective.

								    """

								    name = 'nn_labeller'


								    def __init__(self, vocab, model=True, target='dep_tag_offset', **cfg):
								        """Store the vocab and model and select the label-making function.

								        vocab: Stored on the instance as `self.vocab`.
								        model: A model instance, or True to allocate one later.
								        target: One of 'dep', 'tag', 'ent', 'dep_tag_offset', 'ent_tag', or
								            a callable that is used as `make_label` directly.
								        **cfg: Config parameters.
								        RAISES (ValueError): If `target` is neither a known name nor callable.
								        """
								        self.vocab = vocab
								        self.model = model
								        # Target name -> name of the method that builds its labels. The
								        # method is only looked up for the target that matches.
								        named_targets = (
								            ('dep', 'make_dep'),
								            ('tag', 'make_tag'),
								            ('ent', 'make_ent'),
								            ('dep_tag_offset', 'make_dep_tag_offset'),
								            ('ent_tag', 'make_ent_tag'),
								        )
								        for target_name, method_name in named_targets:
								            if target == target_name:
								                self.make_label = getattr(self, method_name)
								                break
								        else:
								            if not hasattr(target, '__call__'):
								                raise ValueError(Errors.E016)
								            self.make_label = target
								        self.cfg = dict(cfg)
								        self.cfg.setdefault('cnn_maxout_pieces', 2)


								    @property
								    def labels(self):
								        """The label table stored under 'labels' in the config.

								        An empty dict is created and stored on first access.
								        """
								        if 'labels' not in self.cfg:
								            self.cfg['labels'] = {}
								        return self.cfg['labels']

								    @labels.setter
								    def labels(self, value):
								        """Replace the label table stored in the config."""
								        self.cfg['labels'] = value


								    def set_annotations(self, docs, dep_ids, tensors=None):
								        """Do nothing: this component writes no annotations to the docs."""
								        return None


								    def begin_training(self, gold_tuples=tuple(), pipeline=None, tok2vec=None,
								                       sgd=None, **kwargs):
								        """Collect the labels from the training data, allocate the model if
								        needed and return an optimizer.

								        gold_tuples (iterable): Gold-standard training data. It is passed
								            through `nonproj.preprocess_training_data` first.
								        pipeline: Not used by this implementation.
								        tok2vec: Forwarded to `Model()` when a model has to be allocated.
								        sgd (callable): An optimizer. One is created if None.
								        RETURNS: The optimizer.
								        """
								        gold_tuples = nonproj.preprocess_training_data(gold_tuples)
								        for _raw_text, sentences in gold_tuples:
								            for annots, _brackets in sentences:
								                ids, words, tags, heads, deps, ents = annots
								                for i, _ in enumerate(ids):
								                    label = self.make_label(i, words, tags, heads, deps, ents)
								                    if label is None or label in self.labels:
								                        continue
								                    # A new label gets the next free index.
								                    self.labels[label] = len(self.labels)
								        if self.model is True:
								            # NOTE(review): this value is never used; the call is kept
								            # because env_opt's side effects are not visible here.
								            token_vector_width = util.env_opt('token_vector_width')
								            self.model = self.Model(len(self.labels), tok2vec=tok2vec)
								        link_vectors_to_models(self.vocab)
								        return self.create_optimizer() if sgd is None else sgd


								    @classmethod
								    def Model(cls, n_tags, tok2vec=None, **cfg):
								        """Chain `tok2vec` into a Softmax output layer with `n_tags` classes.

								        n_tags (int): Number of output classes for the Softmax layer.
								        tok2vec: Layer placed in front of the Softmax.
								        **cfg: Accepted but not used.
								        RETURNS: The chained model, with the two parts also exposed as the
								            attributes `tok2vec` and `softmax`.
								        """
								        width = util.env_opt('token_vector_width', 128)
								        output_layer = Softmax(n_tags, width)
								        network = chain(tok2vec, output_layer)
								        # Keep handles on both halves so they can be run separately.
								        network.tok2vec = tok2vec
								        network.softmax = output_layer
								        return network


								    def predict(self, docs):
								        """Run the model's two stages on `docs`.

								        docs: Input passed to `self.model.tok2vec`.
								        RETURNS (tuple): `(tokvecs, scores)`, where `scores` is the output of
								            `self.model.softmax` applied to `tokvecs`.
								        """
								        token_vectors = self.model.tok2vec(docs)
								        return token_vectors, self.model.softmax(token_vectors)


								    def get_loss(self, docs, golds, scores):
								        """Compute the loss and the gradient of the scores.

								        docs (list): The documents the scores were computed for.
								        golds (list): One gold-standard object per document.
								        scores: Array of class scores; axis 1 indexes the classes.
								        RETURNS (tuple): `(loss, d_scores)`, with `loss` a Python float.
								        RAISES (ValueError): If `docs` and `golds` differ in length.
								        """
								        if len(docs) != len(golds):
								            raise ValueError(Errors.E077.format(value='loss', n_docs=len(docs),
								                                                n_golds=len(golds)))
								        cdef int idx = 0
								        guesses = scores.argmax(axis=1)
								        correct = numpy.zeros((scores.shape[0],), dtype='i')
								        for doc_i, gold in enumerate(golds):
								            for tok_j in range(len(docs[doc_i])):
								                # Handles alignment for tokenization differences: tokens
								                # with no gold counterpart keep the initial 0 in `correct`.
								                gold_idx = gold.cand_to_gold[tok_j]
								                if gold_idx is not None:
								                    label = self.make_label(gold_idx, gold.words, gold.tags,
								                                            gold.heads, gold.labels, gold.ents)
								                    if label is not None and label in self.labels:
								                        correct[idx] = self.labels[label]
								                    else:
								                        # Missing or unknown label: use the model's own
								                        # guess as the target.
								                        correct[idx] = guesses[idx]
								                # `idx` is the flat row position across all docs.
								                idx += 1
								        xp = self.model.ops.xp
								        correct = xp.array(correct, dtype='i')
								        one_hot = to_categorical(correct, nb_classes=scores.shape[1])
								        d_scores = scores - one_hot
								        d_scores /= d_scores.shape[0]
								        loss = (d_scores**2).sum()
								        return float(loss), d_scores


								    @staticmethod
								    def make_dep(i, words, tags, heads, deps, ents):
								        """Return the dependency label of token `i`.

								        RETURNS: `deps[i]`, or None when either `deps[i]` or `heads[i]` is None.
								        """
								        dep = deps[i]
								        return dep if (dep is not None and heads[i] is not None) else None


								    @staticmethod
								    def make_tag(i, words, tags, heads, deps, ents):
								        """Return the tag of token `i`, i.e. `tags[i]`."""
								        tag = tags[i]
								        return tag


								    @staticmethod
								    def make_ent(i, words, tags, heads, deps, ents):
								        """Return the entity annotation of token `i`.

								        RETURNS: `ents[i]`, or None when `ents` itself is None.
								        """
								        return None if ents is None else ents[i]


								    @staticmethod
								    def make_dep_tag_offset(i, words, tags, heads, deps, ents):
								        """Return a combined label of dependency, tag and head offset.

								        The offset is `heads[i] - i`, clipped to the range [-2, 2].
								        RETURNS: A string of the form 'dep-tag:offset', or None when either
								            `deps[i]` or `heads[i]` is None.
								        """
								        if deps[i] is not None and heads[i] is not None:
								            clipped = max(min(heads[i] - i, 2), -2)
								            return '%s-%s:%d' % (deps[i], tags[i], clipped)
								        return None


								    @staticmethod
								    def make_ent_tag(i, words, tags, heads, deps, ents):
								        """Return a combined tag and entity label for token `i`.

								        RETURNS: A string of the form 'tag-ent', or None when `ents` is None
								            or `ents[i]` is None.
								        """
								        if ents is not None and ents[i] is not None:
								            return '%s-%s' % (tags[i], ents[i])
								        return None


								class SimilarityHook(Pipe):
								    """
								    Experimental: A pipeline component that installs a supervised similarity
								    hook into `Doc` objects. Requires a `Tensorizer` to pre-process
								    documents. The similarity model can be any object obeying the Thinc
								    `Model` interface. By default, the model concatenates the elementwise
								    mean and elementwise max of the two tensors, and compares them using the
								    Cauchy-like similarity function from Chen (2013):

								        >>> similarity = 1. / (1. + (W * (vec1-vec2)**2).sum())

								    Where W is a vector of dimension weights, initialized to 1.
								    """
								    name = 'similarity'

								    def __init__(self, vocab, model=True, **cfg):
								        """Store the vocab, the model (or the placeholder True) and the config.

								        vocab: The vocabulary kept on the component.
								        model: A model object, or True to have one built in `begin_training`.
								        **cfg: Settings, copied into `self.cfg`.
								        """
								        self.vocab = vocab
								        self.model = model
								        self.cfg = dict(cfg)

								    @classmethod
								    def Model(cls, length):
								        """Build the default model: pooled inputs compared by `CauchySimilarity`.

								        length: Size argument passed to `CauchySimilarity`.
								        """
								        pooling = Pooling(max_pool, mean_pool)
								        return Siamese(pooling, CauchySimilarity(length))

								    def __call__(self, doc):
								        """Install the similarity hook on `doc` and return the same doc."""
								        doc.user_hooks['similarity'] = self.predict
								        return doc

								    def pipe(self, docs, **kwargs):
								        """Lazily apply the component to each doc in `docs`, yielding each one."""
								        for each_doc in docs:
								            yield self(each_doc)

								    def predict(self, doc1, doc2):
								        """Return the model's prediction for the single pair (doc1, doc2)."""
								        pair = (doc1, doc2)
								        return self.model.predict([pair])

								    def update(self, doc1_doc2, golds, sgd=None, drop=0.):
								        """Run the forward pass of the model on the given pairs.

								        NOTE: the scores and the backprop callback are computed but not used;
								        `golds` and `sgd` are ignored and nothing is returned.
								        """
								        sims, bp_sims = self.model.begin_update(doc1_doc2, drop=drop)

								    def begin_training(self, _=tuple(), pipeline=None, sgd=None, **kwargs):
								        """Allocate the model, taking its width from the first pipeline component.

								        _ : Ignored positional placeholder.
								        pipeline (list): The pipeline the model is part of. Its first
								            component's `model.nO` is read when a model has to be built, so
								            it must be provided while `self.model` is True.
								        sgd: Optimizer to return. When None, a new one is obtained from
								            `self.create_optimizer()`.
								        RETURNS: The optimizer.
								        """
								        if self.model is True:
								            width = pipeline[0].model.nO
								            self.model = self.Model(width)
								            link_vectors_to_models(self.vocab)
								        if sgd is None:
								            sgd = self.create_optimizer()
								        return sgd


								class TextCategorizer(Pipe):
								    """Pipeline component that writes one score per label into `doc.cats`."""
								    name = 'textcat'

								    @classmethod
								    def Model(cls, nr_class=1, width=64, **cfg):
								        """Build the classifier by delegating to `build_text_classifier`."""
								        return build_text_classifier(nr_class, width, **cfg)

								    def __init__(self, vocab, model=True, **cfg):
								        """Store the vocab, the model (or the placeholder True) and the config.

								        vocab: The vocabulary kept on the component.
								        model: A model object, or True to have one built in `begin_training`.
								        **cfg: Settings, copied into `self.cfg`.
								        """
								        self.vocab = vocab
								        self.model = model
								        self.cfg = dict(cfg)

								    @property
								    def labels(self):
								        """List of labels kept under ``cfg['labels']`` (created empty if absent)."""
								        return self.cfg.setdefault('labels', [])

								    @labels.setter
								    def labels(self, value):
								        self.cfg['labels'] = value

								    def __call__(self, doc):
								        """Score a single doc, write the scores to `doc.cats` and return it."""
								        scores, tensors = self.predict([doc])
								        self.set_annotations([doc], scores, tensors=tensors)
								        return doc

								    def pipe(self, stream, batch_size=128, n_threads=-1):
								        """Process a stream of docs in batches, yielding each annotated doc.

								        stream (iterable): The docs to process.
								        batch_size (int): Number of docs scored per model call.
								        n_threads (int): Accepted but not used.
								        """
								        for docs in cytoolz.partition_all(batch_size, stream):
								            docs = list(docs)
								            scores, tensors = self.predict(docs)
								            self.set_annotations(docs, scores, tensors=tensors)
								            yield from docs

								    def predict(self, docs):
								        """Apply the model to `docs`.

								        RETURNS (tuple): `(scores, tensors)`, where `tensors` holds each
								            doc's `tensor` attribute.
								        """
								        scores = self.model(docs)
								        scores = self.model.ops.asarray(scores)
								        tensors = [doc.tensor for doc in docs]
								        return scores, tensors

								    def set_annotations(self, docs, scores, tensors=None):
								        """Copy `scores[i, j]` into `docs[i].cats[label_j]` as Python floats."""
								        for i, doc in enumerate(docs):
								            for j, label in enumerate(self.labels):
								                doc.cats[label] = float(scores[i, j])

								    def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
								        """Run one training step and accumulate the loss.

								        losses (dict): Optional. When given, the loss is added to
								            `losses[self.name]`.
								        """
								        scores, bp_scores = self.model.begin_update(docs, drop=drop)
								        loss, d_scores = self.get_loss(docs, golds, scores)
								        bp_scores(d_scores, sgd=sgd)
								        if losses is not None:
								            losses.setdefault(self.name, 0.0)
								            losses[self.name] += loss

								    def get_loss(self, docs, golds, scores):
								        """Compute the mean squared error and the gradient of the scores.

								        Labels that are absent from a gold's `cats` are treated as missing:
								        they contribute neither to the gradient nor to the reported loss.

								        docs (list): The documents the scores were computed for (not read).
								        golds (list): Gold-standard objects exposing a `cats` mapping.
								        scores: Array of scores, one row per gold and one column per label.
								        RETURNS (tuple): `(loss, d_scores)`, with `loss` a Python float.
								        """
								        truths = numpy.zeros((len(golds), len(self.labels)), dtype='f')
								        not_missing = numpy.ones((len(golds), len(self.labels)), dtype='f')
								        for i, gold in enumerate(golds):
								            for j, label in enumerate(self.labels):
								                if label in gold.cats:
								                    truths[i, j] = gold.cats[label]
								                else:
								                    not_missing[i, j] = 0.
								        truths = self.model.ops.asarray(truths)
								        not_missing = self.model.ops.asarray(not_missing)
								        d_scores = (scores-truths) / scores.shape[0]
								        d_scores *= not_missing
								        # Apply the same mask to the reported loss, so that unannotated
								        # labels are not counted as if their true value were 0.
								        masked_error = (scores-truths) * not_missing
								        mean_square_error = (masked_error**2).sum(axis=1).mean()
								        return float(mean_square_error), d_scores

								    def add_label(self, label):
								        """Add a label to the component.

								        When a model object is already present, its last layer is replaced
								        by an `Affine` layer with one more output, and the existing weights
								        and biases are copied into it.

								        RETURNS (int): 0 if the label was already known, otherwise 1.
								        """
								        if label in self.labels:
								            return 0
								        if self.model not in (None, True, False):
								            smaller = self.model._layers[-1]
								            larger = Affine(len(self.labels)+1, smaller.nI)
								            copy_array(larger.W[:smaller.nO], smaller.W)
								            copy_array(larger.b[:smaller.nO], smaller.b)
								            self.model._layers[-1] = larger
								        self.labels.append(label)
								        return 1

								    def begin_training(self, gold_tuples=tuple(), pipeline=None, sgd=None,
								                       **kwargs):
								        """Allocate the model if needed and return an optimizer.

								        gold_tuples (iterable): Accepted for interface compatibility; not
								            read here.
								        pipeline (list): If its first component is named 'tensorizer', that
								            component's `model.nO` is used as the width; otherwise 64.
								        sgd: Optimizer to return. When None, a new one is obtained from
								            `self.create_optimizer()`.
								        **kwargs: `pretrained_vectors` is copied into `self.cfg` when a
								            model is built.
								        RETURNS: The optimizer.
								        """
								        if pipeline and getattr(pipeline[0], 'name', None) == 'tensorizer':
								            token_vector_width = pipeline[0].model.nO
								        else:
								            token_vector_width = 64

								        if self.model is True:
								            self.cfg['pretrained_vectors'] = kwargs.get('pretrained_vectors')
								            self.model = self.Model(len(self.labels), token_vector_width,
								                                    **self.cfg)
								            link_vectors_to_models(self.vocab)
								        if sgd is None:
								            sgd = self.create_optimizer()
								        return sgd


								cdef class DependencyParser(Parser):
								    """Parser component named 'parser' that uses the `ArcEager` transition system."""
								    name = 'parser'
								    TransitionSystem = ArcEager

								    @property
								    def postprocesses(self):
								        """List of post-processing callables: `nonproj.deprojectivize` only."""
								        hooks = [nonproj.deprojectivize]
								        return hooks

								    def add_multitask_objective(self, target):
								        """Create a `MultitaskObjective` for `target` and register it."""
								        self._multitasks.append(MultitaskObjective(self.vocab, target=target))

								    def init_multitask_objectives(self, gold_tuples, pipeline, sgd=None, **cfg):
								        """Call `begin_training` on every registered multitask objective.

								        Each objective receives the first element of this component's model
								        as its `tok2vec` layer, together with `gold_tuples`, `pipeline` and
								        `sgd`. `**cfg` is accepted but not used.
								        """
								        for objective in self._multitasks:
								            shared_tok2vec = self.model[0]
								            objective.begin_training(gold_tuples, pipeline=pipeline,
								                                     tok2vec=shared_tok2vec, sgd=sgd)

								    def __reduce__(self):
								        """Pickle support: rebuild from (vocab, moves, model)."""
								        ctor_args = (self.vocab, self.moves, self.model)
								        return (DependencyParser, ctor_args, None, None)


								cdef class EntityRecognizer(Parser):
								    """Parser component named 'ner' that uses the `BiluoPushDown` transition system."""
								    name = 'ner'
								    TransitionSystem = BiluoPushDown

								    nr_feature = 6

								    def add_multitask_objective(self, target):
								        """Create a `MultitaskObjective` for `target` and register it."""
								        labeller = MultitaskObjective(self.vocab, target=target)
								        self._multitasks.append(labeller)

								    def init_multitask_objectives(self, gold_tuples, pipeline, sgd=None, **cfg):
								        """Call `begin_training` on every registered multitask objective.

								        Each objective receives the first element of this component's model
								        as its `tok2vec` layer. `sgd` is forwarded, as in the equivalent
								        `DependencyParser` method, so an objective does not create an
								        optimizer of its own when one was supplied. `**cfg` is accepted but
								        not used.
								        """
								        for labeller in self._multitasks:
								            tok2vec = self.model[0]
								            labeller.begin_training(gold_tuples, pipeline=pipeline,
								                                    tok2vec=tok2vec, sgd=sgd)

								    def __reduce__(self):
								        """Pickle support: rebuild from (vocab, moves, model)."""
								        return (EntityRecognizer, (self.vocab, self.moves, self.model),
								                None, None)


								# Names exported by a star-import of this module. Other classes defined
								# here, such as SimilarityHook and TextCategorizer, are not listed.
								__all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'Tensorizer']