You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

972 lines
35 KiB

4 years ago
  1. # cython: infer_types=True
  2. # cython: profile=True
  3. # coding: utf8
  4. from __future__ import unicode_literals
  5. import numpy
  6. cimport numpy as np
  7. import cytoolz
  8. from collections import OrderedDict
  9. import ujson
  10. from .util import msgpack
  11. from .util import msgpack_numpy
  12. from thinc.api import chain
  13. from thinc.v2v import Affine, SELU, Softmax
  14. from thinc.t2v import Pooling, max_pool, mean_pool
  15. from thinc.neural.util import to_categorical, copy_array
  16. from thinc.neural._classes.difference import Siamese, CauchySimilarity
  17. from .tokens.doc cimport Doc
  18. from .syntax.nn_parser cimport Parser
  19. from .syntax import nonproj
  20. from .syntax.ner cimport BiluoPushDown
  21. from .syntax.arc_eager cimport ArcEager
  22. from .morphology cimport Morphology
  23. from .vocab cimport Vocab
  24. from .syntax import nonproj
  25. from .compat import json_dumps
  26. from .attrs import POS
  27. from .parts_of_speech import X
  28. from ._ml import Tok2Vec, build_text_classifier, build_tagger_model
  29. from ._ml import link_vectors_to_models, zero_init, flatten
  30. from ._ml import create_default_optimizer
  31. from .errors import Errors, TempErrors
  32. from . import util
  33. class SentenceSegmenter(object):
  34. """A simple spaCy hook, to allow custom sentence boundary detection logic
  35. (that doesn't require the dependency parse). To change the sentence
  36. boundary detection strategy, pass a generator function `strategy` on
  37. initialization, or assign a new strategy to the .strategy attribute.
  38. Sentence detection strategies should be generators that take `Doc` objects
  39. and yield `Span` objects for each sentence.
  40. """
  41. name = 'sbd'
  42. def __init__(self, vocab, strategy=None):
  43. self.vocab = vocab
  44. if strategy is None or strategy == 'on_punct':
  45. strategy = self.split_on_punct
  46. self.strategy = strategy
  47. def __call__(self, doc):
  48. doc.user_hooks['sents'] = self.strategy
  49. return doc
  50. @staticmethod
  51. def split_on_punct(doc):
  52. start = 0
  53. seen_period = False
  54. for i, word in enumerate(doc):
  55. if seen_period and not word.is_punct:
  56. yield doc[start:word.i]
  57. start = word.i
  58. seen_period = False
  59. elif word.text in ['.', '!', '?']:
  60. seen_period = True
  61. if start < len(doc):
  62. yield doc[start:len(doc)]
  63. def merge_noun_chunks(doc):
  64. """Merge noun chunks into a single token.
  65. doc (Doc): The Doc object.
  66. RETURNS (Doc): The Doc object with merged noun chunks.
  67. """
  68. if not doc.is_parsed:
  69. return doc
  70. spans = [(np.start_char, np.end_char, np.root.tag, np.root.dep)
  71. for np in doc.noun_chunks]
  72. for start, end, tag, dep in spans:
  73. doc.merge(start, end, tag=tag, dep=dep)
  74. return doc
  75. def merge_entities(doc):
  76. """Merge entities into a single token.
  77. doc (Doc): The Doc object.
  78. RETURNS (Doc): The Doc object with merged noun entities.
  79. """
  80. spans = [(e.start_char, e.end_char, e.root.tag, e.root.dep, e.label)
  81. for e in doc.ents]
  82. for start, end, tag, dep, ent_type in spans:
  83. doc.merge(start, end, tag=tag, dep=dep, ent_type=ent_type)
  84. return doc
  85. class Pipe(object):
  86. """This class is not instantiated directly. Components inherit from it, and
  87. it defines the interface that components should follow to function as
  88. components in a spaCy analysis pipeline.
  89. """
  90. name = None
  91. @classmethod
  92. def Model(cls, *shape, **kwargs):
  93. """Initialize a model for the pipe."""
  94. raise NotImplementedError
  95. def __init__(self, vocab, model=True, **cfg):
  96. """Create a new pipe instance."""
  97. raise NotImplementedError
  98. def __call__(self, doc):
  99. """Apply the pipe to one document. The document is
  100. modified in-place, and returned.
  101. Both __call__ and pipe should delegate to the `predict()`
  102. and `set_annotations()` methods.
  103. """
  104. scores, tensors = self.predict([doc])
  105. self.set_annotations([doc], scores, tensors=tensors)
  106. return doc
  107. def pipe(self, stream, batch_size=128, n_threads=-1):
  108. """Apply the pipe to a stream of documents.
  109. Both __call__ and pipe should delegate to the `predict()`
  110. and `set_annotations()` methods.
  111. """
  112. for docs in cytoolz.partition_all(batch_size, stream):
  113. docs = list(docs)
  114. scores, tensors = self.predict(docs)
  115. self.set_annotations(docs, scores, tensor=tensors)
  116. yield from docs
  117. def predict(self, docs):
  118. """Apply the pipeline's model to a batch of docs, without
  119. modifying them.
  120. """
  121. raise NotImplementedError
  122. def set_annotations(self, docs, scores, tensors=None):
  123. """Modify a batch of documents, using pre-computed scores."""
  124. raise NotImplementedError
  125. def update(self, docs, golds, drop=0., sgd=None, losses=None):
  126. """Learn from a batch of documents and gold-standard information,
  127. updating the pipe's model.
  128. Delegates to predict() and get_loss().
  129. """
  130. raise NotImplementedError
  131. def get_loss(self, docs, golds, scores):
  132. """Find the loss and gradient of loss for the batch of
  133. documents and their predicted scores."""
  134. raise NotImplementedError
  135. def add_label(self, label):
  136. """Add an output label, to be predicted by the model.
  137. It's possible to extend pre-trained models with new labels,
  138. but care should be taken to avoid the "catastrophic forgetting"
  139. problem.
  140. """
  141. raise NotImplementedError
  142. def create_optimizer(self):
  143. return create_default_optimizer(self.model.ops,
  144. **self.cfg.get('optimizer', {}))
  145. def begin_training(self, gold_tuples=tuple(), pipeline=None, sgd=None,
  146. **kwargs):
  147. """Initialize the pipe for training, using data exampes if available.
  148. If no model has been initialized yet, the model is added."""
  149. if self.model is True:
  150. self.model = self.Model(**self.cfg)
  151. link_vectors_to_models(self.vocab)
  152. if sgd is None:
  153. sgd = self.create_optimizer()
  154. return sgd
  155. def use_params(self, params):
  156. """Modify the pipe's model, to use the given parameter values."""
  157. with self.model.use_params(params):
  158. yield
  159. def to_bytes(self, **exclude):
  160. """Serialize the pipe to a bytestring."""
  161. serialize = OrderedDict()
  162. serialize['cfg'] = lambda: json_dumps(self.cfg)
  163. if self.model in (True, False, None):
  164. serialize['model'] = lambda: self.model
  165. else:
  166. serialize['model'] = self.model.to_bytes
  167. serialize['vocab'] = self.vocab.to_bytes
  168. return util.to_bytes(serialize, exclude)
  169. def from_bytes(self, bytes_data, **exclude):
  170. """Load the pipe from a bytestring."""
  171. def load_model(b):
  172. # TODO: Remove this once we don't have to handle previous models
  173. if self.cfg.get('pretrained_dims') and 'pretrained_vectors' not in self.cfg:
  174. self.cfg['pretrained_vectors'] = self.vocab.vectors.name
  175. if self.model is True:
  176. self.model = self.Model(**self.cfg)
  177. self.model.from_bytes(b)
  178. deserialize = OrderedDict((
  179. ('cfg', lambda b: self.cfg.update(ujson.loads(b))),
  180. ('vocab', lambda b: self.vocab.from_bytes(b)),
  181. ('model', load_model),
  182. ))
  183. util.from_bytes(bytes_data, deserialize, exclude)
  184. return self
  185. def to_disk(self, path, **exclude):
  186. """Serialize the pipe to disk."""
  187. serialize = OrderedDict()
  188. serialize['cfg'] = lambda p: p.open('w').write(json_dumps(self.cfg))
  189. serialize['vocab'] = lambda p: self.vocab.to_disk(p)
  190. if self.model not in (None, True, False):
  191. serialize['model'] = lambda p: p.open('wb').write(self.model.to_bytes())
  192. util.to_disk(path, serialize, exclude)
  193. def from_disk(self, path, **exclude):
  194. """Load the pipe from disk."""
  195. def load_model(p):
  196. # TODO: Remove this once we don't have to handle previous models
  197. if self.cfg.get('pretrained_dims') and 'pretrained_vectors' not in self.cfg:
  198. self.cfg['pretrained_vectors'] = self.vocab.vectors.name
  199. if self.model is True:
  200. self.model = self.Model(**self.cfg)
  201. self.model.from_bytes(p.open('rb').read())
  202. deserialize = OrderedDict((
  203. ('cfg', lambda p: self.cfg.update(_load_cfg(p))),
  204. ('vocab', lambda p: self.vocab.from_disk(p)),
  205. ('model', load_model),
  206. ))
  207. util.from_disk(path, deserialize, exclude)
  208. return self
  209. def _load_cfg(path):
  210. if path.exists():
  211. with path.open() as file_:
  212. return ujson.load(file_)
  213. else:
  214. return {}
  215. class Tensorizer(Pipe):
  216. """Assign position-sensitive vectors to tokens, using a CNN or RNN."""
  217. name = 'tensorizer'
  218. @classmethod
  219. def Model(cls, output_size=300, input_size=384, **cfg):
  220. """Create a new statistical model for the class.
  221. width (int): Output size of the model.
  222. embed_size (int): Number of vectors in the embedding table.
  223. **cfg: Config parameters.
  224. RETURNS (Model): A `thinc.neural.Model` or similar instance.
  225. """
  226. model = chain(
  227. SELU(output_size, input_size),
  228. SELU(output_size, output_size),
  229. zero_init(Affine(output_size, output_size)))
  230. return model
  231. def __init__(self, vocab, model=True, **cfg):
  232. """Construct a new statistical model. Weights are not allocated on
  233. initialisation.
  234. vocab (Vocab): A `Vocab` instance. The model must share the same
  235. `Vocab` instance with the `Doc` objects it will process.
  236. model (Model): A `Model` instance or `True` allocate one later.
  237. **cfg: Config parameters.
  238. EXAMPLE:
  239. >>> from spacy.pipeline import TokenVectorEncoder
  240. >>> tok2vec = TokenVectorEncoder(nlp.vocab)
  241. >>> tok2vec.model = tok2vec.Model(128, 5000)
  242. """
  243. self.vocab = vocab
  244. self.model = model
  245. self.input_models = []
  246. self.cfg = dict(cfg)
  247. self.cfg.setdefault('cnn_maxout_pieces', 3)
  248. def __call__(self, doc):
  249. """Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM
  250. model. Vectors are set to the `Doc.tensor` attribute.
  251. docs (Doc or iterable): One or more documents to add vectors to.
  252. RETURNS (dict or None): Intermediate computations.
  253. """
  254. tokvecses = self.predict([doc])
  255. self.set_annotations([doc], tokvecses)
  256. return doc
  257. def pipe(self, stream, batch_size=128, n_threads=-1):
  258. """Process `Doc` objects as a stream.
  259. stream (iterator): A sequence of `Doc` objects to process.
  260. batch_size (int): Number of `Doc` objects to group.
  261. n_threads (int): Number of threads.
  262. YIELDS (iterator): A sequence of `Doc` objects, in order of input.
  263. """
  264. for docs in cytoolz.partition_all(batch_size, stream):
  265. docs = list(docs)
  266. tensors = self.predict(docs)
  267. self.set_annotations(docs, tensors)
  268. yield from docs
  269. def predict(self, docs):
  270. """Return a single tensor for a batch of documents.
  271. docs (iterable): A sequence of `Doc` objects.
  272. RETURNS (object): Vector representations for each token in the docs.
  273. """
  274. inputs = self.model.ops.flatten([doc.tensor for doc in docs])
  275. outputs = self.model(inputs)
  276. return self.model.ops.unflatten(outputs, [len(d) for d in docs])
  277. def set_annotations(self, docs, tensors):
  278. """Set the tensor attribute for a batch of documents.
  279. docs (iterable): A sequence of `Doc` objects.
  280. tensors (object): Vector representation for each token in the docs.
  281. """
  282. for doc, tensor in zip(docs, tensors):
  283. if tensor.shape[0] != len(doc):
  284. raise ValueError(Errors.E076.format(rows=tensor.shape[0], words=len(doc)))
  285. doc.tensor = tensor
  286. def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
  287. """Update the model.
  288. docs (iterable): A batch of `Doc` objects.
  289. golds (iterable): A batch of `GoldParse` objects.
  290. drop (float): The droput rate.
  291. sgd (callable): An optimizer.
  292. RETURNS (dict): Results from the update.
  293. """
  294. if isinstance(docs, Doc):
  295. docs = [docs]
  296. inputs = []
  297. bp_inputs = []
  298. for tok2vec in self.input_models:
  299. tensor, bp_tensor = tok2vec.begin_update(docs, drop=drop)
  300. inputs.append(tensor)
  301. bp_inputs.append(bp_tensor)
  302. inputs = self.model.ops.xp.hstack(inputs)
  303. scores, bp_scores = self.model.begin_update(inputs, drop=drop)
  304. loss, d_scores = self.get_loss(docs, golds, scores)
  305. d_inputs = bp_scores(d_scores, sgd=sgd)
  306. d_inputs = self.model.ops.xp.split(d_inputs, len(self.input_models), axis=1)
  307. for d_input, bp_input in zip(d_inputs, bp_inputs):
  308. bp_input(d_input, sgd=sgd)
  309. if losses is not None:
  310. losses.setdefault(self.name, 0.)
  311. losses[self.name] += loss
  312. return loss
  313. def get_loss(self, docs, golds, prediction):
  314. target = []
  315. i = 0
  316. for doc in docs:
  317. vectors = self.model.ops.xp.vstack([w.vector for w in doc])
  318. target.append(vectors)
  319. target = self.model.ops.xp.vstack(target)
  320. d_scores = (prediction - target) / prediction.shape[0]
  321. loss = (d_scores**2).sum()
  322. return loss, d_scores
  323. def begin_training(self, gold_tuples=tuple(), pipeline=None, sgd=None,
  324. **kwargs):
  325. """Allocate models, pre-process training data and acquire an
  326. optimizer.
  327. gold_tuples (iterable): Gold-standard training data.
  328. pipeline (list): The pipeline the model is part of.
  329. """
  330. for name, model in pipeline:
  331. if getattr(model, 'tok2vec', None):
  332. self.input_models.append(model.tok2vec)
  333. if self.model is True:
  334. self.cfg['input_size'] = 384
  335. self.cfg['output_size'] = 300
  336. self.model = self.Model(**self.cfg)
  337. link_vectors_to_models(self.vocab)
  338. if sgd is None:
  339. sgd = self.create_optimizer()
  340. return sgd
  341. class Tagger(Pipe):
  342. name = 'tagger'
  343. def __init__(self, vocab, model=True, **cfg):
  344. self.vocab = vocab
  345. self.model = model
  346. self.cfg = OrderedDict(sorted(cfg.items()))
  347. self.cfg.setdefault('cnn_maxout_pieces', 2)
  348. @property
  349. def labels(self):
  350. return self.vocab.morphology.tag_names
  351. @property
  352. def tok2vec(self):
  353. if self.model in (None, True, False):
  354. return None
  355. else:
  356. return chain(self.model.tok2vec, flatten)
  357. def __call__(self, doc):
  358. tags, tokvecs = self.predict([doc])
  359. self.set_annotations([doc], tags, tensors=tokvecs)
  360. return doc
  361. def pipe(self, stream, batch_size=128, n_threads=-1):
  362. for docs in cytoolz.partition_all(batch_size, stream):
  363. docs = list(docs)
  364. tag_ids, tokvecs = self.predict(docs)
  365. self.set_annotations(docs, tag_ids, tensors=tokvecs)
  366. yield from docs
  367. def predict(self, docs):
  368. tokvecs = self.model.tok2vec(docs)
  369. scores = self.model.softmax(tokvecs)
  370. guesses = []
  371. for doc_scores in scores:
  372. doc_guesses = doc_scores.argmax(axis=1)
  373. if not isinstance(doc_guesses, numpy.ndarray):
  374. doc_guesses = doc_guesses.get()
  375. guesses.append(doc_guesses)
  376. return guesses, tokvecs
  377. def set_annotations(self, docs, batch_tag_ids, tensors=None):
  378. if isinstance(docs, Doc):
  379. docs = [docs]
  380. cdef Doc doc
  381. cdef int idx = 0
  382. cdef Vocab vocab = self.vocab
  383. for i, doc in enumerate(docs):
  384. doc_tag_ids = batch_tag_ids[i]
  385. if hasattr(doc_tag_ids, 'get'):
  386. doc_tag_ids = doc_tag_ids.get()
  387. for j, tag_id in enumerate(doc_tag_ids):
  388. # Don't clobber preset POS tags
  389. if doc.c[j].tag == 0 and doc.c[j].pos == 0:
  390. # Don't clobber preset lemmas
  391. lemma = doc.c[j].lemma
  392. vocab.morphology.assign_tag_id(&doc.c[j], tag_id)
  393. if lemma != 0 and lemma != doc.c[j].lex.orth:
  394. doc.c[j].lemma = lemma
  395. idx += 1
  396. if tensors is not None:
  397. if isinstance(doc.tensor, numpy.ndarray) \
  398. and not isinstance(tensors[i], numpy.ndarray):
  399. doc.extend_tensor(tensors[i].get())
  400. else:
  401. doc.extend_tensor(tensors[i])
  402. doc.is_tagged = True
  403. def update(self, docs, golds, drop=0., sgd=None, losses=None):
  404. if losses is not None and self.name not in losses:
  405. losses[self.name] = 0.
  406. tag_scores, bp_tag_scores = self.model.begin_update(docs, drop=drop)
  407. loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)
  408. bp_tag_scores(d_tag_scores, sgd=sgd)
  409. if losses is not None:
  410. losses[self.name] += loss
  411. def get_loss(self, docs, golds, scores):
  412. scores = self.model.ops.flatten(scores)
  413. tag_index = {tag: i for i, tag in enumerate(self.labels)}
  414. cdef int idx = 0
  415. correct = numpy.zeros((scores.shape[0],), dtype='i')
  416. guesses = scores.argmax(axis=1)
  417. for gold in golds:
  418. for tag in gold.tags:
  419. if tag is None:
  420. correct[idx] = guesses[idx]
  421. else:
  422. correct[idx] = tag_index[tag]
  423. idx += 1
  424. correct = self.model.ops.xp.array(correct, dtype='i')
  425. d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
  426. d_scores /= d_scores.shape[0]
  427. loss = (d_scores**2).sum()
  428. d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
  429. return float(loss), d_scores
  430. def begin_training(self, gold_tuples=tuple(), pipeline=None, sgd=None,
  431. **kwargs):
  432. orig_tag_map = dict(self.vocab.morphology.tag_map)
  433. new_tag_map = OrderedDict()
  434. for raw_text, annots_brackets in gold_tuples:
  435. for annots, brackets in annots_brackets:
  436. ids, words, tags, heads, deps, ents = annots
  437. for tag in tags:
  438. if tag in orig_tag_map:
  439. new_tag_map[tag] = orig_tag_map[tag]
  440. else:
  441. new_tag_map[tag] = {POS: X}
  442. cdef Vocab vocab = self.vocab
  443. if new_tag_map:
  444. vocab.morphology = Morphology(vocab.strings, new_tag_map,
  445. vocab.morphology.lemmatizer,
  446. exc=vocab.morphology.exc)
  447. self.cfg['pretrained_vectors'] = kwargs.get('pretrained_vectors')
  448. if self.model is True:
  449. self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
  450. link_vectors_to_models(self.vocab)
  451. if sgd is None:
  452. sgd = self.create_optimizer()
  453. return sgd
  454. @classmethod
  455. def Model(cls, n_tags, **cfg):
  456. if cfg.get('pretrained_dims') and not cfg.get('pretrained_vectors'):
  457. raise ValueError(TempErrors.T008)
  458. return build_tagger_model(n_tags, **cfg)
  459. def add_label(self, label, values=None):
  460. if label in self.labels:
  461. return 0
  462. if self.model not in (True, False, None):
  463. # Here's how the model resizing will work, once the
  464. # neuron-to-tag mapping is no longer controlled by
  465. # the Morphology class, which sorts the tag names.
  466. # The sorting makes adding labels difficult.
  467. # smaller = self.model._layers[-1]
  468. # larger = Softmax(len(self.labels)+1, smaller.nI)
  469. # copy_array(larger.W[:smaller.nO], smaller.W)
  470. # copy_array(larger.b[:smaller.nO], smaller.b)
  471. # self.model._layers[-1] = larger
  472. raise ValueError(TempErrors.T003)
  473. tag_map = dict(self.vocab.morphology.tag_map)
  474. if values is None:
  475. values = {POS: "X"}
  476. tag_map[label] = values
  477. self.vocab.morphology = Morphology(
  478. self.vocab.strings, tag_map=tag_map,
  479. lemmatizer=self.vocab.morphology.lemmatizer,
  480. exc=self.vocab.morphology.exc)
  481. return 1
  482. def use_params(self, params):
  483. with self.model.use_params(params):
  484. yield
  485. def to_bytes(self, **exclude):
  486. serialize = OrderedDict()
  487. if self.model in (None, True, False):
  488. serialize['model'] = lambda: self.model
  489. else:
  490. serialize['model'] = self.model.to_bytes
  491. serialize['vocab'] = self.vocab.to_bytes
  492. serialize['cfg'] = lambda: ujson.dumps(self.cfg)
  493. tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items()))
  494. serialize['tag_map'] = lambda: msgpack.dumps(
  495. tag_map, use_bin_type=True, encoding='utf8')
  496. return util.to_bytes(serialize, exclude)
  497. def from_bytes(self, bytes_data, **exclude):
  498. def load_model(b):
  499. # TODO: Remove this once we don't have to handle previous models
  500. if self.cfg.get('pretrained_dims') and 'pretrained_vectors' not in self.cfg:
  501. self.cfg['pretrained_vectors'] = self.vocab.vectors.name
  502. if self.model is True:
  503. token_vector_width = util.env_opt(
  504. 'token_vector_width',
  505. self.cfg.get('token_vector_width', 128))
  506. self.model = self.Model(self.vocab.morphology.n_tags,
  507. **self.cfg)
  508. self.model.from_bytes(b)
  509. def load_tag_map(b):
  510. tag_map = msgpack.loads(b, encoding='utf8')
  511. self.vocab.morphology = Morphology(
  512. self.vocab.strings, tag_map=tag_map,
  513. lemmatizer=self.vocab.morphology.lemmatizer,
  514. exc=self.vocab.morphology.exc)
  515. deserialize = OrderedDict((
  516. ('vocab', lambda b: self.vocab.from_bytes(b)),
  517. ('tag_map', load_tag_map),
  518. ('cfg', lambda b: self.cfg.update(ujson.loads(b))),
  519. ('model', lambda b: load_model(b)),
  520. ))
  521. util.from_bytes(bytes_data, deserialize, exclude)
  522. return self
  523. def to_disk(self, path, **exclude):
  524. tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items()))
  525. serialize = OrderedDict((
  526. ('vocab', lambda p: self.vocab.to_disk(p)),
  527. ('tag_map', lambda p: p.open('wb').write(msgpack.dumps(
  528. tag_map, use_bin_type=True, encoding='utf8'))),
  529. ('model', lambda p: p.open('wb').write(self.model.to_bytes())),
  530. ('cfg', lambda p: p.open('w').write(json_dumps(self.cfg)))
  531. ))
  532. util.to_disk(path, serialize, exclude)
  533. def from_disk(self, path, **exclude):
  534. def load_model(p):
  535. # TODO: Remove this once we don't have to handle previous models
  536. if self.cfg.get('pretrained_dims') and 'pretrained_vectors' not in self.cfg:
  537. self.cfg['pretrained_vectors'] = self.vocab.vectors.name
  538. if self.model is True:
  539. self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
  540. with p.open('rb') as file_:
  541. self.model.from_bytes(file_.read())
  542. def load_tag_map(p):
  543. with p.open('rb') as file_:
  544. tag_map = msgpack.loads(file_.read(), encoding='utf8')
  545. self.vocab.morphology = Morphology(
  546. self.vocab.strings, tag_map=tag_map,
  547. lemmatizer=self.vocab.morphology.lemmatizer,
  548. exc=self.vocab.morphology.exc)
  549. deserialize = OrderedDict((
  550. ('cfg', lambda p: self.cfg.update(_load_cfg(p))),
  551. ('vocab', lambda p: self.vocab.from_disk(p)),
  552. ('tag_map', load_tag_map),
  553. ('model', load_model),
  554. ))
  555. util.from_disk(path, deserialize, exclude)
  556. return self
  557. class MultitaskObjective(Tagger):
  558. """Experimental: Assist training of a parser or tagger, by training a
  559. side-objective.
  560. """
  561. name = 'nn_labeller'
  562. def __init__(self, vocab, model=True, target='dep_tag_offset', **cfg):
  563. self.vocab = vocab
  564. self.model = model
  565. if target == 'dep':
  566. self.make_label = self.make_dep
  567. elif target == 'tag':
  568. self.make_label = self.make_tag
  569. elif target == 'ent':
  570. self.make_label = self.make_ent
  571. elif target == 'dep_tag_offset':
  572. self.make_label = self.make_dep_tag_offset
  573. elif target == 'ent_tag':
  574. self.make_label = self.make_ent_tag
  575. elif hasattr(target, '__call__'):
  576. self.make_label = target
  577. else:
  578. raise ValueError(Errors.E016)
  579. self.cfg = dict(cfg)
  580. self.cfg.setdefault('cnn_maxout_pieces', 2)
  581. @property
  582. def labels(self):
  583. return self.cfg.setdefault('labels', {})
  584. @labels.setter
  585. def labels(self, value):
  586. self.cfg['labels'] = value
  587. def set_annotations(self, docs, dep_ids, tensors=None):
  588. pass
  589. def begin_training(self, gold_tuples=tuple(), pipeline=None, tok2vec=None,
  590. sgd=None, **kwargs):
  591. gold_tuples = nonproj.preprocess_training_data(gold_tuples)
  592. for raw_text, annots_brackets in gold_tuples:
  593. for annots, brackets in annots_brackets:
  594. ids, words, tags, heads, deps, ents = annots
  595. for i in range(len(ids)):
  596. label = self.make_label(i, words, tags, heads, deps, ents)
  597. if label is not None and label not in self.labels:
  598. self.labels[label] = len(self.labels)
  599. if self.model is True:
  600. token_vector_width = util.env_opt('token_vector_width')
  601. self.model = self.Model(len(self.labels), tok2vec=tok2vec)
  602. link_vectors_to_models(self.vocab)
  603. if sgd is None:
  604. sgd = self.create_optimizer()
  605. return sgd
  606. @classmethod
  607. def Model(cls, n_tags, tok2vec=None, **cfg):
  608. token_vector_width = util.env_opt('token_vector_width', 128)
  609. softmax = Softmax(n_tags, token_vector_width)
  610. model = chain(
  611. tok2vec,
  612. softmax
  613. )
  614. model.tok2vec = tok2vec
  615. model.softmax = softmax
  616. return model
  617. def predict(self, docs):
  618. tokvecs = self.model.tok2vec(docs)
  619. scores = self.model.softmax(tokvecs)
  620. return tokvecs, scores
  621. def get_loss(self, docs, golds, scores):
  622. if len(docs) != len(golds):
  623. raise ValueError(Errors.E077.format(value='loss', n_docs=len(docs),
  624. n_golds=len(golds)))
  625. cdef int idx = 0
  626. correct = numpy.zeros((scores.shape[0],), dtype='i')
  627. guesses = scores.argmax(axis=1)
  628. for i, gold in enumerate(golds):
  629. for j in range(len(docs[i])):
  630. # Handes alignment for tokenization differences
  631. gold_idx = gold.cand_to_gold[j]
  632. if gold_idx is None:
  633. idx += 1
  634. continue
  635. label = self.make_label(gold_idx, gold.words, gold.tags,
  636. gold.heads, gold.labels, gold.ents)
  637. if label is None or label not in self.labels:
  638. correct[idx] = guesses[idx]
  639. else:
  640. correct[idx] = self.labels[label]
  641. idx += 1
  642. correct = self.model.ops.xp.array(correct, dtype='i')
  643. d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
  644. d_scores /= d_scores.shape[0]
  645. loss = (d_scores**2).sum()
  646. return float(loss), d_scores
  647. @staticmethod
  648. def make_dep(i, words, tags, heads, deps, ents):
  649. if deps[i] is None or heads[i] is None:
  650. return None
  651. return deps[i]
  652. @staticmethod
  653. def make_tag(i, words, tags, heads, deps, ents):
  654. return tags[i]
  655. @staticmethod
  656. def make_ent(i, words, tags, heads, deps, ents):
  657. if ents is None:
  658. return None
  659. return ents[i]
  660. @staticmethod
  661. def make_dep_tag_offset(i, words, tags, heads, deps, ents):
  662. if deps[i] is None or heads[i] is None:
  663. return None
  664. offset = heads[i] - i
  665. offset = min(offset, 2)
  666. offset = max(offset, -2)
  667. return '%s-%s:%d' % (deps[i], tags[i], offset)
  668. @staticmethod
  669. def make_ent_tag(i, words, tags, heads, deps, ents):
  670. if ents is None or ents[i] is None:
  671. return None
  672. else:
  673. return '%s-%s' % (tags[i], ents[i])
  674. class SimilarityHook(Pipe):
  675. """
  676. Experimental: A pipeline component to install a hook for supervised
  677. similarity into `Doc` objects. Requires a `Tensorizer` to pre-process
  678. documents. The similarity model can be any object obeying the Thinc `Model`
  679. interface. By default, the model concatenates the elementwise mean and
  680. elementwise max of the two tensors, and compares them using the
  681. Cauchy-like similarity function from Chen (2013):
  682. >>> similarity = 1. / (1. + (W * (vec1-vec2)**2).sum())
  683. Where W is a vector of dimension weights, initialized to 1.
  684. """
  685. name = 'similarity'
  686. def __init__(self, vocab, model=True, **cfg):
  687. self.vocab = vocab
  688. self.model = model
  689. self.cfg = dict(cfg)
  690. @classmethod
  691. def Model(cls, length):
  692. return Siamese(Pooling(max_pool, mean_pool), CauchySimilarity(length))
  693. def __call__(self, doc):
  694. """Install similarity hook"""
  695. doc.user_hooks['similarity'] = self.predict
  696. return doc
  697. def pipe(self, docs, **kwargs):
  698. for doc in docs:
  699. yield self(doc)
  700. def predict(self, doc1, doc2):
  701. return self.model.predict([(doc1, doc2)])
  702. def update(self, doc1_doc2, golds, sgd=None, drop=0.):
  703. sims, bp_sims = self.model.begin_update(doc1_doc2, drop=drop)
  704. def begin_training(self, _=tuple(), pipeline=None, sgd=None, **kwargs):
  705. """Allocate model, using width from tensorizer in pipeline.
  706. gold_tuples (iterable): Gold-standard training data.
  707. pipeline (list): The pipeline the model is part of.
  708. """
  709. if self.model is True:
  710. self.model = self.Model(pipeline[0].model.nO)
  711. link_vectors_to_models(self.vocab)
  712. if sgd is None:
  713. sgd = self.create_optimizer()
  714. return sgd
  715. class TextCategorizer(Pipe):
  716. name = 'textcat'
  717. @classmethod
  718. def Model(cls, nr_class=1, width=64, **cfg):
  719. return build_text_classifier(nr_class, width, **cfg)
  720. def __init__(self, vocab, model=True, **cfg):
  721. self.vocab = vocab
  722. self.model = model
  723. self.cfg = dict(cfg)
  724. @property
  725. def labels(self):
  726. return self.cfg.setdefault('labels', [])
  727. @labels.setter
  728. def labels(self, value):
  729. self.cfg['labels'] = value
  730. def __call__(self, doc):
  731. scores, tensors = self.predict([doc])
  732. self.set_annotations([doc], scores, tensors=tensors)
  733. return doc
  734. def pipe(self, stream, batch_size=128, n_threads=-1):
  735. for docs in cytoolz.partition_all(batch_size, stream):
  736. docs = list(docs)
  737. scores, tensors = self.predict(docs)
  738. self.set_annotations(docs, scores, tensors=tensors)
  739. yield from docs
  740. def predict(self, docs):
  741. scores = self.model(docs)
  742. scores = self.model.ops.asarray(scores)
  743. tensors = [doc.tensor for doc in docs]
  744. return scores, tensors
  745. def set_annotations(self, docs, scores, tensors=None):
  746. for i, doc in enumerate(docs):
  747. for j, label in enumerate(self.labels):
  748. doc.cats[label] = float(scores[i, j])
  749. def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
  750. scores, bp_scores = self.model.begin_update(docs, drop=drop)
  751. loss, d_scores = self.get_loss(docs, golds, scores)
  752. bp_scores(d_scores, sgd=sgd)
  753. if losses is not None:
  754. losses.setdefault(self.name, 0.0)
  755. losses[self.name] += loss
  756. def get_loss(self, docs, golds, scores):
  757. truths = numpy.zeros((len(golds), len(self.labels)), dtype='f')
  758. not_missing = numpy.ones((len(golds), len(self.labels)), dtype='f')
  759. for i, gold in enumerate(golds):
  760. for j, label in enumerate(self.labels):
  761. if label in gold.cats:
  762. truths[i, j] = gold.cats[label]
  763. else:
  764. not_missing[i, j] = 0.
  765. truths = self.model.ops.asarray(truths)
  766. not_missing = self.model.ops.asarray(not_missing)
  767. d_scores = (scores-truths) / scores.shape[0]
  768. d_scores *= not_missing
  769. mean_square_error = ((scores-truths)**2).sum(axis=1).mean()
  770. return mean_square_error, d_scores
  771. def add_label(self, label):
  772. if label in self.labels:
  773. return 0
  774. if self.model not in (None, True, False):
  775. smaller = self.model._layers[-1]
  776. larger = Affine(len(self.labels)+1, smaller.nI)
  777. copy_array(larger.W[:smaller.nO], smaller.W)
  778. copy_array(larger.b[:smaller.nO], smaller.b)
  779. self.model._layers[-1] = larger
  780. self.labels.append(label)
  781. return 1
  782. def begin_training(self, gold_tuples=tuple(), pipeline=None, sgd=None,
  783. **kwargs):
  784. if pipeline and getattr(pipeline[0], 'name', None) == 'tensorizer':
  785. token_vector_width = pipeline[0].model.nO
  786. else:
  787. token_vector_width = 64
  788. if self.model is True:
  789. self.cfg['pretrained_vectors'] = kwargs.get('pretrained_vectors')
  790. self.model = self.Model(len(self.labels), token_vector_width,
  791. **self.cfg)
  792. link_vectors_to_models(self.vocab)
  793. if sgd is None:
  794. sgd = self.create_optimizer()
  795. return sgd
  796. cdef class DependencyParser(Parser):
  797. name = 'parser'
  798. TransitionSystem = ArcEager
  799. @property
  800. def postprocesses(self):
  801. return [nonproj.deprojectivize]
  802. def add_multitask_objective(self, target):
  803. labeller = MultitaskObjective(self.vocab, target=target)
  804. self._multitasks.append(labeller)
  805. def init_multitask_objectives(self, gold_tuples, pipeline, sgd=None, **cfg):
  806. for labeller in self._multitasks:
  807. tok2vec = self.model[0]
  808. labeller.begin_training(gold_tuples, pipeline=pipeline,
  809. tok2vec=tok2vec, sgd=sgd)
  810. def __reduce__(self):
  811. return (DependencyParser, (self.vocab, self.moves, self.model),
  812. None, None)
  813. cdef class EntityRecognizer(Parser):
  814. name = 'ner'
  815. TransitionSystem = BiluoPushDown
  816. nr_feature = 6
  817. def add_multitask_objective(self, target):
  818. labeller = MultitaskObjective(self.vocab, target=target)
  819. self._multitasks.append(labeller)
  820. def init_multitask_objectives(self, gold_tuples, pipeline, sgd=None, **cfg):
  821. for labeller in self._multitasks:
  822. tok2vec = self.model[0]
  823. labeller.begin_training(gold_tuples, pipeline=pipeline,
  824. tok2vec=tok2vec)
  825. def __reduce__(self):
  826. return (EntityRecognizer, (self.vocab, self.moves, self.model),
  827. None, None)
# Names exported by a star-import of this module.
# NOTE(review): only these four components are listed; the other classes and
# helper functions defined in this file are not exported by a star-import.
# Confirm that this is intentional.
__all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'Tensorizer']