# coding: utf8
|
|
from __future__ import unicode_literals, print_function
|
|
|
|
import os
|
|
import ujson
|
|
import pkg_resources
|
|
import importlib
|
|
import regex as re
|
|
from pathlib import Path
|
|
import sys
|
|
import textwrap
|
|
import random
|
|
from collections import OrderedDict
|
|
from thinc.neural._classes.model import Model
|
|
import functools
|
|
import cytoolz
|
|
import itertools
|
|
import numpy.random
|
|
|
|
from .symbols import ORTH
|
|
from .compat import cupy, CudaStream, path2str, basestring_, input_, unicode_
|
|
from .compat import import_file
|
|
from .errors import Errors
|
|
|
|
# Import these directly from Thinc, so that we're sure we always have the
|
|
# same version.
|
|
from thinc.neural._classes.model import msgpack
|
|
from thinc.neural._classes.model import msgpack_numpy
|
|
|
|
|
|
# Cache of Language classes, filled by get_lang_class() / set_lang_class().
LANGUAGES = dict()
# Data directory used by the model loaders: `data/` next to this file.
_data_path = Path(__file__).parent.joinpath('data')
# When truthy, env_opt() prints every option it resolves.
_PRINT_ENV = False
|
|
|
|
|
|
def set_env_log(value):
    """Set the module-level flag that makes env_opt() print what it resolves.

    value: New value for `_PRINT_ENV`; it is only ever tested for truthiness.
    """
    globals()['_PRINT_ENV'] = value
|
|
|
|
|
|
def get_lang_class(lang):
    """Return the Language class for a language code, importing it on first
    use and caching it in LANGUAGES.

    lang (unicode): Two-letter language code, e.g. 'en'.
    RETURNS (Language): Language class.
    """
    if lang in LANGUAGES:
        return LANGUAGES[lang]
    module_name = '.lang.%s' % lang
    try:
        lang_module = importlib.import_module(module_name, 'spacy')
    except ImportError:
        raise ImportError(Errors.E048.format(lang=lang))
    # The first name listed in the module's __all__ is taken as the class.
    lang_cls = getattr(lang_module, lang_module.__all__[0])
    LANGUAGES[lang] = lang_cls
    return lang_cls
|
|
|
|
|
|
def set_lang_class(name, cls):
    """Register a custom Language class so get_lang_class can return it.

    name (unicode): Name to register the class under.
    cls (Language): Language class.
    """
    LANGUAGES.update({name: cls})
|
|
|
|
|
|
def get_data_path(require_exists=True):
    """Return the current spaCy data directory.

    require_exists (bool): If true, return None when the directory is missing.
    RETURNS (Path or None): Data path or None.
    """
    if require_exists and not _data_path.exists():
        return None
    return _data_path
|
|
|
|
|
|
def set_data_path(path):
    """Point the module at a different spaCy data directory.

    path (unicode or Path): Path to new data directory.
    """
    global _data_path
    new_path = ensure_path(path)
    _data_path = new_path
|
|
|
|
|
|
def ensure_path(path):
    """Convert a string to a Path and pass anything else through untouched.

    path: Anything. Only string values are converted.
    RETURNS: Path if a string was given, otherwise the original argument.
    """
    return Path(path) if isinstance(path, basestring_) else path
|
|
|
|
|
|
def load_model(name, **overrides):
    """Load a model, trying shortcut link, installed package and file-system
    path in that order.

    name (unicode): Package name, shortcut link or model path.
    **overrides: Specific overrides, like pipeline components to disable.
    RETURNS (Language): `Language` class with the loaded model.
    """
    data_path = get_data_path()
    if not (data_path and data_path.exists()):
        raise IOError(Errors.E049.format(path=path2str(data_path)))
    if isinstance(name, basestring_):
        # 1. an entry of that name in the data directory (shortcut link)
        linked_names = set(entry.name for entry in data_path.iterdir())
        if name in linked_names:
            return load_model_from_link(name, **overrides)
        # 2. an installed package
        if is_package(name):
            return load_model_from_package(name, **overrides)
        # 3. a path to a model data directory
        candidate = Path(name)
        if candidate.exists():
            return load_model_from_path(candidate, **overrides)
    elif hasattr(name, 'exists'):
        # Path or Path-like object pointing at the model data
        return load_model_from_path(name, **overrides)
    raise IOError(Errors.E050.format(name=name))
|
|
|
|
|
|
def load_model_from_link(name, **overrides):
    """Load a model through the `__init__.py` found under `name` in the
    spaCy data path (a shortcut link or a plain directory)."""
    init_file = get_data_path() / name / '__init__.py'
    try:
        model_module = import_file(name, init_file)
    except AttributeError:
        raise IOError(Errors.E051.format(name=name))
    return model_module.load(**overrides)
|
|
|
|
|
|
def load_model_from_package(name, **overrides):
    """Import the package called `name` and return the result of its
    `load()` function, forwarding the overrides."""
    package = importlib.import_module(name)
    return package.load(**overrides)
|
|
|
|
|
|
def load_model_from_path(model_path, meta=False, **overrides):
    """Build a Language object from a model data directory: create the class
    named in the meta, add the pipeline components the meta lists (minus
    disabled ones), then call from_disk() with the path.

    model_path (unicode or Path): Path to the model data directory.
    meta (dict): Model meta; read from the directory when falsy.
    **overrides: Passed to the Language class; 'disable' lists the
        pipeline component names to skip.
    RETURNS (Language): The result of `nlp.from_disk(model_path)`.
    """
    meta = meta or get_model_meta(model_path)
    lang_cls = get_lang_class(meta['lang'])
    nlp = lang_cls(meta=meta, **overrides)
    pipe_names = meta.get('pipeline', [])
    disabled = overrides.get('disable', [])
    if pipe_names is True:
        # `true` in the meta means "use the language defaults"
        pipe_names = nlp.Defaults.pipe_names
    elif pipe_names in (False, None):
        pipe_names = []
    for pipe_name in pipe_names:
        if pipe_name in disabled:
            continue
        pipe_config = meta.get('pipeline_args', {}).get(pipe_name, {})
        pipe = nlp.create_pipe(pipe_name, config=pipe_config)
        nlp.add_pipe(pipe, name=pipe_name)
    return nlp.from_disk(model_path)
|
|
|
|
|
|
def load_model_from_init_py(init_file, **overrides):
    """Helper function to use in the `load()` method of a model package's
    __init__.py.

    The model data is expected in a sub-directory of the package named
    `<lang>_<name>-<version>`, built from the package's meta.json.

    init_file (unicode): Path to model's __init__.py, i.e. `__file__`.
    **overrides: Specific overrides, like pipeline components to disable.
    RETURNS (Language): `Language` class with loaded model.
    RAISES (IOError): If the model data directory does not exist.
    """
    model_path = Path(init_file).parent
    # get_model_meta() already raises if model_path itself is missing.
    meta = get_model_meta(model_path)
    data_dir = '%s_%s-%s' % (meta['lang'], meta['name'], meta['version'])
    data_path = model_path / data_dir
    # Check the directory that is actually loaded (and named in the error);
    # checking model_path here could never fail after the call above.
    if not data_path.exists():
        raise IOError(Errors.E052.format(path=path2str(data_path)))
    return load_model_from_path(data_path, meta, **overrides)
|
|
|
|
|
|
def get_model_meta(path):
    """Read meta.json from a model directory and check its required keys.

    path (unicode or Path): Path to model directory.
    RETURNS (dict): The model's meta data.
    RAISES (IOError): If the directory or its meta.json is missing.
    RAISES (ValueError): If 'lang', 'name' or 'version' is absent or empty.
    """
    model_dir = ensure_path(path)
    if not model_dir.exists():
        raise IOError(Errors.E052.format(path=path2str(model_dir)))
    meta_file = model_dir / 'meta.json'
    if not meta_file.is_file():
        raise IOError(Errors.E053.format(path=meta_file))
    meta = read_json(meta_file)
    for required in ('lang', 'name', 'version'):
        has_value = required in meta and meta[required]
        if not has_value:
            raise ValueError(Errors.E054.format(setting=required))
    return meta
|
|
|
|
|
|
def is_package(name):
    """Check whether a name matches a distribution in the pkg_resources
    working set.

    name (unicode): Name of package.
    RETURNS (bool): True if installed package, False if not.
    """
    wanted = name.lower()
    installed = pkg_resources.working_set.by_key.keys()
    # Keys are compared lowercased and with '-' replaced by '_'.
    return any(key.lower().replace('-', '_') == wanted for key in installed)
|
|
|
|
|
|
def get_package_path(name):
    """Locate the directory of an importable package.

    name (unicode): Package name; it is lowercased before importing.
    RETURNS (Path): Directory containing the imported module's file.
    """
    # The package is imported only to learn where it lives. That is
    # indirect, but finding it any other way is much harder.
    module = importlib.import_module(name.lower())
    module_file = Path(module.__file__)
    return module_file.parent
|
|
|
|
|
|
def is_in_jupyter():
    """Detect a Jupyter notebook by inspecting the IPython kernel config.
    Mainly used for the displaCy visualizer.

    RETURNS (bool): True if in Jupyter, False if not.
    """
    try:
        # get_ipython is only defined when running under IPython.
        kernel_cfg = get_ipython().config['IPKernelApp']
        in_notebook = kernel_cfg['parent_appname'] == 'ipython-notebook'
    except NameError:
        return False
    return True if in_notebook else False
|
|
|
|
|
|
def get_cuda_stream(require=False):
    """Create a CudaStream if the compat layer provides one.

    require (bool): Accepted but not used by this function.
    RETURNS (CudaStream or None): A new stream, or None if CudaStream is None.
    """
    if CudaStream is None:
        return None
    return CudaStream()
|
|
|
|
|
|
def get_async(stream, numpy_array):
    """Copy a numpy array into a new cupy array using the given stream.

    stream: Stream passed on to the cupy array's `set()` call.
    numpy_array: Source array.
    RETURNS: The cupy copy, or `numpy_array` itself when cupy is None.
    """
    if cupy is None:
        return numpy_array
    device_array = cupy.ndarray(numpy_array.shape, order='C',
                                dtype=numpy_array.dtype)
    device_array.set(numpy_array, stream=stream)
    return device_array
|
|
|
|
|
|
def env_opt(name, default=None):
    """Read a numeric option from the environment.

    `SPACY_<NAME>` (upper-cased name) is checked first, then `name` as
    given. The value is converted with float() if `default` is exactly a
    float, otherwise with int().

    name (unicode): Option name.
    default: Value returned, unconverted, when neither variable is set.
    RETURNS: The converted environment value, or `default`.
    """
    convert = float if type(default) is float else int
    prefixed = 'SPACY_' + name.upper()
    candidates = ((prefixed, "$SPACY_" + name.upper()), (name, '$' + name))
    for env_key, shown_as in candidates:
        if env_key in os.environ:
            value = convert(os.environ[env_key])
            if _PRINT_ENV:
                print(name, "=", repr(value), "via", shown_as)
            return value
    if _PRINT_ENV:
        print(name, '=', repr(default), "by default")
    return default
|
|
|
|
|
|
def read_regex(path):
    """Compile a regex that matches any non-blank line of a file, literally,
    at the start of a string.

    path (unicode or Path): File with one literal entry per line.
    RETURNS: Compiled pattern of `^`-anchored, escaped alternatives.
    """
    path = ensure_path(path)
    with path.open() as file_:
        lines = file_.read().split('\n')
    alternatives = ['^' + re.escape(line) for line in lines if line.strip()]
    return re.compile('|'.join(alternatives))
|
|
|
|
|
|
def compile_prefix_regex(entries):
    """Compile prefix entries into a single `^`-anchored alternation.

    entries: Prefix patterns; blank entries are skipped. If a bare '(' is
        among them, the data is treated as deprecated literal strings and
        every entry is escaped first.
    RETURNS: Compiled pattern.
    """
    escape_all = '(' in entries  # marker of deprecated data
    pieces = []
    for entry in entries:
        if not entry.strip():
            continue
        pieces.append('^' + (re.escape(entry) if escape_all else entry))
    return re.compile('|'.join(pieces))
|
|
|
|
|
|
def compile_suffix_regex(entries):
    """Compile suffix entries into a single `$`-anchored alternation.

    entries: Suffix patterns; blank entries are skipped.
    RETURNS: Compiled pattern.
    """
    anchored = (entry + '$' for entry in entries if entry.strip())
    return re.compile('|'.join(anchored))
|
|
|
|
|
|
def compile_infix_regex(entries):
    """Compile infix entries into a single unanchored alternation.

    entries: Infix patterns; blank entries are skipped.
    RETURNS: Compiled pattern.
    """
    kept = (entry for entry in entries if entry.strip())
    return re.compile('|'.join(kept))
|
|
|
|
|
|
def add_lookups(default_func, *lookups):
    """Wrap an attribute function with special-case tables. The returned
    getter answers from the first lookup containing the word and falls back
    to `default_func` otherwise.

    default_func (callable): The default function to execute.
    *lookups (dict): Lookup dictionary mapping string to attribute value.
    RETURNS (callable): Lexical attribute getter.
    """
    # A functools.partial over a module-level function is used rather than
    # a closure so that the getter can be pickled.
    getter = functools.partial(_get_attr_unless_lookup, default_func, lookups)
    return getter
|
|
|
|
|
|
def _get_attr_unless_lookup(default_func, lookups, string):
    """Return the value from the first lookup that contains `string`, or
    `default_func(string)` if none does.

    default_func (callable): Fallback, called with the string.
    lookups (iterable): Lookup tables, checked in order.
    string (unicode): Key to look up.
    """
    containing = (table for table in lookups if string in table)
    first_hit = next(containing, None)
    if first_hit is None:
        return default_func(string)
    return first_hit[string]
|
|
|
|
|
|
def update_exc(base_exceptions, *addition_dicts):
    """Merge tokenizer exception dicts into a copy of the base exceptions,
    validating each addition. Later dicts overwrite earlier entries.

    base_exceptions (dict): Base exceptions.
    *addition_dicts (dict): Exceptions to add to the base dict, in order.
    RETURNS (dict): Combined tokenizer exceptions.
    RAISES (ValueError): If an ORTH value is not unicode, or if the ORTH
        values of an entry do not concatenate to its key.
    """
    combined = dict(base_exceptions)
    for additions in addition_dicts:
        for orth, token_attrs in additions.items():
            has_bad_orth = any(not isinstance(attr[ORTH], unicode_)
                               for attr in token_attrs)
            if has_bad_orth:
                raise ValueError(Errors.E055.format(key=orth, orths=token_attrs))
            joined = ''.join(attr[ORTH] for attr in token_attrs)
            if joined != orth:
                raise ValueError(Errors.E056.format(key=orth, orths=joined))
        combined.update(additions)
    # Duplicate every apostrophe entry with a typographic apostrophe.
    return expand_exc(combined, "'", "’")
|
|
|
|
|
|
def expand_exc(excs, search, replace):
    """Add a variant of every exception whose key contains `search`, with
    `search` replaced in the key and in each token's ORTH. Used, for
    example, to add versions with typographic apostrophes.

    excs (dict): Tokenizer exceptions.
    search (unicode): String to find and replace.
    replace (unicode): Replacement.
    RETURNS (dict): Copy of `excs` including the added variants.
    """
    def _with_replacement(token):
        # Work on a copy so the original token dict is left untouched.
        variant = dict(token)
        variant[ORTH] = variant[ORTH].replace(search, replace)
        return variant

    expanded = dict(excs)
    for key, tokens in excs.items():
        if search not in key:
            continue
        variant_key = key.replace(search, replace)
        expanded[variant_key] = [_with_replacement(t) for t in tokens]
    return expanded
|
|
|
|
|
|
def normalize_slice(length, start, stop, step=None):
    """Resolve slice bounds against a sequence length.

    length (int): Length of the sequence being sliced.
    start (int or None): Slice start; None means 0, negatives count from
        the end.
    stop (int or None): Slice stop; None means `length`, negatives count
        from the end.
    step (int or None): Only None and 1 are accepted.
    RETURNS (tuple): `(start, stop)` with 0 <= start <= stop <= length.
    RAISES (ValueError): If a step other than None or 1 is given.
    """
    if step is not None and step != 1:
        raise ValueError(Errors.E057)

    def _resolve(index, default, lower):
        if index is None:
            index = default
        elif index < 0:
            index += length
        return min(length, max(lower, index))

    start = _resolve(start, 0, 0)
    # stop is clamped to start, so the resulting slice is never reversed
    stop = _resolve(stop, length, start)
    return start, stop
|
|
|
|
|
|
def minibatch(items, size=8):
    """Yield successive lists of items. `size` is either an int, used for
    every batch, or an iterator yielding the size of each batch in turn, so
    that batch size can vary on each step. Iteration ends with the first
    empty batch.
    """
    sizes = itertools.repeat(size) if isinstance(size, int) else size
    stream = iter(items)
    while True:
        batch_size = int(next(sizes))
        batch = list(itertools.islice(stream, batch_size))
        if not batch:
            return
        yield batch
|
|
|
|
|
|
def compounding(start, stop, compound):
    """Yield an infinite series of compounding values. The first value is
    `start`; each following one is the previous value multiplied by
    `compound`. Values are clipped so they never pass `stop`.

    EXAMPLE:
        >>> sizes = compounding(1., 10., 1.5)
        >>> assert next(sizes) == 1.
        >>> assert next(sizes) == 1 * 1.5
        >>> assert next(sizes) == 1.5 * 1.5
    """
    # A series that starts above `stop` is bounded from below, else above.
    bound = max if start > stop else min
    value = float(start)
    while True:
        yield bound(value, stop)
        value *= compound
|
|
|
|
|
|
def decaying(start, stop, decay):
    """Yield an infinite series of decaying values. The n-th value
    (n = 1, 2, ...) is `start / (1 + decay * n)`, clipped so that it never
    passes `stop`."""
    # A series that starts above `stop` is bounded from below, else above.
    bound = max if start > stop else min
    nr_upd = 1.
    while True:
        yield bound(start * 1./(1. + decay * nr_upd), stop)
        nr_upd += 1
|
|
nr_upd += 1
|
|
|
|
|
|
def itershuffle(iterable, bufsize=1000):
    """Shuffle an iterator. This works by holding `bufsize` items back
    and yielding them sometime later. Obviously, this is not unbiased –
    but should be good enough for batching. Larger bufsize means less bias.
    From https://gist.github.com/andres-erbsen/1307752

    iterable (iterable): Iterator to shuffle.
    bufsize (int): Items to hold back.
    YIELDS (iterable): The shuffled iterator.
    """
    iterable = iter(iterable)
    buf = []
    try:
        while True:
            # Top the buffer up with a random number of items. The built-in
            # next() works on Python 2 and 3; the `.next()` method exists
            # only on Python 2 iterators.
            for _ in range(random.randint(1, bufsize - len(buf))):
                buf.append(next(iterable))
            random.shuffle(buf)
            # Release a random number of the buffered items.
            for _ in range(random.randint(1, bufsize)):
                if buf:
                    yield buf.pop()
                else:
                    break
    except StopIteration:
        # Source exhausted: flush what is left. The generator then simply
        # ends; raising StopIteration here would surface as RuntimeError
        # under PEP 479.
        random.shuffle(buf)
        while buf:
            yield buf.pop()
|
|
|
|
|
|
def read_json(location):
    """Parse the JSON file at the given location.

    location (Path): Path to JSON file.
    RETURNS (dict): Loaded JSON content.
    """
    json_path = ensure_path(location)
    with json_path.open('r', encoding='utf8') as file_:
        content = ujson.load(file_)
    return content
|
|
|
|
|
|
def get_raw_input(description, default=False):
    """Prompt the user on the command line via raw_input / input.

    description (unicode): Text to display before prompt.
    default (unicode or False/None): Default value to display with prompt.
    RETURNS (unicode): User input.
    """
    if default:
        additional = ' (default: %s)' % default
    else:
        additional = ''
    return input_(' %s%s: ' % (description, additional))
|
|
|
|
|
|
def to_bytes(getters, exclude):
    """Serialize the results of the getters to a msgpack byte string.

    getters (dict): Maps each key to a zero-argument callable.
    exclude: Container of keys to leave out.
    RETURNS (bytes): msgpack dump of an OrderedDict of the results.
    """
    serialized = OrderedDict(
        (key, getter()) for key, getter in getters.items()
        if key not in exclude)
    return msgpack.dumps(serialized, use_bin_type=True, encoding='utf8')
|
|
|
|
|
|
def from_bytes(bytes_data, setters, exclude):
    """Unpack msgpack data and pass each value to its setter.

    bytes_data (bytes): Data to load with msgpack.
    setters (dict): Maps each key to a one-argument callable.
    exclude: Container of keys to skip.
    RETURNS: The unpacked message.
    """
    msg = msgpack.loads(bytes_data, raw=False)
    for key, setter in setters.items():
        # skip excluded keys and keys the message does not contain
        if key in exclude or key not in msg:
            continue
        setter(msg[key])
    return msg
|
|
|
|
|
|
def to_disk(path, writers, exclude):
    """Call each writer with its own sub-path of `path`, creating the
    directory first if it does not exist.

    path (unicode or Path): Target directory.
    writers (dict): Maps each key to a callable taking `path / key`.
    exclude: Container of keys to skip.
    RETURNS (Path): The target directory.
    """
    path = ensure_path(path)
    if not path.exists():
        path.mkdir()
    for key, writer in writers.items():
        if key in exclude:
            continue
        writer(path / key)
    return path
|
|
|
|
|
|
def from_disk(path, readers, exclude):
    """Call each reader with its own sub-path of `path`.

    path (unicode or Path): Source directory.
    readers (dict): Maps each key to a callable taking `path / key`.
    exclude: Container of keys to skip.
    RETURNS (Path): The source directory.
    """
    path = ensure_path(path)
    for key, reader in readers.items():
        if key in exclude:
            continue
        reader(path / key)
    return path
|
|
|
|
|
|
def print_table(data, title=None):
    """Print label/value pairs as left-aligned columns.

    data (dict or list of tuples): Label/value pairs.
    title (unicode or None): Title, will be printed above.
    """
    rows = list(data.items()) if isinstance(data, dict) else data
    # one 15-character, left-aligned column per element of the first row
    row_template = ' {:<15}' * len(rows[0])
    lines = [row_template.format(label, unicode_(value))
             for label, value in rows]
    if title:
        print('\n \033[93m{}\033[0m'.format(title))
    print('\n{}\n'.format('\n'.join(lines)))
|
|
|
|
|
|
def print_markdown(data, title=None):
    """Print label/value pairs as a GitHub-flavoured Markdown list, for
    issues etc. String values that are existing paths are left out.

    data (dict or list of tuples): Label/value pairs.
    title (unicode or None): Title, will be rendered as headline 2.
    """
    def excl_value(value):
        # an existing path counts as personal info and is not printed
        return isinstance(value, basestring_) and Path(value).exists()

    rows = list(data.items()) if isinstance(data, dict) else data
    bullets = []
    for label, value in rows:
        if excl_value(value):
            continue
        bullets.append("* **{}:** {}".format(label, unicode_(value)))
    if title:
        print("\n## {}".format(title))
    print('\n{}\n'.format('\n'.join(bullets)))
|
|
|
|
|
|
def prints(*texts, **kwargs):
    """Print a wrapped, formatted message (ANSI escape sequences are written
    by hand to avoid a dependency).

    *texts (unicode): Texts to print. Each argument is rendered as paragraph.
    **kwargs: 'title' becomes coloured headline. If 'exits' is given and is
        not None, sys.exit() is called with it after printing.
    """
    exits = kwargs.get('exits', None)
    title = kwargs.get('title', None)
    headline = '\033[93m{}\033[0m\n'.format(_wrap(title)) if title else ''
    paragraphs = '\n\n'.join(_wrap(text) for text in texts)
    print('\n{}{}\n'.format(headline, paragraphs))
    if exits is not None:
        sys.exit(exits)
|
|
|
|
|
|
def _wrap(text, wrap_max=80, indent=4):
    """Indent and wrap text with textwrap.fill, never breaking long words
    or splitting at hyphens.

    text (unicode): Text to wrap. If it's a Path, it's converted to string.
    wrap_max (int): Maximum line length (indent is deducted).
    indent (int): Number of spaces for indentation.
    RETURNS (unicode): Wrapped text.
    """
    padding = ' ' * indent
    if isinstance(text, Path):
        text = path2str(text)
    fill_options = dict(width=wrap_max - len(padding),
                        initial_indent=padding,
                        subsequent_indent=padding,
                        break_long_words=False,
                        break_on_hyphens=False)
    return textwrap.fill(text, **fill_options)
|
|
|
|
|
|
def minify_html(html):
    """Perform a template-specific, rudimentary HTML minification for displaCy.
    Disclaimer: NOT a general-purpose solution, only removes indentation and
    newlines.

    html (unicode): Markup to minify.
    RETURNS (unicode): "Minified" HTML.
    """
    # Strip only the leading whitespace of each line, then join the lines.
    # Deleting every space would also remove the spaces that separate tag
    # names, attributes and words, and so corrupt the markup.
    lines = html.strip().split('\n')
    return ''.join(line.lstrip() for line in lines)
|
|
|
|
|
|
def escape_html(text):
    """Replace <, >, &, " with their HTML encoded representation. Intended to
    prevent HTML errors in rendered displaCy markup.

    text (unicode): The original text.
    RETURNS (unicode): Equivalent text to be safely used within HTML.
    """
    # '&' has to be replaced first, otherwise the ampersands of the
    # entities inserted below would be escaped a second time.
    text = text.replace('&', '&amp;')
    text = text.replace('<', '&lt;')
    text = text.replace('>', '&gt;')
    text = text.replace('"', '&quot;')
    return text
|
|
|
|
|
|
def use_gpu(gpu_id):
    """Select a CUDA device and install CupyOps on thinc's Model class.

    gpu_id: Device id passed to `cupy.cuda.device.Device`.
    RETURNS: The selected device, or None if cupy cannot be imported.
    """
    try:
        import cupy.cuda.device
    except ImportError:
        return None
    from thinc.neural.ops import CupyOps
    gpu = cupy.cuda.device.Device(gpu_id)
    gpu.use()
    # Set both the shared ops instance and the ops class on Model.
    Model.ops = CupyOps()
    Model.Ops = CupyOps
    return gpu
|
|
|
|
|
|
def fix_random_seed(seed=0):
    """Seed both the stdlib `random` module and `numpy.random`.

    seed (int): Seed value handed to both generators.
    """
    for seed_func in (random.seed, numpy.random.seed):
        seed_func(seed)
|
|
|
|
|
|
class SimpleFrozenDict(dict):
    """Dict subclass whose item assignment, `pop` and `update` raise instead
    of modifying it. Mainly used as default function or method argument (for
    arguments that should default to an empty dictionary), so that an
    attempt by the user or spaCy to add to the dict raises an error.
    """

    def __setitem__(self, key, value):
        # item assignment is blocked
        raise NotImplementedError(Errors.E095)

    def pop(self, key, default=None):
        # removing a key is blocked
        raise NotImplementedError(Errors.E095)

    def update(self, other):
        # bulk updates are blocked
        raise NotImplementedError(Errors.E095)
|