import copy
import numpy

from .neural._classes.model import Model
from . import check
from .check import equal_axis
from . import describe


class FunctionLayer(Model):
    '''Wrap functions into weightless Model instances, for use as network
    components.'''
    def __init__(self, begin_update, predict=None, predict_one=None,
                 nI=None, nO=None, *args, **kwargs):
        self.begin_update = begin_update
        if predict is not None:
            self.predict = predict
        if predict_one is not None:
            self.predict_one = predict_one
        self.nI = nI
        self.nO = nO
        Model.__init__(self)


def _run_child_hooks(model, X, y):
    for layer in model._layers:
        for hook in layer.on_data_hooks:
            hook(layer, X, y)
        X = layer(X)


@describe.on_data(_run_child_hooks)
class FeedForward(Model):
    '''A feed-forward network that chains multiple Model instances together.'''
    name = 'feed-forward'

    def __init__(self, layers, **kwargs):
        self._layers = []
        for layer in layers:
            if isinstance(layer, FeedForward):
                # Flatten nested FeedForward instances into a single chain.
                self._layers.extend(layer._layers)
            else:
                self._layers.append(layer)
        Model.__init__(self, **kwargs)

    @property
    def input_shape(self):
        return self._layers[0].input_shape

    @property
    def output_shape(self):
        return self._layers[-1].output_shape

    def predict(self, X):
        for layer in self._layers:
            X = layer(X)
        return X

    def begin_update(self, X, drop=0.):
        callbacks = []
        for layer in self._layers:
            X, inc_layer_grad = layer.begin_update(X, drop=drop)
            callbacks.append(inc_layer_grad)

        def continue_update(gradient, sgd=None):
            # Walk the saved callbacks in reverse to backprop through the chain.
            for callback in reversed(callbacks):
                if gradient is None or callback is None:
                    break
                gradient = callback(gradient, sgd)
            return gradient
        return X, continue_update


def layerize(begin_update=None, predict=None, *args, **kwargs):
    '''Wrap a function into a layer.'''
    if begin_update is not None:
        return FunctionLayer(begin_update, predict=predict, *args, **kwargs)

    def wrapper(begin_update):
        return FunctionLayer(begin_update, *args, **kwargs)
    return wrapper


def metalayerize(user_func):
    '''Wrap a function over a sequence of layers and an input into a layer.'''
    def returned(layers, *args, **kwargs):
        def begin_update(X, *args, **kwargs):
            return user_func(layers, X, *args, **kwargs)
        return FunctionLayer(begin_update, *args, **kwargs)
    return returned
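
# Illustrative sketch only (the doubling function below is a hypothetical
# stand-in, not part of the library): `layerize` turns any function with the
# (X, drop) -> (Y, backprop) signature into a weightless layer.
#
#     def double(X, drop=0.):
#         return X * 2, lambda dY, sgd=None: dY * 2
#
#     layer = layerize(double)
#     Y, backprop = layer.begin_update(X)    # Y == X * 2
#     dX = backprop(dY)                      # dX == dY * 2
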
@layerize
def flatten_add_lengths(seqs, pad=0, drop=0.):
    ops = Model.ops
    lengths = ops.asarray([len(seq) for seq in seqs], dtype='i')

    def finish_update(d_X, sgd=None):
        return ops.unflatten(d_X, lengths, pad=pad)
    X = ops.flatten(seqs, pad=pad)
    return (X, lengths), finish_update


def remap_ids(ops=None, column=0):
    id_map = {0: 0}

    def remap_ids_fwd(ids, drop=0.):
        ids = ids[:, column]
        if not isinstance(ids, numpy.ndarray):
            ids = ids.get()
        n_vector = len(id_map)
        for i, id_ in enumerate(ids):
            id_ = int(id_)
            if id_ not in id_map:
                # Assign the next free row to ids we haven't seen before.
                id_map[id_] = n_vector
                n_vector += 1
            ids[i] = id_map[id_]
        return ops.asarray(ids), None

    model = layerize(remap_ids_fwd)
    if ops is None:
        ops = model.ops
    return model


def with_getitem(idx, layer):
    def begin_update(items, drop=0.):
        X, finish = layer.begin_update(items[idx], drop=drop)
        return items[:idx] + (X,) + items[idx+1:], finish
    model = layerize(begin_update)
    model._layers.append(layer)

    def on_data(self, items, y):
        for hook in layer.on_data_hooks:
            hook(layer, items[idx], y)
    model.on_data_hooks.append(on_data)
    return model


def noop(*layers):
    '''Transform a sequence of layers into a null operation.'''
    def begin_update(X, drop=0.):
        return X, lambda D, *a, **k: D
    return begin_update


def chain(*layers):
    '''Compose two models `f` and `g` such that they become layers of a single
    feed-forward model that computes `g(f(x))`.
    Raises an exception if their dimensions don't match.
    '''
    if len(layers) == 0:
        return FeedForward([])
    elif len(layers) == 1:
        return layers[0]
    else:
        return FeedForward(layers)


def clone(orig, n):
    '''Construct `n` copies of a layer, with distinct weights.
    i.e. `clone(f, 3)(x)` computes `f(f'(f''(x)))`.
    '''
    if n == 0:
        return layerize(noop())
    layers = [orig]
    for i in range(n-1):
        layers.append(copy.deepcopy(orig))
        layers[-1].set_id()
    return FeedForward(layers)
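
# Illustrative sketch only (assumes two already-constructed layers `f` and
# `g`, and an `optimizer` built elsewhere): `chain` merges layers into one
# FeedForward model, and `clone` stacks deep copies of a layer, each with
# its own weights.
#
#     model = chain(f, g)        # model(x) computes g(f(x))
#     deep = clone(f, 3)         # three separately-weighted copies of f
#     Y, backprop = model.begin_update(X)
#     dX = backprop(dY, sgd=optimizer)
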
def concatenate(*layers): # pragma: no cover
    '''Compose two or more models `f`, `g`, etc., such that their outputs are
    concatenated, i.e. `concatenate(f, g)(x)` computes `hstack(f(x), g(x))`.
    '''
    if not layers:
        # Wrap the no-op forward function so callers always get a Model.
        return layerize(noop())
    ops = layers[0].ops

    def begin_update(X, *a, **k):
        forward, backward = split_backward(layers)
        values = [fwd(X, *a, **k) for fwd in forward]
        output = ops.xp.hstack(values)
        shapes = [val.shape for val in values]

        def finish_update(gradient, *args, **kwargs):
            layer_grads = []
            start = 0
            for bwd, shape in zip(backward, shapes):
                end = start + shape[1]
                if bwd is not None:
                    d = bwd(ops.xp.ascontiguousarray(gradient[:, start : end]),
                            *args, **kwargs)
                    if d is not None and hasattr(X, 'shape'):
                        # Sum each layer's input gradient into a single array.
                        if not layer_grads:
                            layer_grads.append(d)
                        else:
                            layer_grads[-1] += d
                start = end
            if layer_grads:
                return ops.asarray(layer_grads[-1])
            else:
                return None
        return output, finish_update
    layer = FunctionLayer(begin_update)
    layer._layers = list(layers)

    def on_data(self, X, y=None):
        for layer in self._layers:
            for hook in layer.on_data_hooks:
                hook(layer, X, y)
    layer.on_data_hooks.append(on_data)
    return layer


def add(*layers):
    if not layers:
        return layerize(noop())
    ops = layers[0].ops

    def forward(X, drop=0.):
        outs, callbacks = zip(*[lyr.begin_update(X, drop=drop)
                                for lyr in layers])
        out = outs[0]
        for o in outs[1:]:
            out += o

        def backward(d_out, sgd=None):
            grads = [bp(d_out, sgd=sgd) for bp in callbacks if bp is not None]
            grads = [g for g in grads if g is not None]
            if grads:
                total = grads[0]
                for g in grads[1:]:
                    total += g
                return total
            else:
                return None
        return out, backward
    model = layerize(forward)
    model._layers = list(layers)

    def on_data(self, X, y):
        for layer in layers:
            for hook in layer.on_data_hooks:
                hook(layer, X, y)
    model.on_data_hooks.append(on_data)
    return model


def split_backward(layers): # pragma: no cover
    '''Separate a sequence of layers' `begin_update` methods into two lists of
    functions: one that computes the forward values, and the other that
    completes the backward pass. The backward sequence is only populated after
    the forward functions have been applied.
    '''
    backward = []
    forward = [sink_return(op.begin_update, backward.append)
               for op in layers]
    return forward, backward


def sink_return(func, sink, splitter=None): # pragma: no cover
    '''Transform a function `func` that returns tuples into a function that
    returns single values. Call a function `sink` on the unused values.
    '''
    def wrap(*args, **kwargs):
        output = func(*args, **kwargs)
        if splitter is None:
            to_keep, to_sink = output
        else:
            to_keep, to_sink = splitter(*output)
        sink(to_sink)
        return to_keep
    return wrap


def Arg(i):
    @layerize
    def begin_update(batched_inputs, drop=0.):
        inputs = list(zip(*batched_inputs))
        return inputs[i], None
    return begin_update


def with_flatten(layer, pad=0, ndim=4):
    def begin_update(seqs_in, drop=0.):
        lengths = layer.ops.asarray([len(seq) for seq in seqs_in])
        X, bp_layer = layer.begin_update(layer.ops.flatten(seqs_in, pad=pad),
                                         drop=drop)
        if bp_layer is None:
            return layer.ops.unflatten(X, lengths, pad=pad), None

        def finish_update(d_seqs_out, sgd=None):
            d_X = bp_layer(layer.ops.flatten(d_seqs_out, pad=pad), sgd=sgd)
            if d_X is None:
                return None
            else:
                return layer.ops.unflatten(d_X, lengths, pad=pad)
        return layer.ops.unflatten(X, lengths, pad=pad), finish_update

    def predict(seqs_in):
        lengths = layer.ops.asarray([len(seq) for seq in seqs_in])
        X = layer(layer.ops.flatten(seqs_in, pad=pad))
        return layer.ops.unflatten(X, lengths, pad=pad)

    model = layerize(begin_update, predict=predict)
    model._layers.append(layer)
    model.on_data_hooks.append(_with_flatten_on_data)
    model.name = 'flatten'
    return model


def _with_flatten_on_data(model, X, y):
    X = model.ops.flatten(X)
    for layer in model._layers:
        for hook in layer.on_data_hooks:
            hook(layer, X, y)
        X = layer(X)
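
# Illustrative sketch only (assumes an already-constructed layer `affine`
# that maps arrays to arrays, and an `optimizer` built elsewhere):
# `with_flatten` lets such a layer operate on a list of variable-length
# sequences by flattening the batch, applying the layer once, and
# unflattening the result (and the gradient) back into per-sequence pieces.
#
#     seq_model = with_flatten(affine)
#     ys, backprop = seq_model.begin_update([seq1, seq2, seq3])
#     d_seqs = backprop(d_ys, sgd=optimizer)
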
def get_word_ids(ops, pad=1, token_drop=0., ignore=None):
    # TODO: Is this made obsolete by the FeatureExtracter?
    def forward(docs, drop=0.):
        '''Get word forms.'''
        seqs = []
        for doc in docs:
            if ignore is not None:
                doc = [token for token in doc if not ignore(token)]
            #seq = [0] * pad
            seq = [(token.lex_id or token.orth) for token in doc]
            #seq += [0] * pad
            seqs.append(ops.asarray(seq, dtype='uint64'))
        return seqs, None
    return layerize(forward)


def FeatureExtracter(attrs, ops=None):
    if ops is None:
        ops = Model.ops

    def feature_extracter_fwd(docs, drop=0.):
        # Handle spans
        def get_feats(doc):
            if hasattr(doc, 'to_array'):
                return doc.to_array(attrs)
            else:
                return doc.doc.to_array(attrs)[doc.start : doc.end]
        features = [ops.asarray(get_feats(doc), dtype='uint64')
                    for doc in docs]

        def feature_extracter_bwd(d_features, sgd=None):
            return d_features
        return features, feature_extracter_bwd
    return layerize(feature_extracter_fwd)


def wrap(func, *child_layers):
    model = layerize(func)
    model._layers.extend(child_layers)

    def on_data(self, X, y):
        for child in self._layers:
            for hook in child.on_data_hooks:
                hook(child, X, y)
    model.on_data_hooks.append(on_data)
    return model


def uniqued(layer, column=0):
    '''Group inputs to a layer, so that the layer only has to compute for the
    unique values. The data is transformed back before output, and the same
    transformation is applied for the gradient. Effectively, this is a cache
    local to each minibatch.

    The uniqued wrapper is useful for word inputs, because common words are
    seen often, but we may want to compute complicated features for the words,
    using e.g. a character LSTM.
    '''
    def uniqued_fwd(X, drop=0.):
        keys = X[:, column]
        keys = layer.ops.xp.ascontiguousarray(keys)
        if not isinstance(keys, numpy.ndarray):
            keys = keys.get()
        uniq_keys, ind, inv, counts = numpy.unique(keys, return_index=True,
                                                   return_inverse=True,
                                                   return_counts=True)
        X_uniq = layer.ops.xp.ascontiguousarray(X[ind])
        Y_uniq, bp_Y_uniq = layer.begin_update(X_uniq, drop=drop)
        Y = Y_uniq[inv].reshape((X.shape[0],) + Y_uniq.shape[1:])

        def uniqued_bwd(dY, sgd=None):
            dY_uniq = layer.ops.allocate(Y_uniq.shape, dtype='f')
            layer.ops.scatter_add(dY_uniq, layer.ops.asarray(inv, dtype='i'),
                                  dY)
            d_uniques = bp_Y_uniq(dY_uniq, sgd=sgd)
            if d_uniques is not None:
                # Spread the summed gradient back over the duplicates,
                # averaging by the count of each unique row (counts is
                # reshaped to a column so the division broadcasts).
                dX = (d_uniques / counts.reshape((-1, 1)))[inv]
                return dX
            else:
                return None
        return Y, uniqued_bwd
    model = wrap(uniqued_fwd, layer)
    return model


def foreach(layer, drop_factor=1.0):
    '''Map a layer across list items.'''
    def foreach_fwd(docs, drop=0.):
        sents = []
        lengths = []
        for doc in docs:
            doc_sents = [sent for sent in doc if len(sent)]
            subset = [s for s in doc_sents
                      if numpy.random.random() >= drop * drop_factor]
            if subset:
                sents.extend(subset)
                lengths.append(len(subset))
            else:
                # Keep at least one item per document, chosen at random.
                numpy.random.shuffle(doc_sents)
                sents.append(doc_sents[0])
                lengths.append(1)
        flat, bp_flat = layer.begin_update(sents, drop=0.)
        output = layer.ops.unflatten(flat, lengths)

        def foreach_bwd(d_output, sgd=None):
            d_flat = layer.ops.flatten(d_output)
            d_sents = bp_flat(d_flat, sgd=sgd)
            if d_sents is None:
                return d_sents
            else:
                return layer.ops.unflatten(d_sents, lengths)
        return output, foreach_bwd
    model = wrap(foreach_fwd, layer)

    def _run_foreach_child_hooks(model, X, y):
        for layer in model._layers:
            for hook in layer.on_data_hooks:
                hook(layer, X[0], y[0])
    model.on_data_hooks = [_run_foreach_child_hooks]
    return model


def foreach_sentence(layer, drop_factor=1.0):
    '''Map a layer across sentences (assumes a spaCy-esque .sents interface).'''
    def sentence_fwd(docs, drop=0.):
        sents = []
        lengths = []
        for doc in docs:
            doc_sents = [sent for sent in doc.sents if len(sent)]
            subset = [s for s in doc_sents
                      if numpy.random.random() >= drop * drop_factor]
            if subset:
                sents.extend(subset)
                lengths.append(len(subset))
            else:
                numpy.random.shuffle(doc_sents)
                sents.append(doc_sents[0])
                lengths.append(1)
        flat, bp_flat = layer.begin_update(sents, drop=0.)
        output = layer.ops.unflatten(flat, lengths)

        def sentence_bwd(d_output, sgd=None):
            d_flat = layer.ops.flatten(d_output)
            d_sents = bp_flat(d_flat, sgd=sgd)
            if d_sents is None:
                return d_sents
            else:
                return layer.ops.unflatten(d_sents, lengths)
        return output, sentence_bwd
    model = wrap(sentence_fwd, layer)
    return model
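
# Illustrative sketch only (assumes an already-constructed, expensive
# per-row layer `embed`, e.g. a character-level encoder, and an `optimizer`
# built elsewhere): `uniqued` computes the layer once per distinct input row
# in the minibatch, broadcasts the result back over the duplicates, and
# averages the gradient over them on the backward pass.
#
#     cached = uniqued(embed, column=0)
#     Y, backprop = cached.begin_update(ids)    # ids: 2d array of features
#     d_ids = backprop(dY, sgd=optimizer)
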