# cython: profile=True
# coding: utf8
from __future__ import unicode_literals, print_function

import re
import ujson
import random
import cytoolz
import itertools

from .syntax import nonproj
from .tokens import Doc
from .errors import Errors
from . import util
from .util import minibatch
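
# Pool, attr_t, Transition and the GoldParseC struct behind `self.c` are used
# below without a Python-level import; they are assumed to be cimported via
# the accompanying gold.pxd, as is usual for spaCy's Cython modules.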

def tags_to_entities(tags):
    entities = []
    start = None
    for i, tag in enumerate(tags):
        if tag is None:
            continue
        if tag.startswith('O'):
            # TODO: We shouldn't be getting these malformed inputs. Fix this.
            if start is not None:
                start = None
            continue
        elif tag == '-':
            continue
        elif tag.startswith('I'):
            if start is None:
                raise ValueError(Errors.E067.format(tags=tags[:i+1]))
            continue
        if tag.startswith('U'):
            entities.append((tag[2:], i, i))
        elif tag.startswith('B'):
            start = i
        elif tag.startswith('L'):
            entities.append((tag[2:], start, i))
            start = None
        else:
            raise ValueError(Errors.E068.format(tag=tag))
    return entities
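
# Illustrative call: tags_to_entities turns a BILUO tag sequence into
# (label, start_token, end_token) triples, e.g.
#   tags_to_entities(['O', 'B-ORG', 'L-ORG', 'U-PER'])
#   -> [('ORG', 1, 2), ('PER', 3, 3)]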

def merge_sents(sents):
    m_deps = [[], [], [], [], [], []]
    m_brackets = []
    i = 0
    for (ids, words, tags, heads, labels, ner), brackets in sents:
        m_deps[0].extend(id_ + i for id_ in ids)
        m_deps[1].extend(words)
        m_deps[2].extend(tags)
        m_deps[3].extend(head + i for head in heads)
        m_deps[4].extend(labels)
        m_deps[5].extend(ner)
        m_brackets.extend((b['first'] + i, b['last'] + i, b['label'])
                          for b in brackets)
        i += len(ids)
    return [(m_deps, m_brackets)]
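
# merge_sents concatenates the per-sentence annotation tuples of a paragraph
# into one pseudo-sentence, offsetting token ids, head indices and bracket
# boundaries by the running token count `i`.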

def align(cand_words, gold_words):
    cost, edit_path = _min_edit_path(cand_words, gold_words)
    alignment = []
    i_of_gold = 0
    for move in edit_path:
        if move == 'M':
            alignment.append(i_of_gold)
            i_of_gold += 1
        elif move == 'S':
            alignment.append(None)
            i_of_gold += 1
        elif move == 'D':
            alignment.append(None)
        elif move == 'I':
            i_of_gold += 1
        else:
            raise Exception(move)
    return alignment
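
# align maps each candidate token to the index of its gold token, or None
# where the edit path marks a substitution or deletion. Illustrative call:
#   align(['I', 'like', 'London'], ['I', 'like', 'London']) == [0, 1, 2]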

punct_re = re.compile(r'\W')

def _min_edit_path(cand_words, gold_words):
    cdef:
        Pool mem
        int i, j, n_cand, n_gold
        int* curr_costs
        int* prev_costs
    # TODO: Fix this --- just do it properly, make the full edit matrix and
    # then walk back over it...
    # Preprocess inputs
    cand_words = [punct_re.sub('', w).lower() for w in cand_words]
    gold_words = [punct_re.sub('', w).lower() for w in gold_words]
    if cand_words == gold_words:
        return 0, ''.join(['M' for _ in gold_words])
    mem = Pool()
    n_cand = len(cand_words)
    n_gold = len(gold_words)
    # Levenshtein distance, except we need the history, and we may want
    # different costs. Mark operations with a string, and score the history
    # using _edit_cost.
    previous_row = []
    prev_costs = <int*>mem.alloc(n_gold + 1, sizeof(int))
    curr_costs = <int*>mem.alloc(n_gold + 1, sizeof(int))
    for i in range(n_gold + 1):
        cell = ''
        for j in range(i):
            cell += 'I'
        previous_row.append('I' * i)
        prev_costs[i] = i
    for i, cand in enumerate(cand_words):
        current_row = ['D' * (i + 1)]
        curr_costs[0] = i+1
        for j, gold in enumerate(gold_words):
            if gold.lower() == cand.lower():
                s_cost = prev_costs[j]
                i_cost = curr_costs[j] + 1
                d_cost = prev_costs[j + 1] + 1
            else:
                s_cost = prev_costs[j] + 1
                i_cost = curr_costs[j] + 1
                d_cost = prev_costs[j + 1] + (1 if cand else 0)
            if s_cost <= i_cost and s_cost <= d_cost:
                best_cost = s_cost
                best_hist = previous_row[j] + ('M' if gold == cand else 'S')
            elif i_cost <= s_cost and i_cost <= d_cost:
                best_cost = i_cost
                best_hist = current_row[j] + 'I'
            else:
                best_cost = d_cost
                best_hist = previous_row[j + 1] + 'D'
            current_row.append(best_hist)
            curr_costs[j+1] = best_cost
        previous_row = current_row
        for j in range(len(gold_words) + 1):
            prev_costs[j] = curr_costs[j]
            curr_costs[j] = 0
    return prev_costs[n_gold], previous_row[-1]

class GoldCorpus(object):
    """An annotated corpus, using the JSON file format. Manages
    annotations for tagging, dependency parsing and NER."""
    def __init__(self, train_path, dev_path, gold_preproc=True, limit=None):
        """Create a GoldCorpus.

        train_path (unicode or Path): File or directory of training data.
        dev_path (unicode or Path): File or directory of development data.
        limit (int): Upper bound on the number of annotated examples to use,
            or None for no limit.
        RETURNS (GoldCorpus): The newly created object.
        """
        self.train_path = util.ensure_path(train_path)
        self.dev_path = util.ensure_path(dev_path)
        self.limit = limit
        self.train_locs = self.walk_corpus(self.train_path)
        self.dev_locs = self.walk_corpus(self.dev_path)

    @property
    def train_tuples(self):
        i = 0
        for loc in self.train_locs:
            gold_tuples = read_json_file(loc)
            for item in gold_tuples:
                yield item
                i += len(item[1])
                if self.limit and i >= self.limit:
                    break

    @property
    def dev_tuples(self):
        i = 0
        for loc in self.dev_locs:
            gold_tuples = read_json_file(loc)
            for item in gold_tuples:
                yield item
                i += len(item[1])
                if self.limit and i >= self.limit:
                    break

    def count_train(self):
        n = 0
        i = 0
        for raw_text, paragraph_tuples in self.train_tuples:
            n += sum([len(s[0][1]) for s in paragraph_tuples])
            if self.limit and i >= self.limit:
                break
            i += len(paragraph_tuples)
        return n

    def train_docs(self, nlp, gold_preproc=False,
                   projectivize=False, max_length=None,
                   noise_level=0.0):
        train_tuples = self.train_tuples
        if projectivize:
            train_tuples = nonproj.preprocess_training_data(
                self.train_tuples, label_freq_cutoff=100)
        # train_tuples may be a generator, so materialise it before shuffling
        train_tuples = list(train_tuples)
        random.shuffle(train_tuples)
        gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
                                        max_length=max_length,
                                        noise_level=noise_level)
        yield from gold_docs

    def dev_docs(self, nlp, gold_preproc=False):
        gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc)
        yield from gold_docs

    @classmethod
    def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None,
                       noise_level=0.0):
        for raw_text, paragraph_tuples in tuples:
            if gold_preproc:
                raw_text = None
            else:
                paragraph_tuples = merge_sents(paragraph_tuples)
            docs = cls._make_docs(nlp, raw_text, paragraph_tuples,
                                  gold_preproc, noise_level=noise_level)
            golds = cls._make_golds(docs, paragraph_tuples)
            for doc, gold in zip(docs, golds):
                if (not max_length) or len(doc) < max_length:
                    yield doc, gold

    @classmethod
    def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc,
                   noise_level=0.0):
        if raw_text is not None:
            raw_text = add_noise(raw_text, noise_level)
            return [nlp.make_doc(raw_text)]
        else:
            return [Doc(nlp.vocab,
                        words=add_noise(sent_tuples[1], noise_level))
                    for (sent_tuples, brackets) in paragraph_tuples]

    @classmethod
    def _make_golds(cls, docs, paragraph_tuples):
        if len(docs) != len(paragraph_tuples):
            raise ValueError(Errors.E070.format(n_docs=len(docs),
                                                n_annots=len(paragraph_tuples)))
        if len(docs) == 1:
            return [GoldParse.from_annot_tuples(docs[0],
                                                paragraph_tuples[0][0])]
        else:
            return [GoldParse.from_annot_tuples(doc, sent_tuples)
                    for doc, (sent_tuples, brackets)
                    in zip(docs, paragraph_tuples)]

    @staticmethod
    def walk_corpus(path):
        if not path.is_dir():
            return [path]
        paths = [path]
        locs = []
        seen = set()
        for path in paths:
            if str(path) in seen:
                continue
            seen.add(str(path))
            if path.parts[-1].startswith('.'):
                continue
            elif path.is_dir():
                paths.extend(path.iterdir())
            elif path.parts[-1].endswith('.json'):
                locs.append(path)
        return locs
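
# Illustrative use of GoldCorpus (paths and the nlp object are placeholders):
#   corpus = GoldCorpus('train.json', 'dev.json', limit=1000)
#   n_words = corpus.count_train()
#   for doc, gold in corpus.train_docs(nlp, projectivize=True):
#       ...  # doc is a Doc, gold is the matching GoldParse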

def add_noise(orig, noise_level):
    if random.random() >= noise_level:
        return orig
    elif type(orig) == list:
        corrupted = [_corrupt(word, noise_level) for word in orig]
        corrupted = [w for w in corrupted if w]
        return corrupted
    else:
        return ''.join(_corrupt(c, noise_level) for c in orig)

def _corrupt(c, noise_level):
    if random.random() >= noise_level:
        return c
    elif c == ' ':
        return '\n'
    elif c == '\n':
        return ' '
    elif c in ['.', "'", "!", "?"]:
        return ''
    else:
        return c.lower()
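
# The JSON training format consumed below looks, in outline, like this
# (field names as accessed by read_json_file; values are illustrative):
#   [{"paragraphs": [{"raw": "I like London.",
#                     "sentences": [{"tokens": [{"orth": "I", "tag": "PRP",
#                                                "head": 1, "dep": "nsubj",
#                                                "ner": "O"}, ...],
#                                    "brackets": []}]}]}]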

def read_json_file(loc, docs_filter=None, limit=None):
    loc = util.ensure_path(loc)
    if loc.is_dir():
        for filename in loc.iterdir():
            # iterdir() already yields paths prefixed with `loc`
            yield from read_json_file(filename, limit=limit)
    else:
        with loc.open('r', encoding='utf8') as file_:
            docs = ujson.load(file_)
        if limit is not None:
            docs = docs[:limit]
        for doc in docs:
            if docs_filter is not None and not docs_filter(doc):
                continue
            paragraphs = []
            for paragraph in doc['paragraphs']:
                sents = []
                for sent in paragraph['sentences']:
                    words = []
                    ids = []
                    tags = []
                    heads = []
                    labels = []
                    ner = []
                    for i, token in enumerate(sent['tokens']):
                        words.append(token['orth'])
                        ids.append(i)
                        tags.append(token.get('tag', '-'))
                        heads.append(token.get('head', 0) + i)
                        labels.append(token.get('dep', ''))
                        # Ensure ROOT label is case-insensitive
                        if labels[-1].lower() == 'root':
                            labels[-1] = 'ROOT'
                        ner.append(token.get('ner', '-'))
                    sents.append([
                        [ids, words, tags, heads, labels, ner],
                        sent.get('brackets', [])])
                if sents:
                    yield [paragraph.get('raw', None), sents]

def iob_to_biluo(tags):
    out = []
    curr_label = None
    tags = list(tags)
    while tags:
        out.extend(_consume_os(tags))
        out.extend(_consume_ent(tags))
    return out

def _consume_os(tags):
    while tags and tags[0] == 'O':
        yield tags.pop(0)

def _consume_ent(tags):
    if not tags:
        return []
    tag = tags.pop(0)
    target_in = 'I' + tag[1:]
    target_last = 'L' + tag[1:]
    length = 1
    while tags and tags[0] in {target_in, target_last}:
        length += 1
        tags.pop(0)
    label = tag[2:]
    if length == 1:
        return ['U-' + label]
    else:
        start = 'B-' + label
        end = 'L-' + label
        middle = ['I-%s' % label for _ in range(1, length - 1)]
        return [start] + middle + [end]
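
# Illustrative conversion: iob_to_biluo rewrites IOB tags into the BILUO
# scheme used elsewhere in this module, e.g.
#   iob_to_biluo(['B-ORG', 'I-ORG', 'O', 'B-PER'])
#   -> ['B-ORG', 'L-ORG', 'O', 'U-PER']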

cdef class GoldParse:
    """Collection for training annotations."""
    @classmethod
    def from_annot_tuples(cls, doc, annot_tuples, make_projective=False):
        _, words, tags, heads, deps, entities = annot_tuples
        return cls(doc, words=words, tags=tags, heads=heads, deps=deps,
                   entities=entities, make_projective=make_projective)

    def __init__(self, doc, annot_tuples=None, words=None, tags=None,
                 heads=None, deps=None, entities=None, make_projective=False,
                 cats=None):
        """Create a GoldParse.

        doc (Doc): The document the annotations refer to.
        words (iterable): A sequence of unicode word strings.
        tags (iterable): A sequence of strings, representing tag annotations.
        heads (iterable): A sequence of integers, representing syntactic
            head offsets.
        deps (iterable): A sequence of strings, representing the syntactic
            relation types.
        entities (iterable): A sequence of named entity annotations, either as
            BILUO tag strings, or as `(start_char, end_char, label)` tuples,
            representing the entity positions.
        cats (dict): Labels for text classification. Each key in the dictionary
            may be a string or an int, or a `(start_char, end_char, label)`
            tuple, indicating that the label is applied to only part of the
            document (usually a sentence). Unlike entity annotations, label
            annotations can overlap, i.e. a single word can be covered by
            multiple labelled spans. The TextCategorizer component expects
            true examples of a label to have the value 1.0, and negative
            examples of a label to have the value 0.0. Labels not in the
            dictionary are treated as missing - the gradient for those labels
            will be zero.
        RETURNS (GoldParse): The newly constructed object.
        """
        if words is None:
            words = [token.text for token in doc]
        if tags is None:
            tags = [None for _ in doc]
        if heads is None:
            heads = [None for token in doc]
        if deps is None:
            deps = [None for _ in doc]
        if entities is None:
            entities = [None for _ in doc]
        elif len(entities) == 0:
            entities = ['O' for _ in doc]
        elif not isinstance(entities[0], basestring):
            # Assume we have entities specified by character offset.
            entities = biluo_tags_from_offsets(doc, entities)
        self.mem = Pool()
        self.loss = 0
        self.length = len(doc)
        # These are filled by the tagger/parser/entity recogniser
        self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int))
        self.c.heads = <int*>self.mem.alloc(len(doc), sizeof(int))
        self.c.labels = <attr_t*>self.mem.alloc(len(doc), sizeof(attr_t))
        self.c.has_dep = <int*>self.mem.alloc(len(doc), sizeof(int))
        self.c.sent_start = <int*>self.mem.alloc(len(doc), sizeof(int))
        self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))
        self.cats = {} if cats is None else dict(cats)
        self.words = [None] * len(doc)
        self.tags = [None] * len(doc)
        self.heads = [None] * len(doc)
        self.labels = [None] * len(doc)
        self.ner = [None] * len(doc)
        self.cand_to_gold = align([t.orth_ for t in doc], words)
        self.gold_to_cand = align(words, [t.orth_ for t in doc])
        annot_tuples = (range(len(words)), words, tags, heads, deps, entities)
        self.orig_annot = list(zip(*annot_tuples))
        for i, gold_i in enumerate(self.cand_to_gold):
            if doc[i].text.isspace():
                self.words[i] = doc[i].text
                self.tags[i] = '_SP'
                self.heads[i] = None
                self.labels[i] = None
                self.ner[i] = 'O'
            if gold_i is None:
                pass
            else:
                self.words[i] = words[gold_i]
                self.tags[i] = tags[gold_i]
                if heads[gold_i] is None:
                    self.heads[i] = None
                else:
                    self.heads[i] = self.gold_to_cand[heads[gold_i]]
                self.labels[i] = deps[gold_i]
                self.ner[i] = entities[gold_i]
        cycle = nonproj.contains_cycle(self.heads)
        if cycle is not None:
            raise ValueError(Errors.E069.format(cycle=cycle))
        if make_projective:
            proj_heads, _ = nonproj.projectivize(self.heads, self.labels)
            self.heads = proj_heads

    def __len__(self):
        """Get the number of gold-standard tokens.

        RETURNS (int): The number of gold-standard tokens.
        """
        return self.length

    @property
    def is_projective(self):
        """Whether the provided syntactic annotations form a projective
        dependency tree.
        """
        return not nonproj.is_nonproj_tree(self.heads)

    @property
    def sent_starts(self):
        return [self.c.sent_start[i] for i in range(self.length)]
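
# Illustrative use of GoldParse (the nlp object and offsets are placeholders,
# assuming 'I like London.' tokenizes as ['I', 'like', 'London', '.']):
#   doc = nlp.make_doc('I like London.')
#   gold = GoldParse(doc, entities=[(7, 13, 'LOC')])
#   assert gold.ner == ['O', 'O', 'U-LOC', 'O']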

def biluo_tags_from_offsets(doc, entities, missing='O'):
    """Encode labelled spans into per-token tags, using the
    Begin/In/Last/Unit/Out scheme (BILUO).

    doc (Doc): The document that the entity offsets refer to. The output tags
        will refer to the token boundaries within the document.
    entities (iterable): A sequence of `(start, end, label)` triples. `start`
        and `end` should be character-offset integers denoting the slice into
        the original string.
    RETURNS (list): A list of unicode strings, describing the tags. Each tag
        string will be of the form either "", "O" or "{action}-{label}", where
        action is one of "B", "I", "L", "U". The string "-" is used where the
        entity offsets don't align with the tokenization in the `Doc` object.
        The training algorithm will view these as missing values. "O" denotes a
        non-entity token. "B" denotes the beginning of a multi-token entity,
        "I" the inside of an entity of three or more tokens, and "L" the end
        of an entity of two or more tokens. "U" denotes a single-token entity.

    EXAMPLE:
        >>> text = 'I like London.'
        >>> entities = [(len('I like '), len('I like London'), 'LOC')]
        >>> doc = nlp.tokenizer(text)
        >>> tags = biluo_tags_from_offsets(doc, entities)
        >>> assert tags == ['O', 'O', 'U-LOC', 'O']
    """
    starts = {token.idx: token.i for token in doc}
    ends = {token.idx+len(token): token.i for token in doc}
    biluo = ['-' for _ in doc]
    # Handle entity cases
    for start_char, end_char, label in entities:
        start_token = starts.get(start_char)
        end_token = ends.get(end_char)
        # Only interested if the tokenization is correct
        if start_token is not None and end_token is not None:
            if start_token == end_token:
                biluo[start_token] = 'U-%s' % label
            else:
                biluo[start_token] = 'B-%s' % label
                for i in range(start_token+1, end_token):
                    biluo[i] = 'I-%s' % label
                biluo[end_token] = 'L-%s' % label
    # Now distinguish the O cases from ones where we miss the tokenization
    entity_chars = set()
    for start_char, end_char, label in entities:
        for i in range(start_char, end_char):
            entity_chars.add(i)
    for token in doc:
        for i in range(token.idx, token.idx+len(token)):
            if i in entity_chars:
                break
        else:
            biluo[token.i] = missing
    return biluo

def offsets_from_biluo_tags(doc, tags):
    """Encode per-token tags following the BILUO scheme into entity offsets.

    doc (Doc): The document that the BILUO tags refer to.
    tags (iterable): A sequence of BILUO tags, with each tag describing one
        token. Each tag string will be of the form of either "", "O" or
        "{action}-{label}", where action is one of "B", "I", "L", "U".
    RETURNS (list): A sequence of `(start, end, label)` triples. `start` and
        `end` will be character-offset integers denoting the slice into the
        original string.
    """
    token_offsets = tags_to_entities(tags)
    offsets = []
    for label, start_idx, end_idx in token_offsets:
        span = doc[start_idx : end_idx + 1]
        offsets.append((span.start_char, span.end_char, label))
    return offsets
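
# Illustrative round trip with the example from biluo_tags_from_offsets above
# (the doc is a placeholder tokenized as ['I', 'like', 'London', '.']):
#   offsets_from_biluo_tags(doc, ['O', 'O', 'U-LOC', 'O'])
#   -> [(7, 13, 'LOC')]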

def is_punct_label(label):
    return label == 'P' or label.lower() == 'punct'