alpcentaur
/
basabuuka_prototyp


								# cython: profile=True

								# coding: utf8

								from __future__ import unicode_literals, print_function


								import re

								import ujson

								import random

								import cytoolz

								import itertools


								from .syntax import nonproj

								from .tokens import Doc

								from .errors import Errors

								from . import util

								from .util import minibatch


								def tags_to_entities(tags):
								    """Collect entity spans from a sequence of BILUO tags.

								    tags (list): Per-token tag strings. `None` and '-' entries are skipped.
								    RETURNS (list): `(label, start, end)` triples with inclusive token
								        indices, in the order the entities are closed.
								    RAISES (ValueError): For an 'I' tag without an open entity (E067) or
								        for a tag with an unrecognised prefix (E068).
								    """
								    spans = []
								    open_at = None
								    for pos, tag in enumerate(tags):
								        # Missing annotations carry no span information.
								        if tag is None or tag == '-':
								            continue
								        if tag.startswith('O'):
								            # TODO: We shouldn't be getting these malformed inputs. Fix this.
								            # An outside tag silently abandons any entity still open.
								            open_at = None
								            continue
								        if tag.startswith('I'):
								            if open_at is None:
								                raise ValueError(Errors.E067.format(tags=tags[:pos+1]))
								            continue
								        label = tag[2:]
								        if tag.startswith('U'):
								            spans.append((label, pos, pos))
								        elif tag.startswith('B'):
								            open_at = pos
								        elif tag.startswith('L'):
								            spans.append((label, open_at, pos))
								            open_at = None
								        else:
								            raise ValueError(Errors.E068.format(tag=tag))
								    return spans


								def merge_sents(sents):
								    """Concatenate per-sentence annotations into one document-level entry.

								    sents (iterable): Items of the form
								        `((ids, words, tags, heads, labels, ner), brackets)`.
								    RETURNS (list): A one-element list holding `(columns, brackets)`, where
								        ids, heads and bracket boundaries have been shifted by the number
								        of tokens in the preceding sentences.
								    """
								    columns = [[] for _ in range(6)]
								    merged_brackets = []
								    offset = 0
								    for (ids, words, tags, heads, labels, ner), brackets in sents:
								        shifted = (
								            [id_ + offset for id_ in ids],
								            words,
								            tags,
								            [head + offset for head in heads],
								            labels,
								            ner,
								        )
								        for column, values in zip(columns, shifted):
								            column.extend(values)
								        for bracket in brackets:
								            merged_brackets.append((bracket['first'] + offset,
								                                    bracket['last'] + offset,
								                                    bracket['label']))
								        offset += len(ids)
								    return [(columns, merged_brackets)]


								def align(cand_words, gold_words):
								    """Map each candidate word to the index of its matching gold word.

								    cand_words (list): The candidate tokenization.
								    gold_words (list): The gold-standard tokenization.
								    RETURNS (list): One entry per 'M', 'S' or 'D' step of the edit path:
								        the gold index for a match, `None` otherwise.
								    RAISES (Exception): If the edit path contains an unknown operation.
								    """
								    _, path = _min_edit_path(cand_words, gold_words)
								    mapping = []
								    gold_pos = 0
								    for op in path:
								        if op not in ('M', 'S', 'D', 'I'):
								            raise Exception(op)
								        # 'I' consumes a gold word only; every other step emits an entry.
								        if op == 'M':
								            mapping.append(gold_pos)
								        elif op != 'I':
								            mapping.append(None)
								        # 'D' emits an entry without advancing through the gold words.
								        if op != 'D':
								            gold_pos += 1
								    return mapping


								# Matches any single non-word character. _min_edit_path uses it to strip
								# such characters from words before comparing them.
								punct_re = re.compile(r'\W')


								def _min_edit_path(cand_words, gold_words):
								    """Compute a minimum-cost edit path between two word sequences.

								    Words are compared after removing `punct_re` matches and lower-casing.

								    cand_words (list): The candidate words.
								    gold_words (list): The gold-standard words.
								    RETURNS (tuple): `(cost, path)`. `path` is a string with one character
								        per edit step: 'M' (match), 'S' (substitute), 'I' (insert, i.e.
								        advance in the gold words only) and 'D' (delete, i.e. advance in
								        the candidate words only).
								    """
								    cdef:
								        Pool mem
								        int i, j, n_cand, n_gold
								        int* curr_costs
								        int* prev_costs

								    # TODO: Fix this --- just do it properly, make the full edit matrix and
								    # then walk back over it...
								    # Preprocess inputs
								    cand_words = [punct_re.sub('', w).lower() for w in cand_words]
								    gold_words = [punct_re.sub('', w).lower() for w in gold_words]

								    # Fast path: identical sequences are all matches at zero cost.
								    if cand_words == gold_words:
								        return 0, 'M' * len(gold_words)
								    mem = Pool()
								    n_cand = len(cand_words)
								    n_gold = len(gold_words)
								    # Levenshtein distance, except we need the history, and we may want
								    # different costs. Mark operations with a string, and score the history
								    # using _edit_cost.
								    previous_row = []
								    prev_costs = <int*>mem.alloc(n_gold + 1, sizeof(int))
								    curr_costs = <int*>mem.alloc(n_gold + 1, sizeof(int))
								    # First row: reaching gold position i from nothing takes i inserts.
								    for i in range(n_gold + 1):
								        previous_row.append('I' * i)
								        prev_costs[i] = i
								    for i, cand in enumerate(cand_words):
								        # First column: dropping the first i+1 candidates takes i+1 deletes.
								        current_row = ['D' * (i + 1)]
								        curr_costs[0] = i+1
								        for j, gold in enumerate(gold_words):
								            if gold.lower() == cand.lower():
								                s_cost = prev_costs[j]
								                i_cost = curr_costs[j] + 1
								                d_cost = prev_costs[j + 1] + 1
								            else:
								                s_cost = prev_costs[j] + 1
								                i_cost = curr_costs[j] + 1
								                # Deleting a candidate that is empty after preprocessing
								                # is free.
								                d_cost = prev_costs[j + 1] + (1 if cand else 0)

								            # Ties are resolved in the order: substitute/match, insert,
								            # delete.
								            if s_cost <= i_cost and s_cost <= d_cost:
								                best_cost = s_cost
								                best_hist = previous_row[j] + ('M' if gold == cand else 'S')
								            elif i_cost <= s_cost and i_cost <= d_cost:
								                best_cost = i_cost
								                best_hist = current_row[j] + 'I'
								            else:
								                best_cost = d_cost
								                best_hist = previous_row[j + 1] + 'D'

								            current_row.append(best_hist)
								            curr_costs[j+1] = best_cost
								        previous_row = current_row
								        # Roll the cost buffers: current becomes previous, current is reset.
								        for j in range(n_gold + 1):
								            prev_costs[j] = curr_costs[j]
								            curr_costs[j] = 0

								    return prev_costs[n_gold], previous_row[-1]


								class GoldCorpus(object):
								    """An annotated corpus, using the JSON file format. Manages
								    annotations for tagging, dependency parsing and NER."""
								    def __init__(self, train_path, dev_path, gold_preproc=True, limit=None):
								        """Create a GoldCorpus.

								        train_path (unicode or Path): File or directory of training data.
								        dev_path (unicode or Path): File or directory of development data.
								        gold_preproc (bool): Accepted but not used by the constructor.
								        limit (int): Optional cap used by `train_tuples`, `dev_tuples` and
								            `count_train`. A falsy value disables the cap.
								        RETURNS (GoldCorpus): The newly created object.
								        """
								        self.train_path = util.ensure_path(train_path)
								        self.dev_path = util.ensure_path(dev_path)
								        self.limit = limit
								        self.train_locs = self.walk_corpus(self.train_path)
								        self.dev_locs = self.walk_corpus(self.dev_path)

								    @staticmethod
								    def _take_until_limit(items, limit):
								        """Yield items until the running total of `len(item[1])` hits limit.

								        items (iterable): Items whose second element is a sized collection.
								        limit (int): Stop once the running total is >= limit. A falsy value
								            means no cap.
								        YIELDS: The items, in order. The item that reaches the limit is
								            still yielded.
								        """
								        n_seen = 0
								        for item in items:
								            yield item
								            n_seen += len(item[1])
								            if limit and n_seen >= limit:
								                # Stop the whole stream, not just the current file.
								                return

								    @property
								    def train_tuples(self):
								        """Lazily yield the items of every training file, capped by limit."""
								        items = itertools.chain.from_iterable(
								            read_json_file(loc) for loc in self.train_locs)
								        yield from self._take_until_limit(items, self.limit)

								    @property
								    def dev_tuples(self):
								        """Lazily yield the items of every development file, capped by limit."""
								        items = itertools.chain.from_iterable(
								            read_json_file(loc) for loc in self.dev_locs)
								        yield from self._take_until_limit(items, self.limit)

								    def count_train(self):
								        """Count the words in the training data.

								        RETURNS (int): The summed number of words over the sentences of the
								            training items that were visited.
								        """
								        n = 0
								        i = 0
								        for raw_text, paragraph_tuples in self.train_tuples:
								            n += sum([len(s[0][1]) for s in paragraph_tuples])
								            if self.limit and i >= self.limit:
								                break
								            i += len(paragraph_tuples)
								        return n

								    def train_docs(self, nlp, gold_preproc=False,
								                   projectivize=False, max_length=None,
								                   noise_level=0.0):
								        """Yield shuffled `(doc, gold)` pairs for the training data.

								        nlp: Object providing `make_doc` and `vocab`.
								        gold_preproc (bool): Build one doc per gold sentence instead of
								            one doc from the raw text.
								        projectivize (bool): Pass the data through
								            `nonproj.preprocess_training_data` first.
								        max_length (int): If set, skip docs with `len(doc) >= max_length`.
								        noise_level (float): Forwarded to `add_noise`.
								        YIELDS (tuple): `(doc, gold)` pairs.
								        """
								        if projectivize:
								            train_tuples = nonproj.preprocess_training_data(
								                self.train_tuples, label_freq_cutoff=100)
								        else:
								            train_tuples = self.train_tuples
								        # random.shuffle needs a mutable sequence; `train_tuples` may be a
								        # generator here, so materialise it first.
								        train_tuples = list(train_tuples)
								        random.shuffle(train_tuples)
								        gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
								                                        max_length=max_length,
								                                        noise_level=noise_level)
								        yield from gold_docs

								    def dev_docs(self, nlp, gold_preproc=False):
								        """Yield `(doc, gold)` pairs for the development data, unshuffled."""
								        gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc)
								        yield from gold_docs

								    @classmethod
								    def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None,
								                       noise_level=0.0):
								        """Turn `(raw_text, paragraph_tuples)` items into `(doc, gold)` pairs.

								        With `gold_preproc` the raw text is discarded and one doc is made
								        per sentence; otherwise the sentences are merged into one entry.
								        Docs with `len(doc) >= max_length` are skipped when max_length is
								        set.
								        """
								        for raw_text, paragraph_tuples in tuples:
								            if gold_preproc:
								                raw_text = None
								            else:
								                paragraph_tuples = merge_sents(paragraph_tuples)
								            docs = cls._make_docs(nlp, raw_text, paragraph_tuples,
								                                  gold_preproc, noise_level=noise_level)
								            golds = cls._make_golds(docs, paragraph_tuples)
								            for doc, gold in zip(docs, golds):
								                if (not max_length) or len(doc) < max_length:
								                    yield doc, gold

								    @classmethod
								    def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc,
								                   noise_level=0.0):
								        """Build docs from the raw text if given, else one per sentence."""
								        if raw_text is not None:
								            raw_text = add_noise(raw_text, noise_level)
								            return [nlp.make_doc(raw_text)]
								        else:
								            return [Doc(nlp.vocab,
								                        words=add_noise(sent_tuples[1], noise_level))
								                    for (sent_tuples, brackets) in paragraph_tuples]

								    @classmethod
								    def _make_golds(cls, docs, paragraph_tuples):
								        """Build one GoldParse per doc.

								        RAISES (ValueError): If the number of docs and annotation entries
								            differ (E070).
								        """
								        if len(docs) != len(paragraph_tuples):
								            raise ValueError(Errors.E070.format(n_docs=len(docs),
								                                                n_annots=len(paragraph_tuples)))
								        if len(docs) == 1:
								            return [GoldParse.from_annot_tuples(docs[0],
								                                                paragraph_tuples[0][0])]
								        else:
								            return [GoldParse.from_annot_tuples(doc, sent_tuples)
								                    for doc, (sent_tuples, brackets)
								                    in zip(docs, paragraph_tuples)]

								    @staticmethod
								    def walk_corpus(path):
								        """Collect the `.json` files below a path.

								        path (Path): A file or directory.
								        RETURNS (list): `[path]` if path is not a directory; otherwise all
								            `.json` files found below it, skipping any entry whose name
								            starts with '.'.
								        """
								        if not path.is_dir():
								            return [path]
								        # `paths` is a worklist: it is deliberately extended while being
								        # iterated so that sub-directories are visited too.
								        paths = [path]
								        locs = []
								        seen = set()
								        for path in paths:
								            if str(path) in seen:
								                continue
								            seen.add(str(path))
								            if path.parts[-1].startswith('.'):
								                continue
								            elif path.is_dir():
								                paths.extend(path.iterdir())
								            elif path.parts[-1].endswith('.json'):
								                locs.append(path)
								        return locs


								def add_noise(orig, noise_level):
								    """Randomly corrupt a text or a list of words.

								    orig (unicode or list): The raw text, or a list of word strings.
								    noise_level (float): A single random draw decides whether any
								        corruption is attempted; each element is then passed to `_corrupt`
								        with the same level.
								    RETURNS (unicode or list): `orig` itself when the first draw is
								        >= noise_level. Otherwise a new list with corrupted words (empty
								        results dropped), or a new string of corrupted characters.
								    """
								    if random.random() >= noise_level:
								        return orig
								    if type(orig) is list:
								        noisy_words = (_corrupt(word, noise_level) for word in orig)
								        return [word for word in noisy_words if word]
								    return ''.join(_corrupt(char, noise_level) for char in orig)


								def _corrupt(c, noise_level):
								    """Corrupt a single character or word with probability noise_level.

								    c (unicode): The string to corrupt.
								    noise_level (float): `c` is returned unchanged when a random draw is
								        >= this value.
								    RETURNS (unicode): `c` unchanged, or: a space swapped for a newline and
								        vice versa, an empty string for '.', "'", '!' and '?', and the
								        lower-cased string for anything else.
								    """
								    if random.random() >= noise_level:
								        return c
								    replacements = {
								        ' ': '\n',
								        '\n': ' ',
								        '.': '',
								        "'": '',
								        '!': '',
								        '?': '',
								    }
								    if c in replacements:
								        return replacements[c]
								    return c.lower()


								def read_json_file(loc, docs_filter=None, limit=None):
								    """Read training data in the JSON format.

								    loc (unicode or Path): A JSON file, or a directory whose entries are
								        read recursively.
								    docs_filter (callable): Optional predicate; documents for which it
								        returns a falsy value are skipped.
								    limit (int): Optional maximum number of documents taken from each
								        file.
								    YIELDS (list): One `[raw_text, sents]` pair per non-empty paragraph.
								        Each entry of `sents` is
								        `[[ids, words, tags, heads, labels, ner], brackets]`.
								    """
								    loc = util.ensure_path(loc)
								    if loc.is_dir():
								        # iterdir() already yields paths that include `loc`, so they must
								        # not be joined onto `loc` again. The filter is forwarded so that
								        # it also applies to files found inside directories.
								        for child in loc.iterdir():
								            yield from read_json_file(child, docs_filter=docs_filter,
								                                      limit=limit)
								        return
								    with loc.open('r', encoding='utf8') as file_:
								        docs = ujson.load(file_)
								    if limit is not None:
								        docs = docs[:limit]
								    for doc in docs:
								        if docs_filter is not None and not docs_filter(doc):
								            continue
								        for paragraph in doc['paragraphs']:
								            sents = []
								            for sent in paragraph['sentences']:
								                words = []
								                ids = []
								                tags = []
								                heads = []
								                labels = []
								                ner = []
								                for i, token in enumerate(sent['tokens']):
								                    words.append(token['orth'])
								                    ids.append(i)
								                    tags.append(token.get('tag', '-'))
								                    # The stored head is an offset from the token's own
								                    # position; adding i turns it into an index.
								                    heads.append(token.get('head', 0) + i)
								                    label = token.get('dep', '')
								                    # Ensure ROOT label is case-insensitive
								                    if label.lower() == 'root':
								                        label = 'ROOT'
								                    labels.append(label)
								                    ner.append(token.get('ner', '-'))
								                sents.append([
								                    [ids, words, tags, heads, labels, ner],
								                    sent.get('brackets', [])])
								            if sents:
								                yield [paragraph.get('raw', None), sents]


								def iob_to_biluo(tags):
								    """Convert a sequence of IOB tags into BILUO tags.

								    tags (iterable): The IOB tag strings. The input is copied, not
								        modified.
								    RETURNS (list): The converted tags.
								    """
								    remaining = list(tags)
								    converted = []
								    # Alternate between draining a run of 'O' tags and one entity until
								    # the working copy is empty.
								    while remaining:
								        converted += _consume_os(remaining)
								        converted += _consume_ent(remaining)
								    return converted


								def _consume_os(tags):
								    """Pop and yield the leading run of 'O' tags from the list `tags`."""
								    while tags:
								        if tags[0] != 'O':
								            break
								        yield tags.pop(0)


								def _consume_ent(tags):
								    """Pop one entity from the front of `tags` and return its BILUO tags.

								    tags (list): Remaining tags; the consumed entries are removed in place.
								    RETURNS (list): `[]` for empty input, a single 'U-' tag for a
								        one-token entity, otherwise 'B-', zero or more 'I-' and 'L-' tags.
								    """
								    if not tags:
								        return []
								    first = tags.pop(0)
								    suffix = first[1:]
								    # Tags that continue the entity opened by `first`.
								    continuations = {'I' + suffix, 'L' + suffix}
								    n_tokens = 1
								    while tags and tags[0] in continuations:
								        tags.pop(0)
								        n_tokens += 1
								    label = first[2:]
								    if n_tokens == 1:
								        return ['U-' + label]
								    inside = ['I-%s' % label] * (n_tokens - 2)
								    return ['B-' + label] + inside + ['L-' + label]


								cdef class GoldParse:

								    """Collection for training annotations."""

								    # Alternate constructor: unpacks a 6-tuple of annotation columns,
								    # discards the first one and forwards the rest as keyword arguments.
								    @classmethod

								    def from_annot_tuples(cls, doc, annot_tuples, make_projective=False):

								        _, words, tags, heads, deps, entities = annot_tuples

								        return cls(doc, words=words, tags=tags, heads=heads, deps=deps,

								                   entities=entities, make_projective=make_projective)


								    def __init__(self, doc, annot_tuples=None, words=None, tags=None,

								                 heads=None, deps=None, entities=None, make_projective=False,

								                 cats=None):

								        """Create a GoldParse.


								        doc (Doc): The document the annotations refer to.

								        annot_tuples: Not read; the name is rebound below to a tuple built
								            from the other arguments.

								        words (iterable): A sequence of unicode word strings.

								        tags (iterable): A sequence of strings, representing tag annotations.

								        heads (iterable): A sequence of integers, representing syntactic

								            head offsets.

								        deps (iterable): A sequence of strings, representing the syntactic

								            relation types.

								        entities (iterable): A sequence of named entity annotations, either as

								            BILUO tag strings, or as `(start_char, end_char, label)` tuples,

								            representing the entity positions.

								        make_projective (bool): If true, the heads are replaced by the
								            first value returned from `nonproj.projectivize`.

								        cats (dict): Labels for text classification. Each key in the dictionary

								            may be a string or an int, or a `(start_char, end_char, label)`

								            tuple, indicating that the label is applied to only part of the

								            document (usually a sentence). Unlike entity annotations, label

								            annotations can overlap, i.e. a single word can be covered by

								            multiple labelled spans. The TextCategorizer component expects

								            true examples of a label to have the value 1.0, and negative

								            examples of a label to have the value 0.0. Labels not in the

								            dictionary are treated as missing - the gradient for those labels

								            will be zero.

								        RETURNS (GoldParse): The newly constructed object.

								        RAISES (ValueError): If the aligned heads contain a cycle (E069).

								        """

								        # Missing annotation layers default to one None per token in `doc`.
								        if words is None:

								            words = [token.text for token in doc]

								        if tags is None:

								            tags = [None for _ in doc]

								        if heads is None:

								            heads = [None for token in doc]

								        if deps is None:

								            deps = [None for _ in doc]

								        if entities is None:

								            entities = [None for _ in doc]

								        # An empty sequence means every token is tagged 'O'.
								        elif len(entities) == 0:

								            entities = ['O' for _ in doc]

								        elif not isinstance(entities[0], basestring):

								            # Assume we have entities specified by character offset.

								            entities = biluo_tags_from_offsets(doc, entities)


								        self.mem = Pool()

								        self.loss = 0

								        self.length = len(doc)


								        # These are filled by the tagger/parser/entity recogniser

								        # Each buffer has one slot per token in `doc` and is allocated
								        # from self.mem.
								        self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int))

								        self.c.heads = <int*>self.mem.alloc(len(doc), sizeof(int))

								        self.c.labels = <attr_t*>self.mem.alloc(len(doc), sizeof(attr_t))

								        self.c.has_dep = <int*>self.mem.alloc(len(doc), sizeof(int))

								        self.c.sent_start = <int*>self.mem.alloc(len(doc), sizeof(int))

								        self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))


								        # Python-level annotations, one entry per token in `doc`.
								        self.cats = {} if cats is None else dict(cats)

								        self.words = [None] * len(doc)

								        self.tags = [None] * len(doc)

								        self.heads = [None] * len(doc)

								        self.labels = [None] * len(doc)

								        self.ner = [None] * len(doc)


								        # Alignments in both directions between the tokens of `doc` and the
								        # gold `words`, as computed by `align`.
								        self.cand_to_gold = align([t.orth_ for t in doc], words)

								        self.gold_to_cand = align(words, [t.orth_ for t in doc])


								        # Keep the annotations as given, one tuple per gold word.
								        annot_tuples = (range(len(words)), words, tags, heads, deps, entities)

								        self.orig_annot = list(zip(*annot_tuples))


								        for i, gold_i in enumerate(self.cand_to_gold):

								            # Whitespace tokens get fixed default annotations. This is a
								            # plain `if`, so an aligned whitespace token is overwritten
								            # by the gold values below.
								            if doc[i].text.isspace():

								                self.words[i] = doc[i].text

								                self.tags[i] = '_SP'

								                self.heads[i] = None

								                self.labels[i] = None

								                self.ner[i] = 'O'

								            # Tokens without an aligned gold word keep their current values.
								            if gold_i is None:

								                pass

								            else:

								                self.words[i] = words[gold_i]

								                self.tags[i] = tags[gold_i]

								                if heads[gold_i] is None:

								                    self.heads[i] = None

								                else:

								                    # Translate the head through gold_to_cand; the result
								                    # is None when that entry has no alignment.
								                    self.heads[i] = self.gold_to_cand[heads[gold_i]]

								                self.labels[i] = deps[gold_i]

								                self.ner[i] = entities[gold_i]


								        # Reject head annotations that contain a cycle.
								        cycle = nonproj.contains_cycle(self.heads)

								        if cycle is not None:

								            raise ValueError(Errors.E069.format(cycle=cycle))


								        if make_projective:

								            # Only the heads are replaced; the second return value of
								            # projectivize is discarded.
								            proj_heads, _ = nonproj.projectivize(self.heads, self.labels)

								            self.heads = proj_heads


								    def __len__(self):

								        """Get the number of gold-standard tokens.


								        RETURNS (int): The number of gold-standard tokens.

								        """

								        return self.length


								    @property

								    def is_projective(self):

								        """Whether the provided syntactic annotations form a projective

								        dependency tree.

								        """

								        return not nonproj.is_nonproj_tree(self.heads)


								    # Returns the C-level sent_start buffer as a Python list with one
								    # entry per token.
								    @property

								    def sent_starts(self):

								        return [self.c.sent_start[i] for i in range(self.length)]


								def biluo_tags_from_offsets(doc, entities, missing='O'):
								    """Encode labelled character spans as per-token BILUO tags
								    (Begin/In/Last/Unit/Out).

								    doc (Doc): The document that the entity offsets refer to. The output
								        tags follow its token boundaries.
								    entities (iterable): A sequence of `(start, end, label)` triples, where
								        `start` and `end` are character offsets into the original string.
								    missing (unicode): The tag given to tokens that no entity overlaps.
								    RETURNS (list): One tag string per token: `missing` for tokens outside
								        every entity, "{action}-{label}" with action "B", "I", "L" or "U"
								        for tokens of an entity that matches the token boundaries, and "-"
								        for tokens overlapped by an entity whose offsets do not match the
								        tokenization.

								    EXAMPLE:
								        >>> text = 'I like London.'
								        >>> entities = [(len('I like '), len('I like London'), 'LOC')]
								        >>> doc = nlp.tokenizer(text)
								        >>> tags = biluo_tags_from_offsets(doc, entities)
								        >>> assert tags == ['O', 'O', 'U-LOC', 'O']
								    """
								    token_at_start = {}
								    token_at_end = {}
								    for tok in doc:
								        token_at_start[tok.idx] = tok.i
								        token_at_end[tok.idx + len(tok)] = tok.i
								    tags = ['-' for _ in doc]
								    # First pass: tag entities whose boundaries coincide with tokens.
								    for start_char, end_char, label in entities:
								        first = token_at_start.get(start_char)
								        last = token_at_end.get(end_char)
								        if first is None or last is None:
								            continue
								        if first == last:
								            tags[first] = 'U-%s' % label
								            continue
								        tags[first] = 'B-%s' % label
								        for inner in range(first + 1, last):
								            tags[inner] = 'I-%s' % label
								        tags[last] = 'L-%s' % label
								    # Second pass: tokens sharing no character with any entity are marked
								    # as `missing`; overlapped but untagged tokens keep '-'.
								    covered = set()
								    for start_char, end_char, label in entities:
								        covered.update(range(start_char, end_char))
								    for tok in doc:
								        if covered.isdisjoint(range(tok.idx, tok.idx + len(tok))):
								            tags[tok.i] = missing
								    return tags


								def offsets_from_biluo_tags(doc, tags):
								    """Convert per-token BILUO tags into entity character offsets.

								    doc (Doc): The document that the BILUO tags refer to.
								    tags (iterable): A sequence of BILUO tags, one per token, as accepted
								        by `tags_to_entities`.
								    RETURNS (list): A sequence of `(start, end, label)` triples. `start`
								        and `end` are taken from the `start_char` and `end_char` of the
								        matching slice of `doc`.
								    """
								    # tags_to_entities gives inclusive token indices, hence the +1.
								    labelled_spans = ((label, doc[first:last + 1])
								                      for label, first, last in tags_to_entities(tags))
								    return [(span.start_char, span.end_char, label)
								            for label, span in labelled_spans]


								def is_punct_label(label):
								    """Return True if `label` is 'P' or is 'punct' in any letter case."""
								    if label == 'P':
								        return True
								    return label.lower() == 'punct'