laywerrobot/lib/python3.6/site-packages/nltk/parse/bllip.py

# Natural Language Toolkit: Interface to BLLIP Parser
#
# Author: David McClosky <dmcc@bigasterisk.com>
#
# Copyright (C) 2001-2018 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

from __future__ import print_function

from nltk.parse.api import ParserI
from nltk.tree import Tree

"""
Interface for parsing with BLLIP Parser. Requires the Python
bllipparser module. BllipParser objects can be constructed with the
``BllipParser.from_unified_model_dir`` class method or manually using the
``BllipParser`` constructor. The former is generally easier if you have
a BLLIP Parser unified model directory -- a basic model can be obtained
from NLTK's downloader. More unified parsing models can be obtained with
BLLIP Parser's ModelFetcher (run ``python -m bllipparser.ModelFetcher``
or see docs for ``bllipparser.ModelFetcher.download_and_install_model``).

Basic usage::

    # download and install a basic unified parsing model (Wall Street Journal)
    # sudo python -m nltk.downloader bllip_wsj_no_aux

    >>> from nltk.data import find
    >>> model_dir = find('models/bllip_wsj_no_aux').path
    >>> bllip = BllipParser.from_unified_model_dir(model_dir)

    # 1-best parsing
    >>> sentence1 = 'British left waffles on Falklands .'.split()
    >>> top_parse = bllip.parse_one(sentence1)
    >>> print(top_parse)
    (S1
      (S
        (NP (JJ British) (NN left))
        (VP (VBZ waffles) (PP (IN on) (NP (NNP Falklands))))
        (. .)))

    # n-best parsing
    >>> sentence2 = 'Time flies'.split()
    >>> all_parses = bllip.parse_all(sentence2)
    >>> print(len(all_parses))
    50
    >>> print(all_parses[0])
    (S1 (S (NP (NNP Time)) (VP (VBZ flies))))

    # incorporating external tagging constraints (None means unconstrained tag)
    >>> constrained1 = bllip.tagged_parse([('Time', 'VB'), ('flies', 'NNS')])
    >>> print(next(constrained1))
    (S1 (NP (VB Time) (NNS flies)))
    >>> constrained2 = bllip.tagged_parse([('Time', 'NN'), ('flies', None)])
    >>> print(next(constrained2))
    (S1 (NP (NN Time) (VBZ flies)))

References
----------

- Charniak, Eugene. "A maximum-entropy-inspired parser." Proceedings of
  the 1st North American chapter of the Association for Computational
  Linguistics conference. Association for Computational Linguistics,
  2000.

- Charniak, Eugene, and Mark Johnson. "Coarse-to-fine n-best parsing
  and MaxEnt discriminative reranking." Proceedings of the 43rd Annual
  Meeting on Association for Computational Linguistics. Association
  for Computational Linguistics, 2005.

Known issues
------------

Note that BLLIP Parser is not currently threadsafe. Since this module
uses a SWIG interface, it is potentially unsafe to create multiple
``BllipParser`` objects in the same process. BLLIP Parser currently
has issues with non-ASCII text and will raise an error if given any.

See http://pypi.python.org/pypi/bllipparser/ for more information
on BLLIP Parser's Python interface.
"""

__all__ = ['BllipParser']

# bllipparser is an optional dependency: importing this module has to work
# without it, so an import failure is only reported later, when
# _ensure_bllip_import_or_error() is called
try:
    from bllipparser import RerankingParser
    from bllipparser.RerankingParser import get_unified_model_parameters
except ImportError as ie:
    # the exception is captured as a default argument so that it is still
    # reachable after the except clause has finished
    def _ensure_bllip_import_or_error(ie=ie):
        """Raise an ImportError reporting why bllipparser is unavailable."""
        raise ImportError("Couldn't import bllipparser module: %s" % ie)
else:
    def _ensure_bllip_import_or_error():
        """Do nothing: bllipparser was imported successfully."""
        return None

def _ensure_ascii(words):
    """
    Check that every token in ``words`` is pure ASCII.

    Text tokens are checked by encoding them to ASCII and byte strings by
    decoding them, so the check works on Python 3 (where ``str`` has no
    ``decode`` method) as well as on Python 2.

    :param words: The tokens to check
    :type words: iter(str)
    :raises ValueError: if a token is not pure ASCII; the message gives
        the index and value of the first offending token.
    """
    try:
        for i, word in enumerate(words):
            if isinstance(word, bytes):
                word.decode('ascii')
            else:
                word.encode('ascii')
    except UnicodeError:
        # UnicodeError is the common base of UnicodeEncodeError and
        # UnicodeDecodeError, so both kinds of token end up here
        raise ValueError("Token %d (%r) is non-ASCII. BLLIP Parser "
                         "currently doesn't support non-ASCII inputs." %
                         (i, word))

def _scored_parse_to_nltk_tree(scored_parse):
    """
    Convert one BLLIP scored parse into an NLTK ``Tree``.

    The parse's ``ptb_parse`` attribute is rendered with ``str()`` and the
    resulting text is read back with ``Tree.fromstring``.

    :param scored_parse: an object with a ``ptb_parse`` attribute
    :rtype: Tree
    """
    bracketed_text = str(scored_parse.ptb_parse)
    return Tree.fromstring(bracketed_text)

class BllipParser(ParserI):
    """
    Interface for parsing with BLLIP Parser. BllipParser objects can be
    constructed with the ``BllipParser.from_unified_model_dir`` class
    method or manually using the ``BllipParser`` constructor.
    """
    def __init__(self, parser_model=None, reranker_features=None,
                 reranker_weights=None, parser_options=None,
                 reranker_options=None):
        """
        Load a BLLIP Parser model from scratch. You'll typically want to
        use the ``from_unified_model_dir()`` class method to construct
        this object.

        :param parser_model: Path to parser model directory
        :type parser_model: str

        :param reranker_features: Path to the reranker model's features file
        :type reranker_features: str

        :param reranker_weights: Path to the reranker model's weights file
        :type reranker_weights: str

        :param parser_options: optional dictionary of parser options; it
            is passed as keyword arguments to the ``load_parser_model()``
            method of ``bllipparser.RerankingParser.RerankingParser``, see
            the bllipparser documentation for more information.
        :type parser_options: dict(str)

        :param reranker_options: optional dictionary of reranker options;
            it is passed as keyword arguments to the
            ``load_reranker_model()`` method of
            ``bllipparser.RerankingParser.RerankingParser``, see the
            bllipparser documentation for more information.
        :type reranker_options: dict(str)

        :raises ImportError: if the ``bllipparser`` module could not be
            imported.
        """
        _ensure_bllip_import_or_error()

        parser_options = parser_options or {}
        reranker_options = reranker_options or {}

        self.rrp = RerankingParser()
        self.rrp.load_parser_model(parser_model, **parser_options)
        # the reranker is only loaded when both of its files are given
        if reranker_features and reranker_weights:
            self.rrp.load_reranker_model(features_filename=reranker_features,
                                         weights_filename=reranker_weights,
                                         **reranker_options)

    def parse(self, sentence):
        """
        Use BLLIP Parser to parse a sentence. Takes a sentence as a list
        of words; it will be automatically tagged with this BLLIP Parser
        instance's tagger.

        This method is a generator: the input is checked and parsed when
        the returned iterator is first advanced, not when the method is
        called.

        :return: An iterator that generates parse trees for the sentence
            from most likely to least likely.

        :param sentence: The sentence to be parsed
        :type sentence: list(str)
        :rtype: iter(Tree)
        """
        _ensure_ascii(sentence)
        nbest_list = self.rrp.parse(sentence)
        for scored_parse in nbest_list:
            yield _scored_parse_to_nltk_tree(scored_parse)

    def tagged_parse(self, word_and_tag_pairs):
        """
        Use BLLIP to parse a sentence. Takes a sentence as a list of
        (word, tag) tuples; the sentence must have already been tokenized
        and tagged. BLLIP will attempt to use the tags provided but may
        use others if it can't come up with a complete parse subject
        to those constraints. You may also specify a tag as ``None``
        to leave a token's tag unconstrained.

        This method is a generator: the input is checked and parsed when
        the returned iterator is first advanced, not when the method is
        called.

        :return: An iterator that generates parse trees for the sentence
            from most likely to least likely.

        :param word_and_tag_pairs: Input sentence to parse as (word, tag)
            pairs
        :type word_and_tag_pairs: list(tuple(str, str))
        :rtype: iter(Tree)
        """
        words = []
        # maps token index -> required tag; unconstrained tokens (tag is
        # None) get no entry
        tag_map = {}
        for i, (word, tag) in enumerate(word_and_tag_pairs):
            words.append(word)
            if tag is not None:
                tag_map[i] = tag

        _ensure_ascii(words)
        nbest_list = self.rrp.parse_tagged(words, tag_map)
        for scored_parse in nbest_list:
            yield _scored_parse_to_nltk_tree(scored_parse)

    @classmethod
    def from_unified_model_dir(cls, model_dir, parser_options=None,
                               reranker_options=None):
        """
        Create a ``BllipParser`` object from a unified parsing model
        directory. Unified parsing model directories are a standardized
        way of storing BLLIP parser and reranker models together on disk.
        See ``bllipparser.RerankingParser.get_unified_model_parameters()``
        for more information about unified model directories.

        :return: A ``BllipParser`` object using the parser and reranker
            models in the model directory.

        :param model_dir: Path to the unified model directory.
        :type model_dir: str
        :param parser_options: optional dictionary of parser options,
            handed unchanged to the ``BllipParser`` constructor.
        :type parser_options: dict(str)
        :param reranker_options: optional dictionary of reranker options,
            handed unchanged to the ``BllipParser`` constructor.
        :type reranker_options: dict(str)
        :rtype: BllipParser
        :raises ImportError: if the ``bllipparser`` module could not be
            imported.
        """
        # get_unified_model_parameters only exists when bllipparser was
        # imported; check first so that a missing module is reported as
        # an ImportError rather than as a NameError
        _ensure_bllip_import_or_error()

        (parser_model_dir, reranker_features_filename,
         reranker_weights_filename) = get_unified_model_parameters(model_dir)
        return cls(parser_model_dir, reranker_features_filename,
                   reranker_weights_filename, parser_options,
                   reranker_options)

def demo():
    """
    Print a few example parses.

    This assumes the Python module bllipparser is installed, together
    with a basic unified parsing model (Wall Street Journal)::

        sudo python -m nltk.downloader bllip_wsj_no_aux
    """
    from nltk.data import find

    model_dir = find('models/bllip_wsj_no_aux').path

    # a unified model directory is the easiest way to build a parser
    print('Loading BLLIP Parsing models...')
    bllip = BllipParser.from_unified_model_dir(model_dir)
    print('Done.')

    sentence1 = 'British left waffles on Falklands .'.split()
    sentence2 = 'I saw the man with the telescope .'.split()
    # this sentence is known to fail under the WSJ parsing model
    fail1 = '# ! ? : -'.split()

    # 1-best parsing: only the first tree of each sentence is printed
    for tokens in (sentence1, sentence2, fail1):
        print('Sentence: %r' % ' '.join(tokens))
        try:
            best = next(bllip.parse(tokens))
        except StopIteration:
            print("(parse failed)")
        else:
            print(best)

    # n-best parsing: every tree for the first sentence, in order
    for rank, candidate in enumerate(bllip.parse(sentence1)):
        print('parse %d:\n%s' % (rank, candidate))

    # external POS tag constraints; they don't have to make sense (though
    # on more complicated sentences, they may cause the parse to fail)
    constrained_examples = (
        ("forcing 'tree' to be 'NN':",
         [('A', None), ('tree', 'NN')]),
        ("forcing 'A' to be 'DT' and 'tree' to be 'NNP':",
         [('A', 'DT'), ('tree', 'NNP')]),
        ("forcing 'A' to be 'NNP':",
         [('A', 'NNP'), ('tree', None)]),
    )
    for label, tagged_tokens in constrained_examples:
        print(label, next(bllip.tagged_parse(tagged_tokens)))

def setup_module(module):
    """
    Skip this module's doctests when bllipparser cannot be imported.

    :param module: the module being set up (not used)
    :raises SkipTest: if ``_ensure_bllip_import_or_error()`` raises
        ``ImportError``.
    """
    from nose import SkipTest

    try:
        _ensure_bllip_import_or_error()
    except ImportError:
        skip_reason = ('doctests from nltk.parse.bllip are skipped because '
                       'the bllipparser module is not installed')
        raise SkipTest(skip_reason)
first commit 2020-08-27 21:55:39 +02:00			`# Natural Language Toolkit: Interface to BLLIP Parser`
			`#`
			`# Author: David McClosky <dmcc@bigasterisk.com>`
			`#`
			`# Copyright (C) 2001-2018 NLTK Project`
			`# URL: <http://nltk.org/>`
			`# For license information, see LICENSE.TXT`

			`from __future__ import print_function`

			`from nltk.parse.api import ParserI`
			`from nltk.tree import Tree`

			`"""`
			`Interface for parsing with BLLIP Parser. Requires the Python`
			`bllipparser module. BllipParser objects can be constructed with the`
			``BllipParser.from_unified_model_dir`` class method or manually using the
			``BllipParser`` constructor. The former is generally easier if you have
			`a BLLIP Parser unified model directory -- a basic model can be obtained`
			`from NLTK's downloader. More unified parsing models can be obtained with`
			BLLIP Parser's ModelFetcher (run ``python -m bllipparser.ModelFetcher``
			or see docs for ``bllipparser.ModelFetcher.download_and_install_model``).

			`Basic usage::`

			`# download and install a basic unified parsing model (Wall Street Journal)`
			`# sudo python -m nltk.downloader bllip_wsj_no_aux`

			`>>> from nltk.data import find`
			`>>> model_dir = find('models/bllip_wsj_no_aux').path`
			`>>> bllip = BllipParser.from_unified_model_dir(model_dir)`

			`# 1-best parsing`
			`>>> sentence1 = 'British left waffles on Falklands .'.split()`
			`>>> top_parse = bllip.parse_one(sentence1)`
			`>>> print(top_parse)`
			`(S1`
			`(S`
			`(NP (JJ British) (NN left))`
			`(VP (VBZ waffles) (PP (IN on) (NP (NNP Falklands))))`
			`(. .)))`

			`# n-best parsing`
			`>>> sentence2 = 'Time flies'.split()`
			`>>> all_parses = bllip.parse_all(sentence2)`
			`>>> print(len(all_parses))`
			`50`
			`>>> print(all_parses[0])`
			`(S1 (S (NP (NNP Time)) (VP (VBZ flies))))`

			`# incorporating external tagging constraints (None means unconstrained tag)`
			`>>> constrained1 = bllip.tagged_parse([('Time', 'VB'), ('flies', 'NNS')])`
			`>>> print(next(constrained1))`
			`(S1 (NP (VB Time) (NNS flies)))`
			`>>> constrained2 = bllip.tagged_parse([('Time', 'NN'), ('flies', None)])`
			`>>> print(next(constrained2))`
			`(S1 (NP (NN Time) (VBZ flies)))`

			`References`
			`----------`

			`- Charniak, Eugene. "A maximum-entropy-inspired parser." Proceedings of`
			`the 1st North American chapter of the Association for Computational`
			`Linguistics conference. Association for Computational Linguistics,`
			`2000.`

			`- Charniak, Eugene, and Mark Johnson. "Coarse-to-fine n-best parsing`
			`and MaxEnt discriminative reranking." Proceedings of the 43rd Annual`
			`Meeting on Association for Computational Linguistics. Association`
			`for Computational Linguistics, 2005.`

			`Known issues`
			`------------`

			`Note that BLLIP Parser is not currently threadsafe. Since this module`
			`uses a SWIG interface, it is potentially unsafe to create multiple`
			``BllipParser`` objects in the same process. BLLIP Parser currently
			`has issues with non-ASCII text and will raise an error if given any.`

			`See http://pypi.python.org/pypi/bllipparser/ for more information`
			`on BLLIP Parser's Python interface.`
			`"""`

			`__all__ = ['BllipParser']`

			`# this block allows this module to be imported even if bllipparser isn't`
			`# available`
			`try:`
			`from bllipparser import RerankingParser`
			`from bllipparser.RerankingParser import get_unified_model_parameters`

			`def _ensure_bllip_import_or_error():`
			`pass`
			`except ImportError as ie:`
			`def _ensure_bllip_import_or_error(ie=ie):`
			`raise ImportError("Couldn't import bllipparser module: %s" % ie)`

			`def _ensure_ascii(words):`
			`try:`
			`for i, word in enumerate(words):`
			`word.decode('ascii')`
			`except UnicodeDecodeError:`
			`raise ValueError("Token %d (%r) is non-ASCII. BLLIP Parser "`
			`"currently doesn't support non-ASCII inputs." %`
			`(i, word))`

			`def _scored_parse_to_nltk_tree(scored_parse):`
			`return Tree.fromstring(str(scored_parse.ptb_parse))`

			`class BllipParser(ParserI):`
			`"""`
			`Interface for parsing with BLLIP Parser. BllipParser objects can be`
			constructed with the ``BllipParser.from_unified_model_dir`` class
			method or manually using the ``BllipParser`` constructor.
			`"""`
			`def __init__(self, parser_model=None, reranker_features=None,`
			`reranker_weights=None, parser_options=None,`
			`reranker_options=None):`
			`"""`
			`Load a BLLIP Parser model from scratch. You'll typically want to`
			use the ``from_unified_model_dir()`` class method to construct
			`this object.`

			`:param parser_model: Path to parser model directory`
			`:type parser_model: str`

			`:param reranker_features: Path the reranker model's features file`
			`:type reranker_features: str`

			`:param reranker_weights: Path the reranker model's weights file`
			`:type reranker_weights: str`

			`:param parser_options: optional dictionary of parser options, see`
			``bllipparser.RerankingParser.RerankingParser.load_parser_options()``
			`for more information.`
			`:type parser_options: dict(str)`

			`:param reranker_options: optional`
			`dictionary of reranker options, see`
			``bllipparser.RerankingParser.RerankingParser.load_reranker_model()``
			`for more information.`
			`:type reranker_options: dict(str)`
			`"""`
			`_ensure_bllip_import_or_error()`

			`parser_options = parser_options or {}`
			`reranker_options = reranker_options or {}`

			`self.rrp = RerankingParser()`
			`self.rrp.load_parser_model(parser_model, **parser_options)`
			`if reranker_features and reranker_weights:`
			`self.rrp.load_reranker_model(features_filename=reranker_features,`
			`weights_filename=reranker_weights,`
			`**reranker_options)`

			`def parse(self, sentence):`
			`"""`
			`Use BLLIP Parser to parse a sentence. Takes a sentence as a list`
			`of words; it will be automatically tagged with this BLLIP Parser`
			`instance's tagger.`

			`:return: An iterator that generates parse trees for the sentence`
			`from most likely to least likely.`

			`:param sentence: The sentence to be parsed`
			`:type sentence: list(str)`
			`:rtype: iter(Tree)`
			`"""`
			`_ensure_ascii(sentence)`
			`nbest_list = self.rrp.parse(sentence)`
			`for scored_parse in nbest_list:`
			`yield _scored_parse_to_nltk_tree(scored_parse)`

			`def tagged_parse(self, word_and_tag_pairs):`
			`"""`
			`Use BLLIP to parse a sentence. Takes a sentence as a list of`
			`(word, tag) tuples; the sentence must have already been tokenized`
			`and tagged. BLLIP will attempt to use the tags provided but may`
			`use others if it can't come up with a complete parse subject`
			to those constraints. You may also specify a tag as ``None``
			`to leave a token's tag unconstrained.`

			`:return: An iterator that generates parse trees for the sentence`
			`from most likely to least likely.`

			`:param sentence: Input sentence to parse as (word, tag) pairs`
			`:type sentence: list(tuple(str, str))`
			`:rtype: iter(Tree)`
			`"""`
			`words = []`
			`tag_map = {}`
			`for i, (word, tag) in enumerate(word_and_tag_pairs):`
			`words.append(word)`
			`if tag is not None:`
			`tag_map[i] = tag`

			`_ensure_ascii(words)`
			`nbest_list = self.rrp.parse_tagged(words, tag_map)`
			`for scored_parse in nbest_list:`
			`yield _scored_parse_to_nltk_tree(scored_parse)`

			`@classmethod`
			`def from_unified_model_dir(this_class, model_dir, parser_options=None,`
			`reranker_options=None):`
			`"""`
			Create a ``BllipParser`` object from a unified parsing model
			`directory. Unified parsing model directories are a standardized`
			`way of storing BLLIP parser and reranker models together on disk.`
			See ``bllipparser.RerankingParser.get_unified_model_parameters()``
			`for more information about unified model directories.`

			:return: A ``BllipParser`` object using the parser and reranker
			`models in the model directory.`

			`:param model_dir: Path to the unified model directory.`
			`:type model_dir: str`
			`:param parser_options: optional dictionary of parser options, see`
			``bllipparser.RerankingParser.RerankingParser.load_parser_options()``
			`for more information.`
			`:type parser_options: dict(str)`
			`:param reranker_options: optional dictionary of reranker options, see`
			``bllipparser.RerankingParser.RerankingParser.load_reranker_model()``
			`for more information.`
			`:type reranker_options: dict(str)`
			`:rtype: BllipParser`
			`"""`
			`(parser_model_dir, reranker_features_filename,`
			`reranker_weights_filename) = get_unified_model_parameters(model_dir)`
			`return this_class(parser_model_dir, reranker_features_filename,`
			`reranker_weights_filename, parser_options,`
			`reranker_options)`

			`def demo():`
			`"""This assumes the Python module bllipparser is installed."""`

			`# download and install a basic unified parsing model (Wall Street Journal)`
			`# sudo python -m nltk.downloader bllip_wsj_no_aux`

			`from nltk.data import find`
			`model_dir = find('models/bllip_wsj_no_aux').path`

			`print('Loading BLLIP Parsing models...')`
			`# the easiest way to get started is to use a unified model`
			`bllip = BllipParser.from_unified_model_dir(model_dir)`
			`print('Done.')`

			`sentence1 = 'British left waffles on Falklands .'.split()`
			`sentence2 = 'I saw the man with the telescope .'.split()`
			`# this sentence is known to fail under the WSJ parsing model`
			`fail1 = '# ! ? : -'.split()`
			`for sentence in (sentence1, sentence2, fail1):`
			`print('Sentence: %r' % ' '.join(sentence))`
			`try:`
			`tree = next(bllip.parse(sentence))`
			`print(tree)`
			`except StopIteration:`
			`print("(parse failed)")`

			`# n-best parsing demo`
			`for i, parse in enumerate(bllip.parse(sentence1)):`
			`print('parse %d:\n%s' % (i, parse))`

			`# using external POS tag constraints`
			`print("forcing 'tree' to be 'NN':",`
			`next(bllip.tagged_parse([('A', None), ('tree', 'NN')])))`
			`print("forcing 'A' to be 'DT' and 'tree' to be 'NNP':",`
			`next(bllip.tagged_parse([('A', 'DT'), ('tree', 'NNP')])))`
			`# constraints don't have to make sense... (though on more complicated`
			`# sentences, they may cause the parse to fail)`
			`print("forcing 'A' to be 'NNP':",`
			`next(bllip.tagged_parse([('A', 'NNP'), ('tree', None)])))`

			`def setup_module(module):`
			`from nose import SkipTest`

			`try:`
			`_ensure_bllip_import_or_error()`
			`except ImportError:`
			`raise SkipTest('doctests from nltk.parse.bllip are skipped because '`
			`'the bllipparser module is not installed')`