laywerrobot/lib/python3.6/site-packages/nltk/parse/util.py

# Natural Language Toolkit: Parser Utility Functions
#
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
#
# Copyright (C) 2001-2018 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT


"""
Utility functions for parsers.
"""
from __future__ import print_function

from nltk.grammar import CFG, FeatureGrammar, PCFG
from nltk.data import load

from nltk.parse.chart import Chart, ChartParser
from nltk.parse.pchart import InsideChartParser
from nltk.parse.featurechart import FeatureChart, FeatureChartParser

def load_parser(grammar_url, trace=0,
                parser=None, chart_class=None,
                beam_size=0, **load_args):
    """
    Load the grammar found at ``grammar_url`` and return a parser for it.

    The kind of parser that is built is chosen from the type of the
    loaded grammar:

      - ``PCFG`` (``'pcfg'`` files): ``InsideChartParser``
      - ``FeatureGrammar`` (``'fcfg'`` files): ``FeatureChartParser``,
        storing its edges in a ``FeatureChart``
      - any other ``CFG`` (``'cfg'`` files): ``ChartParser``, storing its
        edges in a ``Chart``

    :type grammar_url: str
    :param grammar_url: A URL specifying where the grammar is located.
        The default protocol is ``"nltk:"``, which searches for the file
        in the NLTK data package.
    :type trace: int
    :param trace: The tracing level handed to the parser.  ``0`` produces
        no tracing output; higher numbers produce more verbose output.
    :param parser: The parser class to instantiate; should be
        ``ChartParser`` or a subclass.  If None, the class is picked
        from the grammar type as listed above.
    :param chart_class: The class used for storing the chart; should be
        ``Chart`` or a subclass.  Only passed on for CFGs and feature
        CFGs.  If None, it is picked from the grammar type.
    :type beam_size: int
    :param beam_size: The maximum length for the parser's edge queue.
        Only passed on for probabilistic CFGs.
    :param load_args: Keyword parameters forwarded to ``data.load``
        when loading the grammar.
    :return: An instance of the selected parser class.
    :raise ValueError: If the loaded object is not a ``CFG`` instance.
    """
    grammar = load(grammar_url, **load_args)
    if not isinstance(grammar, CFG):
        raise ValueError("The grammar must be a CFG, "
                         "or a subclass thereof.")

    # PCFG is tested first; its parser takes a beam size instead of a
    # chart class.
    if isinstance(grammar, PCFG):
        parser_cls = InsideChartParser if parser is None else parser
        return parser_cls(grammar, trace=trace, beam_size=beam_size)

    # The two remaining cases differ only in their default classes.
    if isinstance(grammar, FeatureGrammar):
        default_parser, default_chart = FeatureChartParser, FeatureChart
    else:  # Plain CFG.
        default_parser, default_chart = ChartParser, Chart

    parser_cls = default_parser if parser is None else parser
    chart_cls = default_chart if chart_class is None else chart_class
    return parser_cls(grammar, trace=trace, chart_class=chart_cls)

def taggedsent_to_conll(sentence):
    """
    Convert a single POS tagged sentence into CONLL format.

    One line is produced per ``(word, tag)`` pair.  Each line holds ten
    tab-separated columns and ends with a newline: the 1-based position
    of the word, the word, ``_``, the tag (twice), ``_``, ``0``, ``a``,
    ``_`` and ``_``.

    >>> from nltk import word_tokenize, pos_tag
    >>> text = "This is a foobar sentence."
    >>> for line in taggedsent_to_conll(pos_tag(word_tokenize(text))): # doctest: +NORMALIZE_WHITESPACE
    ...     print(line, end="")
    1 This _ DT DT _ 0 a _ _
    2 is _ VBZ VBZ _ 0 a _ _
    3 a _ DT DT _ 0 a _ _
    4 foobar _ JJ JJ _ 0 a _ _
    5 sentence _ NN NN _ 0 a _ _
    6 . _ . . _ 0 a _ _

    :param sentence: A single input sentence to convert
    :type sentence: list(tuple(str, str))
    :rtype: iter(str)
    :return: a generator yielding the sentence in CONLL format, one
        line per word.
    """
    for position, (token, pos) in enumerate(sentence, 1):
        columns = (str(position), token, '_', pos, pos, '_', '0', 'a', '_', '_')
        yield "\t".join(columns) + "\n"


def taggedsents_to_conll(sentences):
    """
    Convert a POS tagged document stream (i.e. a list of sentences,
    each a list of ``(word, tag)`` tuples) into lines in CONLL format.

    The lines of every sentence are produced by ``taggedsent_to_conll``
    (one line per word); after each sentence the string ``'\\n\\n'`` is
    yielded to mark the end of the sentence.

    >>> from nltk import word_tokenize, sent_tokenize, pos_tag
    >>> text = "This is a foobar sentence. Is that right?"
    >>> sentences = [pos_tag(word_tokenize(sent)) for sent in sent_tokenize(text)]
    >>> for line in taggedsents_to_conll(sentences): # doctest: +NORMALIZE_WHITESPACE
    ...     if line:
    ...         print(line, end="")
    1 This _ DT DT _ 0 a _ _
    2 is _ VBZ VBZ _ 0 a _ _
    3 a _ DT DT _ 0 a _ _
    4 foobar _ JJ JJ _ 0 a _ _
    5 sentence _ NN NN _ 0 a _ _
    6 . _ . . _ 0 a _ _
    <BLANKLINE>
    <BLANKLINE>
    1 Is _ VBZ VBZ _ 0 a _ _
    2 that _ IN IN _ 0 a _ _
    3 right _ NN NN _ 0 a _ _
    4 ? _ . . _ 0 a _ _
    <BLANKLINE>
    <BLANKLINE>

    :param sentences: Input sentences to convert
    :type sentences: list(list(tuple(str, str)))
    :rtype: iter(str)
    :return: a generator yielding sentences in CONLL format.
    """
    sentence_break = '\n\n'
    for tagged_sentence in sentences:
        conll_lines = taggedsent_to_conll(tagged_sentence)
        for conll_line in conll_lines:
            yield conll_line
        # Emitted after every sentence, including the last one.
        yield sentence_break

######################################################################
#{ Test Suites
######################################################################

class TestGrammar(object):
    """
    Run a suite of accept/reject sentence tests against a grammar.

    The grammar is turned into a parser with ``load_parser`` when the
    object is created; ``run`` then parses every sentence of the suite
    with that parser.
    """
    def __init__(self, grammar, suite, accept=None, reject=None):
        """
        :param grammar: The grammar location, passed on to ``load_parser``.
        :param suite: An iterable of test cases.  ``run`` reads the keys
            ``'doc'`` (a label string), ``'accept'`` and ``'reject'``
            (iterables of sentence strings) from each test case.
        :param accept: Stored as ``_accept``; not read anywhere else in
            this class.
        :param reject: Stored as ``_reject``; not read anywhere else in
            this class.
        """
        self.test_grammar = grammar

        self.cp = load_parser(grammar, trace=0)
        self.suite = suite
        self._accept = accept
        self._reject = reject


    def run(self, show_trees=False):
        """
        Parse every sentence of the suite and check the outcome.

        Sentences in the test suite are divided into two classes:
         - grammatical (``accept``) and
         - ungrammatical (``reject``).
        If a sentence should parse according to the grammar, the value of
        ``trees`` must be a non-empty list. If a sentence should be rejected
        according to the grammar, the value of ``trees`` must be an empty
        list.

        For each test case the ``doc`` label is printed, followed by
        ``"All tests passed!"`` once all of its sentences have been
        checked.  A test case whose ``accept`` or ``reject`` list is empty
        passes as long as its remaining sentences behave as expected.

        :type show_trees: bool
        :param show_trees: If true, print every sentence that received at
            least one parse, followed by its trees.
        :raise ValueError: At the first ``accept`` sentence without a
            parse, or the first ``reject`` sentence with a parse.
        """
        for test in self.suite:
            print(test['doc'] + ":", end=' ')
            for key in ['accept', 'reject']:
                for sent in test[key]:
                    tokens = sent.split()
                    trees = list(self.cp.parse(tokens))
                    if show_trees and trees:
                        print()
                        print(sent)
                        for tree in trees:
                            print(tree)
                    if key == 'accept':
                        if not trees:
                            raise ValueError("Sentence '%s' failed to parse'" % sent)
                    elif trees:
                        raise ValueError("Sentence '%s' received a parse'" % sent)
            # Any failing sentence has already raised, so getting here means
            # the whole test case passed.  No per-class flags are kept: they
            # used to be unbound when a class had no sentences and carried
            # over from one test case to the next.
            print("All tests passed!")

def extract_test_sentences(string, comment_chars="#%;", encoding=None):
    """
    Parses a string with one test sentence per line.
    Lines can optionally begin with:
      - a bool (``True``/``true``/``False``/``false``), saying if the
        sentence is grammatical or not, or
      - an int, giving the number of parse trees it should have.
    The result information is followed by a colon, and then the sentence.
    Whitespace around the result information is ignored.
    Empty lines and lines beginning with a comment char are ignored, as
    are lines whose sentence part contains no tokens.

    :param string: the text to parse; a binary string if ``encoding``
        is given.
    :param comment_chars: ``str`` of possible comment characters.
    :param encoding: the encoding of the string, if it is binary
    :return: a list of tuple of sentences and expected results,
        where a sentence is a list of str,
        and a result is None, or bool, or int
    :raise ValueError: if a line contains a colon but the text before the
        first colon is neither one of the accepted bools nor an int.
    """
    if encoding is not None:
        string = string.decode(encoding)
    sentences = []
    for line in string.split('\n'):
        if line == '' or line[0] in comment_chars:
            continue
        result = None
        # Only the first colon separates the result information from the
        # sentence; later colons belong to the sentence itself.
        label, colon, remainder = line.partition(':')
        if colon:
            # int() already tolerates surrounding whitespace; strip so the
            # bool spellings are treated the same way.
            label = label.strip()
            if label in ('True', 'true'):
                result = True
            elif label in ('False', 'false'):
                result = False
            else:
                result = int(label)
            line = remainder
        tokens = line.split()
        if tokens:
            sentences.append((tokens, result))
    return sentences

# nose would otherwise collect this function as a test because of its name
extract_test_sentences.__test__ = False
first commit 2020-08-27 21:55:39 +02:00			`# Natural Language Toolkit: Parser Utility Functions`
			`#`
			`# Author: Ewan Klein <ewan@inf.ed.ac.uk>`
			`#`
			`# Copyright (C) 2001-2018 NLTK Project`
			`# URL: <http://nltk.org/>`
			`# For license information, see LICENSE.TXT`


			`"""`
			`Utility functions for parsers.`
			`"""`
			`from __future__ import print_function`

			`from nltk.grammar import CFG, FeatureGrammar, PCFG`
			`from nltk.data import load`

			`from nltk.parse.chart import Chart, ChartParser`
			`from nltk.parse.pchart import InsideChartParser`
			`from nltk.parse.featurechart import FeatureChart, FeatureChartParser`

			`def load_parser(grammar_url, trace=0,`
			`parser=None, chart_class=None,`
			`beam_size=0, **load_args):`
			`"""`
			`Load a grammar from a file, and build a parser based on that grammar.`
			`The parser depends on the grammar format, and might also depend`
			`on properties of the grammar itself.`

			`The following grammar formats are currently supported:`
			- ``'cfg'`` (CFGs: ``CFG``)
			- ``'pcfg'`` (probabilistic CFGs: ``PCFG``)
			- ``'fcfg'`` (feature-based CFGs: ``FeatureGrammar``)

			`:type grammar_url: str`
			`:param grammar_url: A URL specifying where the grammar is located.`
			The default protocol is ``"nltk:"``, which searches for the file
			`in the NLTK data package.`
			`:type trace: int`
			`:param trace: The level of tracing that should be used when`
			parsing a text. ``0`` will generate no tracing output;
			`and higher numbers will produce more verbose tracing output.`
			:param parser: The class used for parsing; should be ``ChartParser``
			`or a subclass.`
			`If None, the class depends on the grammar format.`
			`:param chart_class: The class used for storing the chart;`
			should be ``Chart`` or a subclass.
			`Only used for CFGs and feature CFGs.`
			`If None, the chart class depends on the grammar format.`
			`:type beam_size: int`
			`:param beam_size: The maximum length for the parser's edge queue.`
			`Only used for probabilistic CFGs.`
			`:param load_args: Keyword parameters used when loading the grammar.`
			See ``data.load`` for more information.
			`"""`
			`grammar = load(grammar_url, **load_args)`
			`if not isinstance(grammar, CFG):`
			`raise ValueError("The grammar must be a CFG, "`
			`"or a subclass thereof.")`
			`if isinstance(grammar, PCFG):`
			`if parser is None:`
			`parser = InsideChartParser`
			`return parser(grammar, trace=trace, beam_size=beam_size)`

			`elif isinstance(grammar, FeatureGrammar):`
			`if parser is None:`
			`parser = FeatureChartParser`
			`if chart_class is None:`
			`chart_class = FeatureChart`
			`return parser(grammar, trace=trace, chart_class=chart_class)`

			`else: # Plain CFG.`
			`if parser is None:`
			`parser = ChartParser`
			`if chart_class is None:`
			`chart_class = Chart`
			`return parser(grammar, trace=trace, chart_class=chart_class)`

			`def taggedsent_to_conll(sentence):`
			`"""`
			`A module to convert a single POS tagged sentence into CONLL format.`

			`>>> from nltk import word_tokenize, pos_tag`
			`>>> text = "This is a foobar sentence."`
			`>>> for line in taggedsent_to_conll(pos_tag(word_tokenize(text))):`
			`... print(line, end="")`
			`1 This _ DT DT _ 0 a _ _`
			`2 is _ VBZ VBZ _ 0 a _ _`
			`3 a _ DT DT _ 0 a _ _`
			`4 foobar _ JJ JJ _ 0 a _ _`
			`5 sentence _ NN NN _ 0 a _ _`
			`6 . _ . . _ 0 a _ _`

			`:param sentence: A single input sentence to parse`
			`:type sentence: list(tuple(str, str))`
			`:rtype: iter(str)`
			`:return: a generator yielding a single sentence in CONLL format.`
			`"""`
			`for (i, (word, tag)) in enumerate(sentence, start=1):`
			`input_str = [str(i), word, '_', tag, tag, '_', '0', 'a', '_', '_']`
			`input_str = "\t".join(input_str) + "\n"`
			`yield input_str`


			`def taggedsents_to_conll(sentences):`
			`"""`
			`A module to convert a POS tagged document stream`
			`(i.e. list of list of tuples, a list of sentences) and yield lines`
			`in CONLL format. This module yields one line per word and two newlines`
			`for end of sentence.`

			`>>> from nltk import word_tokenize, sent_tokenize, pos_tag`
			`>>> text = "This is a foobar sentence. Is that right?"`
			`>>> sentences = [pos_tag(word_tokenize(sent)) for sent in sent_tokenize(text)]`
			`>>> for line in taggedsents_to_conll(sentences):`
			`... if line:`
			`... print(line, end="")`
			`1 This _ DT DT _ 0 a _ _`
			`2 is _ VBZ VBZ _ 0 a _ _`
			`3 a _ DT DT _ 0 a _ _`
			`4 foobar _ JJ JJ _ 0 a _ _`
			`5 sentence _ NN NN _ 0 a _ _`
			`6 . _ . . _ 0 a _ _`
			`<BLANKLINE>`
			`<BLANKLINE>`
			`1 Is _ VBZ VBZ _ 0 a _ _`
			`2 that _ IN IN _ 0 a _ _`
			`3 right _ NN NN _ 0 a _ _`
			`4 ? _ . . _ 0 a _ _`
			`<BLANKLINE>`
			`<BLANKLINE>`

			`:param sentences: Input sentences to parse`
			`:type sentence: list(list(tuple(str, str)))`
			`:rtype: iter(str)`
			`:return: a generator yielding sentences in CONLL format.`
			`"""`
			`for sentence in sentences:`
			`for input_str in taggedsent_to_conll(sentence):`
			`yield input_str`
			`yield '\n\n'`

			`######################################################################`
			`#{ Test Suites`
			`######################################################################`

			`class TestGrammar(object):`
			`"""`
			`Unit tests for CFG.`
			`"""`
			`def __init__(self, grammar, suite, accept=None, reject=None):`
			`self.test_grammar = grammar`

			`self.cp = load_parser(grammar, trace=0)`
			`self.suite = suite`
			`self._accept = accept`
			`self._reject = reject`


			`def run(self, show_trees=False):`
			`"""`
			`Sentences in the test suite are divided into two classes:`
			- grammatical (``accept``) and
			- ungrammatical (``reject``).
			`If a sentence should parse according to the grammar, the value of`
			``trees`` will be a non-empty list. If a sentence should be rejected
			according to the grammar, then the value of ``trees`` will be None.
			`"""`
			`for test in self.suite:`
			`print(test['doc'] + ":", end=' ')`
			`for key in ['accept', 'reject']:`
			`for sent in test[key]:`
			`tokens = sent.split()`
			`trees = list(self.cp.parse(tokens))`
			`if show_trees and trees:`
			`print()`
			`print(sent)`
			`for tree in trees:`
			`print(tree)`
			`if key == 'accept':`
			`if trees == []:`
			`raise ValueError("Sentence '%s' failed to parse'" % sent)`
			`else:`
			`accepted = True`
			`else:`
			`if trees:`
			`raise ValueError("Sentence '%s' received a parse'" % sent)`
			`else:`
			`rejected = True`
			`if accepted and rejected:`
			`print("All tests passed!")`

			`def extract_test_sentences(string, comment_chars="#%;", encoding=None):`
			`"""`
			`Parses a string with one test sentence per line.`
			`Lines can optionally begin with:`
			`- a bool, saying if the sentence is grammatical or not, or`
			`- an int, giving the number of parse trees it should have,`
			`The result information is followed by a colon, and then the sentence.`
			`Empty lines and lines beginning with a comment char are ignored.`

			`:return: a list of tuple of sentences and expected results,`
			`where a sentence is a list of str,`
			`and a result is None, or bool, or int`

			:param comment_chars: ``str`` of possible comment characters.
			`:param encoding: the encoding of the string, if it is binary`
			`"""`
			`if encoding is not None:`
			`string = string.decode(encoding)`
			`sentences = []`
			`for sentence in string.split('\n'):`
			`if sentence == '' or sentence[0] in comment_chars:`
			`continue`
			`split_info = sentence.split(':', 1)`
			`result = None`
			`if len(split_info) == 2:`
			`if split_info[0] in ['True','true','False','false']:`
			`result = split_info[0] in ['True','true']`
			`sentence = split_info[1]`
			`else:`
			`result = int(split_info[0])`
			`sentence = split_info[1]`
			`tokens = sentence.split()`
			`if tokens == []:`
			`continue`
			`sentences += [(tokens, result)]`
			`return sentences`

			`# nose thinks it is a test`
			`extract_test_sentences.__test__ = False`