# Natural Language Toolkit: Parser Utility Functions
#
# Author: Ewan Klein
#
# Copyright (C) 2001-2018 NLTK Project
# URL: http://nltk.org/
# For license information, see LICENSE.TXT

"""
Utility functions for parsers.
"""
from __future__ import print_function

from nltk.grammar import CFG, FeatureGrammar, PCFG
from nltk.data import load
from nltk.parse.chart import Chart, ChartParser
from nltk.parse.pchart import InsideChartParser
from nltk.parse.featurechart import FeatureChart, FeatureChartParser


def load_parser(grammar_url, trace=0, parser=None, chart_class=None,
                beam_size=0, **load_args):
    """
    Load a grammar from a file, and build a parser based on that grammar.
    The parser depends on the grammar format, and might also depend
    on properties of the grammar itself.

    The following grammar formats are currently supported:
      - ``'cfg'``  (CFGs: ``CFG``)
      - ``'pcfg'`` (probabilistic CFGs: ``PCFG``)
      - ``'fcfg'`` (feature-based CFGs: ``FeatureGrammar``)

    :type grammar_url: str
    :param grammar_url: A URL specifying where the grammar is located.
        The default protocol is ``"nltk:"``, which searches for the file
        in the NLTK data package.
    :type trace: int
    :param trace: The level of tracing that should be used when
        parsing a text.  ``0`` will generate no tracing output;
        higher numbers will produce more verbose tracing output.
    :param parser: The class used for parsing; should be ``ChartParser``
        or a subclass.  If None, the class depends on the grammar format.
    :param chart_class: The class used for storing the chart; should be
        ``Chart`` or a subclass.  Only used for CFGs and feature CFGs.
        If None, the chart class depends on the grammar format.
    :type beam_size: int
    :param beam_size: The maximum length for the parser's edge queue.
        Only used for probabilistic CFGs.
    :param load_args: Keyword parameters used when loading the grammar.
        See ``data.load`` for more information.
    """
    grammar = load(grammar_url, **load_args)
    if not isinstance(grammar, CFG):
        raise ValueError("The grammar must be a CFG, "
                         "or a subclass thereof.")
    if isinstance(grammar, PCFG):
        if parser is None:
            parser = InsideChartParser
        return parser(grammar, trace=trace, beam_size=beam_size)

    elif isinstance(grammar, FeatureGrammar):
        if parser is None:
            parser = FeatureChartParser
        if chart_class is None:
            chart_class = FeatureChart
        return parser(grammar, trace=trace, chart_class=chart_class)

    else:  # Plain CFG.
        if parser is None:
            parser = ChartParser
        if chart_class is None:
            chart_class = Chart
        return parser(grammar, trace=trace, chart_class=chart_class)
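
# A minimal usage sketch for ``load_parser``.  The helper name below is
# invented for illustration, and the grammar path is an assumption:
# ``feat0.fcfg`` ships with the book grammars in the NLTK data package,
# so this only runs if that data is installed.

def _demo_load_parser():
    # feat0.fcfg is a feature grammar, so load_parser returns a
    # FeatureChartParser here; a plain .cfg file would yield a
    # ChartParser, and a .pcfg file an InsideChartParser.
    cp = load_parser('grammars/book_grammars/feat0.fcfg', trace=0)
    for tree in cp.parse('Kim likes children'.split()):
        print(tree)
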
def taggedsent_to_conll(sentence):
    """
    Convert a single POS-tagged sentence into CONLL format.

    >>> from nltk import word_tokenize, pos_tag
    >>> text = "This is a foobar sentence."
    >>> for line in taggedsent_to_conll(pos_tag(word_tokenize(text))):
    ...     print(line, end="")
    1	This	_	DT	DT	_	0	a	_	_
    2	is	_	VBZ	VBZ	_	0	a	_	_
    3	a	_	DT	DT	_	0	a	_	_
    4	foobar	_	JJ	JJ	_	0	a	_	_
    5	sentence	_	NN	NN	_	0	a	_	_
    6	.	_	.	.	_	0	a	_	_

    :param sentence: A single input sentence to parse
    :type sentence: list(tuple(str, str))
    :rtype: iter(str)
    :return: a generator yielding a single sentence in CONLL format.
    """
    for (i, (word, tag)) in enumerate(sentence, start=1):
        input_str = [str(i), word, '_', tag, tag, '_', '0', 'a', '_', '_']
        input_str = "\t".join(input_str) + "\n"
        yield input_str


def taggedsents_to_conll(sentences):
    """
    Convert a POS-tagged document stream (i.e. a list of sentences, each
    a list of (word, tag) tuples) into lines in CONLL format.  This
    function yields one line per word and two newlines at the end of
    each sentence.

    >>> from nltk import word_tokenize, sent_tokenize, pos_tag
    >>> text = "This is a foobar sentence. Is that right?"
    >>> sentences = [pos_tag(word_tokenize(sent)) for sent in sent_tokenize(text)]
    >>> for line in taggedsents_to_conll(sentences):
    ...     if line:
    ...         print(line, end="")
    1	This	_	DT	DT	_	0	a	_	_
    2	is	_	VBZ	VBZ	_	0	a	_	_
    3	a	_	DT	DT	_	0	a	_	_
    4	foobar	_	JJ	JJ	_	0	a	_	_
    5	sentence	_	NN	NN	_	0	a	_	_
    6	.	_	.	.	_	0	a	_	_
    <BLANKLINE>
    <BLANKLINE>
    1	Is	_	VBZ	VBZ	_	0	a	_	_
    2	that	_	IN	IN	_	0	a	_	_
    3	right	_	NN	NN	_	0	a	_	_
    4	?	_	.	.	_	0	a	_	_
    <BLANKLINE>
    <BLANKLINE>

    :param sentences: Input sentences to parse
    :type sentences: list(list(tuple(str, str)))
    :rtype: iter(str)
    :return: a generator yielding sentences in CONLL format.
    """
    for sentence in sentences:
        for input_str in taggedsent_to_conll(sentence):
            yield input_str
        yield '\n\n'


######################################################################
#{ Test Suites
######################################################################

class TestGrammar(object):
    """
    Unit tests for CFG.
    """
    def __init__(self, grammar, suite, accept=None, reject=None):
        self.test_grammar = grammar

        self.cp = load_parser(grammar, trace=0)
        self.suite = suite
        self._accept = accept
        self._reject = reject

    def run(self, show_trees=False):
        """
        Sentences in the test suite are divided into two classes:

        - grammatical (``accept``) and
        - ungrammatical (``reject``).

        If a sentence should parse according to the grammar, the value
        of ``trees`` will be a non-empty list.  If a sentence should be
        rejected according to the grammar, then the value of ``trees``
        will be an empty list.
        """
        for test in self.suite:
            print(test['doc'] + ":", end=' ')
            # Reset per test; without this, a test with an empty
            # 'accept' or 'reject' list would raise a NameError below.
            accepted = rejected = False
            for key in ['accept', 'reject']:
                for sent in test[key]:
                    tokens = sent.split()
                    trees = list(self.cp.parse(tokens))
                    if show_trees and trees:
                        print()
                        print(sent)
                        for tree in trees:
                            print(tree)
                    if key == 'accept':
                        if trees == []:
                            raise ValueError("Sentence '%s' failed to parse." % sent)
                        else:
                            accepted = True
                    else:
                        if trees:
                            raise ValueError("Sentence '%s' received a parse." % sent)
                        else:
                            rejected = True
            if accepted and rejected:
                print("All tests passed!")


def extract_test_sentences(string, comment_chars="#%;", encoding=None):
    """
    Parses a string with one test sentence per line.
    Lines can optionally begin with:

    - a bool, saying if the sentence is grammatical or not, or
    - an int, giving the number of parse trees it should have.

    The result information is followed by a colon, and then the sentence.
    Empty lines and lines beginning with a comment char are ignored.

    :return: a list of (sentence, result) tuples, where a sentence is a
        list of str, and a result is None, or bool, or int

    :param comment_chars: ``str`` of possible comment characters.
    :param encoding: the encoding of the string, if it is binary
    """
    if encoding is not None:
        string = string.decode(encoding)
    sentences = []
    for sentence in string.split('\n'):
        if sentence == '' or sentence[0] in comment_chars:
            continue
        split_info = sentence.split(':', 1)
        result = None
        if len(split_info) == 2:
            if split_info[0] in ['True', 'true', 'False', 'false']:
                result = split_info[0] in ['True', 'true']
                sentence = split_info[1]
            else:
                result = int(split_info[0])
                sentence = split_info[1]
        tokens = sentence.split()
        if tokens == []:
            continue
        sentences += [(tokens, result)]
    return sentences


# nose thinks it is a test
extract_test_sentences.__test__ = False
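

# A short sketch of the plain-text format that ``extract_test_sentences``
# parses.  The helper name and the sample sentences below are invented
# for illustration only.

def _demo_extract_test_sentences():
    s = ("% comment lines and blank lines are skipped\n"
         "True: the dog barks\n"
         "false: dog the barks\n"
         "2: I saw the man with the telescope\n"
         "no result prefix\n")
    for tokens, result in extract_test_sentences(s):
        print(result, tokens)
    # Expected output:
    #   True ['the', 'dog', 'barks']
    #   False ['dog', 'the', 'barks']
    #   2 ['I', 'saw', 'the', 'man', 'with', 'the', 'telescope']
    #   None ['no', 'result', 'prefix']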