# -*- coding: utf-8 -*-
# Natural Language Toolkit: Chart Parser for Feature-Based Grammars
#
# Copyright (C) 2001-2018 NLTK Project
# Author: Rob Speer
#         Peter Ljunglöf
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
Extension of chart parsing implementation to handle grammars with
feature structures as nodes.
"""
from __future__ import print_function, unicode_literals

from six.moves import range

from nltk.compat import python_2_unicode_compatible
from nltk.featstruct import FeatStruct, unify, TYPE, find_variables
from nltk.sem import logic
from nltk.tree import Tree
from nltk.grammar import (Nonterminal, Production, CFG, FeatStructNonterminal,
                          is_nonterminal, is_terminal)
from nltk.parse.chart import (TreeEdge, Chart, ChartParser, EdgeI,
                              FundamentalRule, LeafInitRule,
                              EmptyPredictRule, BottomUpPredictRule,
                              SingleEdgeFundamentalRule,
                              BottomUpPredictCombineRule,
                              CachedTopDownPredictRule, TopDownInitRule)

#////////////////////////////////////////////////////////////
# Tree Edge
#////////////////////////////////////////////////////////////

@python_2_unicode_compatible
class FeatureTreeEdge(TreeEdge):
    """
    A specialized tree edge that allows shared variable bindings
    between nonterminals on the left-hand side and right-hand side.

    Each ``FeatureTreeEdge`` contains a set of ``bindings``, i.e., a
    dictionary mapping from variables to values.  If the edge is not
    complete, then these bindings are simply stored.  However, if the
    edge is complete, then the constructor applies these bindings to
    every nonterminal in the edge whose symbol implements the
    interface ``SubstituteBindingsI``.
    """
    def __init__(self, span, lhs, rhs, dot=0, bindings=None):
        """
        Construct a new edge.  If the edge is incomplete (i.e., if
        ``dot<len(rhs)``), then store the bindings as-is.  If the edge
        is complete (i.e., if ``dot==len(rhs)``), then apply the
        bindings to all nonterminals in ``lhs`` and ``rhs``, and then
        clear the bindings.  See ``TreeEdge`` for a description of
        the other arguments.
        """
        if bindings is None:
            bindings = {}

        # If the edge is complete, then substitute in the bindings,
        # and then throw them away.  (If we didn't, then we might
        # think that two complete edges are different just because
        # they have different bindings, even though all bindings have
        # already been applied.)
        if dot == len(rhs) and bindings:
            lhs = self._bind(lhs, bindings)
            rhs = tuple(self._bind(elt, bindings) for elt in rhs)
            bindings = {}

        # Initialize the edge.
        TreeEdge.__init__(self, span, lhs, rhs, dot)
        self._bindings = bindings
        self._comparison_key = (self._comparison_key +
                                (tuple(sorted(bindings.items())),))

    @staticmethod
    def from_production(production, index):
        """
        :return: A new ``FeatureTreeEdge`` formed from the given production.
            The new edge's left-hand side and right-hand side will
            be taken from ``production``; its span will be
            ``(index, index)``; and its dot position will be ``0``.
        :rtype: FeatureTreeEdge
        """
        return FeatureTreeEdge(span=(index, index), lhs=production.lhs(),
                               rhs=production.rhs(), dot=0)

    def move_dot_forward(self, new_end, bindings=None):
        """
        :return: A new ``FeatureTreeEdge`` formed from this edge.
            The new edge's dot position is increased by ``1``, and
            its end index will be replaced by ``new_end``.
        :rtype: FeatureTreeEdge
        :param new_end: The new end index.
        :type new_end: int
        :param bindings: Bindings for the new edge.
        :type bindings: dict
        """
        return FeatureTreeEdge(span=(self._span[0], new_end),
                               lhs=self._lhs, rhs=self._rhs,
                               dot=self._dot + 1, bindings=bindings)

    def _bind(self, nt, bindings):
        if not isinstance(nt, FeatStructNonterminal):
            return nt
        return nt.substitute_bindings(bindings)

    def next_with_bindings(self):
        return self._bind(self.nextsym(), self._bindings)

    def bindings(self):
        """
        Return a copy of this edge's bindings dictionary.
        """
        return self._bindings.copy()

    def variables(self):
        """
        :return: The set of variables used by this edge.
        :rtype: set(Variable)
        """
        return find_variables([self._lhs] + list(self._rhs) +
                              list(self._bindings.keys()) +
                              list(self._bindings.values()),
                              fs_class=FeatStruct)

    def __str__(self):
        if self.is_complete():
            return TreeEdge.__unicode__(self)
        else:
            bindings = '{%s}' % ', '.join('%s: %r' % item for item
                                          in sorted(self._bindings.items()))
            return '%s %s' % (TreeEdge.__unicode__(self), bindings)


#////////////////////////////////////////////////////////////
# Feature Chart
#////////////////////////////////////////////////////////////

class FeatureChart(Chart):
    """
    A Chart for feature grammars.

    :see: ``Chart`` for more information.
    """

    def select(self, **restrictions):
        """
        Returns an iterator over the edges in this chart.
        See ``Chart.select`` for more information about the
        ``restrictions`` on the edges.
        """
        # If there are no restrictions, then return all edges.
        if restrictions == {}:
            return iter(self._edges)

        # Find the index corresponding to the given restrictions.
        restr_keys = tuple(sorted(restrictions.keys()))

        # If it doesn't exist, then create it.
        if restr_keys not in self._indexes:
            self._add_index(restr_keys)

        vals = tuple(self._get_type_if_possible(restrictions[key])
                     for key in restr_keys)
        return iter(self._indexes[restr_keys].get(vals, []))

    def _add_index(self, restr_keys):
        """
        A helper function for ``select``, which creates a new index for
        a given set of attributes (aka restriction keys).
        """
        # Make sure it's a valid index.
        for key in restr_keys:
            if not hasattr(EdgeI, key):
                raise ValueError('Bad restriction: %s' % key)

        # Create the index.
        index = self._indexes[restr_keys] = {}

        # Add all existing edges to the index.
        for edge in self._edges:
            vals = tuple(self._get_type_if_possible(getattr(edge, key)())
                         for key in restr_keys)
            index.setdefault(vals, []).append(edge)

    def _register_with_indexes(self, edge):
        """
        A helper function for ``insert``, which registers the new
        edge with all existing indexes.
        """
        for (restr_keys, index) in self._indexes.items():
            vals = tuple(self._get_type_if_possible(getattr(edge, key)())
                         for key in restr_keys)
            index.setdefault(vals, []).append(edge)

    def _get_type_if_possible(self, item):
        """
        Helper function which returns the ``TYPE`` feature of the ``item``,
        if it exists, otherwise it returns the ``item`` itself.
        """
        if isinstance(item, dict) and TYPE in item:
            return item[TYPE]
        else:
            return item

    def parses(self, start, tree_class=Tree):
        for edge in self.select(start=0, end=self.num_leaves()):
            if (isinstance(edge, FeatureTreeEdge) and
                    edge.lhs()[TYPE] == start[TYPE] and
                    unify(edge.lhs(), start, rename_vars=True)):
                for tree in self.trees(edge, complete=True,
                                       tree_class=tree_class):
                    yield tree


#////////////////////////////////////////////////////////////
# Fundamental Rule
#////////////////////////////////////////////////////////////

class FeatureFundamentalRule(FundamentalRule):
    """
    A specialized version of the fundamental rule that operates on
    nonterminals whose symbols are ``FeatStructNonterminal``s.  Rather
    than simply comparing the nonterminals for equality, they are
    unified.  Variable bindings from these unifications are collected
    and stored in the chart using a ``FeatureTreeEdge``.  When a
    complete edge is generated, these bindings are applied to all
    nonterminals in the edge.

    The fundamental rule states that:

    - ``[A -> alpha \* B1 beta][i:j]``
    - ``[B2 -> gamma \*][j:k]``

    licenses the edge:

    - ``[A -> alpha B3 \* beta][i:k]``

    assuming that B1 and B2 can be unified to generate B3.
    """
    def apply(self, chart, grammar, left_edge, right_edge):
        # Make sure the rule is applicable.
        if not (left_edge.end() == right_edge.start() and
                left_edge.is_incomplete() and
                right_edge.is_complete() and
                isinstance(left_edge, FeatureTreeEdge)):
            return
        found = right_edge.lhs()
        nextsym = left_edge.nextsym()
        if isinstance(right_edge, FeatureTreeEdge):
            if not is_nonterminal(nextsym):
                return
            if left_edge.nextsym()[TYPE] != right_edge.lhs()[TYPE]:
                return
            # Create a copy of the bindings.
            bindings = left_edge.bindings()
            # We rename vars here, because we don't want variables
            # from the two different productions to match.
            found = found.rename_variables(used_vars=left_edge.variables())
            # Unify B1 (left_edge.nextsym) with B2 (right_edge.lhs) to
            # generate B3 (result).
            result = unify(nextsym, found, bindings, rename_vars=False)
            if result is None:
                return
        else:
            if nextsym != found:
                return
            # Create a copy of the bindings.
            bindings = left_edge.bindings()

        # Construct the new edge.
        new_edge = left_edge.move_dot_forward(right_edge.end(), bindings)

        # Add it to the chart, with appropriate child pointers.
        if chart.insert_with_backpointer(new_edge, left_edge, right_edge):
            yield new_edge


class FeatureSingleEdgeFundamentalRule(SingleEdgeFundamentalRule):
    """
    A specialized version of the completer / single edge fundamental rule
    that operates on nonterminals whose symbols are
    ``FeatStructNonterminal``s.  Rather than simply comparing the
    nonterminals for equality, they are unified.
    """
    _fundamental_rule = FeatureFundamentalRule()

    def _apply_complete(self, chart, grammar, right_edge):
        fr = self._fundamental_rule
        for left_edge in chart.select(end=right_edge.start(),
                                      is_complete=False,
                                      nextsym=right_edge.lhs()):
            for new_edge in fr.apply(chart, grammar, left_edge, right_edge):
                yield new_edge

    def _apply_incomplete(self, chart, grammar, left_edge):
        fr = self._fundamental_rule
        for right_edge in chart.select(start=left_edge.end(),
                                       is_complete=True,
                                       lhs=left_edge.nextsym()):
            for new_edge in fr.apply(chart, grammar, left_edge, right_edge):
                yield new_edge


#////////////////////////////////////////////////////////////
# Top-Down Prediction
#////////////////////////////////////////////////////////////

class FeatureTopDownInitRule(TopDownInitRule):
    def apply(self, chart, grammar):
        for prod in grammar.productions(lhs=grammar.start()):
            new_edge = FeatureTreeEdge.from_production(prod, 0)
            if chart.insert(new_edge, ()):
                yield new_edge


class FeatureTopDownPredictRule(CachedTopDownPredictRule):
    """
    A specialized version of the (cached) top down predict rule that
    operates on nonterminals whose symbols are
    ``FeatStructNonterminal``s.  Rather than simply comparing the
    nonterminals for equality, they are unified.

    The top down expand rule states that:

    - ``[A -> alpha \* B1 beta][i:j]``

    licenses the edge:

    - ``[B2 -> \* gamma][j:j]``

    for each grammar production ``B2 -> gamma``, assuming that B1
    and B2 can be unified.
    """
    def apply(self, chart, grammar, edge):
        if edge.is_complete():
            return
        nextsym, index = edge.nextsym(), edge.end()
        if not is_nonterminal(nextsym):
            return

        # If we've already applied this rule to an edge with the same
        # next & end, and the chart & grammar have not changed, then
        # just return (no new edges to add).
        nextsym_with_bindings = edge.next_with_bindings()
        done = self._done.get((nextsym_with_bindings, index), (None, None))
        if done[0] is chart and done[1] is grammar:
            return

        for prod in grammar.productions(lhs=nextsym):
            # If the left corner in the predicted production is
            # leaf, it must match with the input.
            if prod.rhs():
                first = prod.rhs()[0]
                if is_terminal(first):
                    if index >= chart.num_leaves():
                        continue
                    if first != chart.leaf(index):
                        continue

            # We rename vars here, because we don't want variables
            # from the two different productions to match.
            if unify(prod.lhs(), nextsym_with_bindings, rename_vars=True):
                new_edge = FeatureTreeEdge.from_production(prod, edge.end())
                if chart.insert(new_edge, ()):
                    yield new_edge

        # Record the fact that we've applied this rule.
        self._done[nextsym_with_bindings, index] = (chart, grammar)


#////////////////////////////////////////////////////////////
# Bottom-Up Prediction
#////////////////////////////////////////////////////////////

class FeatureBottomUpPredictRule(BottomUpPredictRule):
    def apply(self, chart, grammar, edge):
        if edge.is_incomplete():
            return
        for prod in grammar.productions(rhs=edge.lhs()):
            if isinstance(edge, FeatureTreeEdge):
                _next = prod.rhs()[0]
                if not is_nonterminal(_next):
                    continue
            new_edge = FeatureTreeEdge.from_production(prod, edge.start())
            if chart.insert(new_edge, ()):
                yield new_edge
class FeatureBottomUpPredictCombineRule(BottomUpPredictCombineRule):
    def apply(self, chart, grammar, edge):
        if edge.is_incomplete():
            return
        found = edge.lhs()
        for prod in grammar.productions(rhs=found):
            bindings = {}
            if isinstance(edge, FeatureTreeEdge):
                _next = prod.rhs()[0]
                if not is_nonterminal(_next):
                    continue

                # We rename vars here, because we don't want variables
                # from the two different productions to match.
                used_vars = find_variables((prod.lhs(),) + prod.rhs(),
                                           fs_class=FeatStruct)
                found = found.rename_variables(used_vars=used_vars)

                result = unify(_next, found, bindings, rename_vars=False)
                if result is None:
                    continue

            new_edge = (FeatureTreeEdge.from_production(prod, edge.start())
                        .move_dot_forward(edge.end(), bindings))
            if chart.insert(new_edge, (edge,)):
                yield new_edge


class FeatureEmptyPredictRule(EmptyPredictRule):
    def apply(self, chart, grammar):
        for prod in grammar.productions(empty=True):
            for index in range(chart.num_leaves() + 1):
                new_edge = FeatureTreeEdge.from_production(prod, index)
                if chart.insert(new_edge, ()):
                    yield new_edge


#////////////////////////////////////////////////////////////
# Feature Chart Parser
#////////////////////////////////////////////////////////////

TD_FEATURE_STRATEGY = [LeafInitRule(),
                       FeatureTopDownInitRule(),
                       FeatureTopDownPredictRule(),
                       FeatureSingleEdgeFundamentalRule()]
BU_FEATURE_STRATEGY = [LeafInitRule(),
                       FeatureEmptyPredictRule(),
                       FeatureBottomUpPredictRule(),
                       FeatureSingleEdgeFundamentalRule()]
BU_LC_FEATURE_STRATEGY = [LeafInitRule(),
                          FeatureEmptyPredictRule(),
                          FeatureBottomUpPredictCombineRule(),
                          FeatureSingleEdgeFundamentalRule()]


class FeatureChartParser(ChartParser):
    def __init__(self, grammar,
                 strategy=BU_LC_FEATURE_STRATEGY,
                 trace_chart_width=20,
                 chart_class=FeatureChart,
                 **parser_args):
        ChartParser.__init__(self, grammar,
                             strategy=strategy,
                             trace_chart_width=trace_chart_width,
                             chart_class=chart_class,
                             **parser_args)


class FeatureTopDownChartParser(FeatureChartParser):
    def __init__(self, grammar, **parser_args):
        FeatureChartParser.__init__(self, grammar,
                                    TD_FEATURE_STRATEGY, **parser_args)


class FeatureBottomUpChartParser(FeatureChartParser):
    def __init__(self, grammar, **parser_args):
        FeatureChartParser.__init__(self, grammar,
                                    BU_FEATURE_STRATEGY, **parser_args)


class FeatureBottomUpLeftCornerChartParser(FeatureChartParser):
    def __init__(self, grammar, **parser_args):
        FeatureChartParser.__init__(self, grammar,
                                    BU_LC_FEATURE_STRATEGY, **parser_args)
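
# ---------------------------------------------------------------------
# Illustrative sketch (not part of the original module): the kind of
# zero-width, dot-at-the-start edge that the predict rules above insert
# into the chart via ``FeatureTreeEdge.from_production``.  The toy
# grammar and the helper name ``_demo_predicted_edge`` are invented for
# this example.
# ---------------------------------------------------------------------
def _demo_predicted_edge():
    from nltk.grammar import FeatureGrammar
    gram = FeatureGrammar.fromstring("""
        S -> NP VP
        NP -> 'Kim'
        VP -> 'sleeps'
    """)
    prod = gram.productions()[0]                  # S -> NP VP
    edge = FeatureTreeEdge.from_production(prod, 0)
    # A predicted edge covers no input yet (span [0:0]) and still has
    # its whole right-hand side ahead of the dot.
    assert edge.span() == (0, 0)
    assert edge.is_incomplete()
    assert edge.nextsym() == prod.rhs()[0]
    return edge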
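
# ---------------------------------------------------------------------
# Illustrative sketch (not part of the original module): the rename-then-
# unify step shared by FeatureFundamentalRule and
# FeatureBottomUpPredictCombineRule above.  The toy grammar and the helper
# name ``_demo_nonterminal_unification`` are invented for this example;
# only ``FeatureGrammar``, ``unify``, ``variables`` and
# ``rename_variables`` come from NLTK itself.
# ---------------------------------------------------------------------
def _demo_nonterminal_unification():
    from nltk.grammar import FeatureGrammar
    gram = FeatureGrammar.fromstring("""
        S -> NP[num=?n] VP[num=?n]
        NP[num=sg] -> 'Kim'
        VP[num=sg] -> 'sleeps'
    """)
    # B1: the symbol an incomplete S-edge is waiting for (NP[num=?n]).
    # B2: the left-hand side of a complete NP-edge (NP[num=sg]).
    b1 = gram.productions()[0].rhs()[0]
    b2 = gram.productions()[1].lhs()
    # Mirror the rules above: rename B2's variables away from B1's, then
    # unify without further renaming so the bindings are recorded.
    bindings = {}
    b2 = b2.rename_variables(used_vars=b1.variables())
    b3 = unify(b1, b2, bindings, rename_vars=False)
    # On success, b3 is the merged nonterminal (an NP whose num feature
    # is fixed) and ``bindings`` records the value that ?n must take for
    # reuse in the rest of the S production.
    return b3, bindings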
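
# ---------------------------------------------------------------------
# Illustrative sketch (not part of the original module): the three parser
# front-ends above accept the same grammar and tokens and differ only in
# which rule strategy fills the chart.  The toy grammar and the helper
# name ``_demo_strategies`` are invented for this example.
# ---------------------------------------------------------------------
def _demo_strategies(sent='Kim sleeps'):
    from nltk.grammar import FeatureGrammar
    gram = FeatureGrammar.fromstring("""
        S -> NP[num=?n] VP[num=?n]
        NP[num=sg] -> 'Kim'
        VP[num=sg] -> 'sleeps'
    """)
    tokens = sent.split()
    for parser_class in (FeatureTopDownChartParser,
                         FeatureBottomUpChartParser,
                         FeatureBottomUpLeftCornerChartParser):
        trees = list(parser_class(gram).parse(tokens))
        # Every strategy licenses the same single parse for this sentence.
        print("%-40s %d tree(s)" % (parser_class.__name__, len(trees)))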

#////////////////////////////////////////////////////////////
# Instantiate Variable Chart
#////////////////////////////////////////////////////////////

class InstantiateVarsChart(FeatureChart):
    """
    A specialized chart that 'instantiates' variables whose names
    start with '@', by replacing them with unique new variables.
    In particular, whenever a complete edge is added to the chart, any
    variables in the edge's ``lhs`` whose names start with '@' will be
    replaced by unique new ``Variable``s.
    """
    def __init__(self, tokens):
        FeatureChart.__init__(self, tokens)

    def initialize(self):
        self._instantiated = set()
        FeatureChart.initialize(self)

    def insert(self, edge, child_pointer_list):
        if edge in self._instantiated:
            return False
        self.instantiate_edge(edge)
        return FeatureChart.insert(self, edge, child_pointer_list)

    def instantiate_edge(self, edge):
        """
        If the edge is a ``FeatureTreeEdge``, and it is complete,
        then instantiate all variables whose names start with '@',
        by replacing them with unique new variables.

        Note that instantiation is done in-place, since the parsing
        algorithms might already hold a reference to the edge for
        future use.
        """
        # If the edge is a leaf, or is not complete, or is
        # already in the chart, then just return it as-is.
        if not isinstance(edge, FeatureTreeEdge):
            return
        if not edge.is_complete():
            return
        if edge in self._edge_to_cpls:
            return

        # Get a list of variables that need to be instantiated.
        # If there are none, then return as-is.
        inst_vars = self.inst_vars(edge)
        if not inst_vars:
            return

        # Instantiate the edge!
        self._instantiated.add(edge)
        edge._lhs = edge.lhs().substitute_bindings(inst_vars)

    def inst_vars(self, edge):
        return dict((var, logic.unique_variable())
                    for var in edge.lhs().variables()
                    if var.name.startswith('@'))


#////////////////////////////////////////////////////////////
# Demo
#////////////////////////////////////////////////////////////

def demo_grammar():
    from nltk.grammar import FeatureGrammar
    return FeatureGrammar.fromstring("""
S -> NP VP
PP -> Prep NP
NP -> NP PP
VP -> VP PP
VP -> Verb NP
VP -> Verb
NP -> Det[pl=?x] Noun[pl=?x]
NP -> "John"
NP -> "I"
Det -> "the"
Det -> "my"
Det[-pl] -> "a"
Noun[-pl] -> "dog"
Noun[-pl] -> "cookie"
Verb -> "ate"
Verb -> "saw"
Prep -> "with"
Prep -> "under"
""")


def demo(print_times=True, print_grammar=True,
         print_trees=True, print_sentence=True,
         trace=1,
         parser=FeatureChartParser,
         sent='I saw John with a dog with my cookie'):
    import sys, time

    print()
    grammar = demo_grammar()
    if print_grammar:
        print(grammar)
        print()
    print("*", parser.__name__)
    if print_sentence:
        print("Sentence:", sent)
    tokens = sent.split()
    t = time.clock()
    cp = parser(grammar, trace=trace)
    chart = cp.chart_parse(tokens)
    trees = list(chart.parses(grammar.start()))
    if print_times:
        print("Time: %s" % (time.clock() - t))
    if print_trees:
        for tree in trees:
            print(tree)
    else:
        print("Nr trees:", len(trees))


def run_profile():
    import profile
    profile.run('for i in range(1): demo()', '/tmp/profile.out')
    import pstats
    p = pstats.Stats('/tmp/profile.out')
    p.strip_dirs().sort_stats('time', 'cum').print_stats(60)
    p.strip_dirs().sort_stats('cum', 'time').print_stats(60)


if __name__ == '__main__':
    from nltk.data import load
    demo()
    print()
    grammar = load('grammars/book_grammars/feat0.fcfg')
    cp = FeatureChartParser(grammar, trace=2)
    sent = 'Kim likes children'
    tokens = sent.split()
    trees = cp.parse(tokens)
    for tree in trees:
        print(tree)
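    # Illustrative addition (not part of the original module): rerun the
    # same sentence through the chart_parse() API, without tracing, to
    # show how many edges the feature chart accumulates for it.
    chart = FeatureChartParser(grammar).chart_parse(tokens)
    print("Nr edges in chart:", chart.num_edges())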