1682 lines
60 KiB
Python
1682 lines
60 KiB
Python
# -*- coding: utf-8 -*-
|
|
# Natural Language Toolkit: A Chart Parser
|
|
#
|
|
# Copyright (C) 2001-2018 NLTK Project
|
|
# Author: Edward Loper <edloper@gmail.com>
|
|
# Steven Bird <stevenbird1@gmail.com>
|
|
# Jean Mark Gawron <gawron@mail.sdsu.edu>
|
|
# Peter Ljunglöf <peter.ljunglof@heatherleaf.se>
|
|
# URL: <http://nltk.org/>
|
|
# For license information, see LICENSE.TXT
|
|
|
|
"""
|
|
Data classes and parser implementations for "chart parsers", which
|
|
use dynamic programming to efficiently parse a text. A chart
|
|
parser derives parse trees for a text by iteratively adding "edges"
|
|
to a "chart." Each edge represents a hypothesis about the tree
|
|
structure for a subsequence of the text. The chart is a
|
|
"blackboard" for composing and combining these hypotheses.
|
|
|
|
When a chart parser begins parsing a text, it creates a new (empty)
|
|
chart, spanning the text. It then incrementally adds new edges to the
|
|
chart. A set of "chart rules" specifies the conditions under which
|
|
new edges should be added to the chart. Once the chart reaches a
|
|
stage where none of the chart rules adds any new edges, parsing is
|
|
complete.
|
|
|
|
Charts are encoded with the ``Chart`` class, and edges are encoded with
|
|
the ``TreeEdge`` and ``LeafEdge`` classes. The chart parser module
|
|
defines three chart parsers:
|
|
|
|
- ``ChartParser`` is a simple and flexible chart parser. Given a
|
|
set of chart rules, it will apply those rules to the chart until
|
|
no more edges are added.
|
|
|
|
- ``SteppingChartParser`` is a subclass of ``ChartParser`` that can
|
|
be used to step through the parsing process.
|
|
"""
|
|
from __future__ import print_function, division, unicode_literals
|
|
|
|
import itertools
|
|
import re
|
|
import warnings
|
|
from functools import total_ordering
|
|
|
|
from six.moves import range
|
|
|
|
from nltk.tree import Tree
|
|
from nltk.grammar import PCFG, is_nonterminal, is_terminal
|
|
from nltk.util import OrderedDict
|
|
from nltk.internals import raise_unorderable_types
|
|
from nltk.compat import python_2_unicode_compatible, unicode_repr
|
|
|
|
from nltk.parse.api import ParserI
|
|
|
|
|
|
########################################################################
|
|
## Edges
|
|
########################################################################
|
|
|
|
@total_ordering
|
|
class EdgeI(object):
|
|
"""
|
|
A hypothesis about the structure of part of a sentence.
|
|
Each edge records the fact that a structure is (partially)
|
|
consistent with the sentence. An edge contains:
|
|
|
|
- A span, indicating what part of the sentence is
|
|
consistent with the hypothesized structure.
|
|
- A left-hand side, specifying what kind of structure is
|
|
hypothesized.
|
|
- A right-hand side, specifying the contents of the
|
|
hypothesized structure.
|
|
- A dot position, indicating how much of the hypothesized
|
|
structure is consistent with the sentence.
|
|
|
|
Every edge is either complete or incomplete:
|
|
|
|
- An edge is complete if its structure is fully consistent
|
|
with the sentence.
|
|
- An edge is incomplete if its structure is partially
|
|
consistent with the sentence. For every incomplete edge, the
|
|
span specifies a possible prefix for the edge's structure.
|
|
|
|
There are two kinds of edge:
|
|
|
|
- A ``TreeEdge`` records which trees have been found to
|
|
be (partially) consistent with the text.
|
|
- A ``LeafEdge`` records the tokens occurring in the text.
|
|
|
|
The ``EdgeI`` interface provides a common interface to both types
|
|
of edge, allowing chart parsers to treat them in a uniform manner.
|
|
"""
|
|
def __init__(self):
|
|
if self.__class__ == EdgeI:
|
|
raise TypeError('Edge is an abstract interface')
|
|
|
|
#////////////////////////////////////////////////////////////
|
|
# Span
|
|
#////////////////////////////////////////////////////////////
|
|
|
|
def span(self):
|
|
"""
|
|
Return a tuple ``(s, e)``, where ``tokens[s:e]`` is the
|
|
portion of the sentence that is consistent with this
|
|
edge's structure.
|
|
|
|
:rtype: tuple(int, int)
|
|
"""
|
|
raise NotImplementedError()
|
|
|
|
def start(self):
|
|
"""
|
|
Return the start index of this edge's span.
|
|
|
|
:rtype: int
|
|
"""
|
|
raise NotImplementedError()
|
|
|
|
def end(self):
|
|
"""
|
|
Return the end index of this edge's span.
|
|
|
|
:rtype: int
|
|
"""
|
|
raise NotImplementedError()
|
|
|
|
def length(self):
|
|
"""
|
|
Return the length of this edge's span.
|
|
|
|
:rtype: int
|
|
"""
|
|
raise NotImplementedError()
|
|
|
|
#////////////////////////////////////////////////////////////
|
|
# Left Hand Side
|
|
#////////////////////////////////////////////////////////////
|
|
|
|
def lhs(self):
|
|
"""
|
|
Return this edge's left-hand side, which specifies what kind
|
|
of structure is hypothesized by this edge.
|
|
|
|
:see: ``TreeEdge`` and ``LeafEdge`` for a description of
|
|
the left-hand side values for each edge type.
|
|
"""
|
|
raise NotImplementedError()
|
|
|
|
#////////////////////////////////////////////////////////////
|
|
# Right Hand Side
|
|
#////////////////////////////////////////////////////////////
|
|
|
|
def rhs(self):
|
|
"""
|
|
Return this edge's right-hand side, which specifies
|
|
the content of the structure hypothesized by this edge.
|
|
|
|
:see: ``TreeEdge`` and ``LeafEdge`` for a description of
|
|
the right-hand side values for each edge type.
|
|
"""
|
|
raise NotImplementedError()
|
|
|
|
def dot(self):
|
|
"""
|
|
Return this edge's dot position, which indicates how much of
|
|
the hypothesized structure is consistent with the
|
|
sentence. In particular, ``self.rhs[:dot]`` is consistent
|
|
with ``tokens[self.start():self.end()]``.
|
|
|
|
:rtype: int
|
|
"""
|
|
raise NotImplementedError()
|
|
|
|
def nextsym(self):
|
|
"""
|
|
Return the element of this edge's right-hand side that
|
|
immediately follows its dot.
|
|
|
|
:rtype: Nonterminal or terminal or None
|
|
"""
|
|
raise NotImplementedError()
|
|
|
|
def is_complete(self):
|
|
"""
|
|
Return True if this edge's structure is fully consistent
|
|
with the text.
|
|
|
|
:rtype: bool
|
|
"""
|
|
raise NotImplementedError()
|
|
|
|
def is_incomplete(self):
|
|
"""
|
|
Return True if this edge's structure is partially consistent
|
|
with the text.
|
|
|
|
:rtype: bool
|
|
"""
|
|
raise NotImplementedError()
|
|
|
|
#////////////////////////////////////////////////////////////
|
|
# Comparisons & hashing
|
|
#////////////////////////////////////////////////////////////
|
|
|
|
def __eq__(self, other):
|
|
return (self.__class__ is other.__class__ and
|
|
self._comparison_key == other._comparison_key)
|
|
|
|
def __ne__(self, other):
|
|
return not self == other
|
|
|
|
def __lt__(self, other):
|
|
if not isinstance(other, EdgeI):
|
|
raise_unorderable_types("<", self, other)
|
|
if self.__class__ is other.__class__:
|
|
return self._comparison_key < other._comparison_key
|
|
else:
|
|
return self.__class__.__name__ < other.__class__.__name__
|
|
|
|
def __hash__(self):
|
|
try:
|
|
return self._hash
|
|
except AttributeError:
|
|
self._hash = hash(self._comparison_key)
|
|
return self._hash
|
|
|
|
|
|
@python_2_unicode_compatible
|
|
class TreeEdge(EdgeI):
|
|
"""
|
|
An edge that records the fact that a tree is (partially)
|
|
consistent with the sentence. A tree edge consists of:
|
|
|
|
- A span, indicating what part of the sentence is
|
|
consistent with the hypothesized tree.
|
|
- A left-hand side, specifying the hypothesized tree's node
|
|
value.
|
|
- A right-hand side, specifying the hypothesized tree's
|
|
children. Each element of the right-hand side is either a
|
|
terminal, specifying a token with that terminal as its leaf
|
|
value; or a nonterminal, specifying a subtree with that
|
|
nonterminal's symbol as its node value.
|
|
- A dot position, indicating which children are consistent
|
|
with part of the sentence. In particular, if ``dot`` is the
|
|
dot position, ``rhs`` is the right-hand size, ``(start,end)``
|
|
is the span, and ``sentence`` is the list of tokens in the
|
|
sentence, then ``tokens[start:end]`` can be spanned by the
|
|
children specified by ``rhs[:dot]``.
|
|
|
|
For more information about edges, see the ``EdgeI`` interface.
|
|
"""
|
|
def __init__(self, span, lhs, rhs, dot=0):
|
|
"""
|
|
Construct a new ``TreeEdge``.
|
|
|
|
:type span: tuple(int, int)
|
|
:param span: A tuple ``(s, e)``, where ``tokens[s:e]`` is the
|
|
portion of the sentence that is consistent with the new
|
|
edge's structure.
|
|
:type lhs: Nonterminal
|
|
:param lhs: The new edge's left-hand side, specifying the
|
|
hypothesized tree's node value.
|
|
:type rhs: list(Nonterminal and str)
|
|
:param rhs: The new edge's right-hand side, specifying the
|
|
hypothesized tree's children.
|
|
:type dot: int
|
|
:param dot: The position of the new edge's dot. This position
|
|
specifies what prefix of the production's right hand side
|
|
is consistent with the text. In particular, if
|
|
``sentence`` is the list of tokens in the sentence, then
|
|
``okens[span[0]:span[1]]`` can be spanned by the
|
|
children specified by ``rhs[:dot]``.
|
|
"""
|
|
self._span = span
|
|
self._lhs = lhs
|
|
rhs = tuple(rhs)
|
|
self._rhs = rhs
|
|
self._dot = dot
|
|
self._comparison_key = (span, lhs, rhs, dot)
|
|
|
|
@staticmethod
|
|
def from_production(production, index):
|
|
"""
|
|
Return a new ``TreeEdge`` formed from the given production.
|
|
The new edge's left-hand side and right-hand side will
|
|
be taken from ``production``; its span will be
|
|
``(index,index)``; and its dot position will be ``0``.
|
|
|
|
:rtype: TreeEdge
|
|
"""
|
|
return TreeEdge(span=(index, index), lhs=production.lhs(),
|
|
rhs=production.rhs(), dot=0)
|
|
|
|
def move_dot_forward(self, new_end):
|
|
"""
|
|
Return a new ``TreeEdge`` formed from this edge.
|
|
The new edge's dot position is increased by ``1``,
|
|
and its end index will be replaced by ``new_end``.
|
|
|
|
:param new_end: The new end index.
|
|
:type new_end: int
|
|
:rtype: TreeEdge
|
|
"""
|
|
return TreeEdge(span=(self._span[0], new_end),
|
|
lhs=self._lhs, rhs=self._rhs,
|
|
dot=self._dot+1)
|
|
|
|
# Accessors
|
|
def lhs(self): return self._lhs
|
|
def span(self): return self._span
|
|
def start(self): return self._span[0]
|
|
def end(self): return self._span[1]
|
|
def length(self): return self._span[1] - self._span[0]
|
|
def rhs(self): return self._rhs
|
|
def dot(self): return self._dot
|
|
def is_complete(self): return self._dot == len(self._rhs)
|
|
def is_incomplete(self): return self._dot != len(self._rhs)
|
|
def nextsym(self):
|
|
if self._dot >= len(self._rhs): return None
|
|
else: return self._rhs[self._dot]
|
|
|
|
# String representation
|
|
def __str__(self):
|
|
str = '[%s:%s] ' % (self._span[0], self._span[1])
|
|
str += '%-2r ->' % (self._lhs,)
|
|
|
|
for i in range(len(self._rhs)):
|
|
if i == self._dot: str += ' *'
|
|
str += ' %s' % unicode_repr(self._rhs[i])
|
|
if len(self._rhs) == self._dot: str += ' *'
|
|
return str
|
|
|
|
def __repr__(self):
|
|
return '[Edge: %s]' % self
|
|
|
|
|
|
@python_2_unicode_compatible
|
|
class LeafEdge(EdgeI):
|
|
"""
|
|
An edge that records the fact that a leaf value is consistent with
|
|
a word in the sentence. A leaf edge consists of:
|
|
|
|
- An index, indicating the position of the word.
|
|
- A leaf, specifying the word's content.
|
|
|
|
A leaf edge's left-hand side is its leaf value, and its right hand
|
|
side is ``()``. Its span is ``[index, index+1]``, and its dot
|
|
position is ``0``.
|
|
"""
|
|
def __init__(self, leaf, index):
|
|
"""
|
|
Construct a new ``LeafEdge``.
|
|
|
|
:param leaf: The new edge's leaf value, specifying the word
|
|
that is recorded by this edge.
|
|
:param index: The new edge's index, specifying the position of
|
|
the word that is recorded by this edge.
|
|
"""
|
|
self._leaf = leaf
|
|
self._index = index
|
|
self._comparison_key = (leaf, index)
|
|
|
|
# Accessors
|
|
def lhs(self): return self._leaf
|
|
def span(self): return (self._index, self._index+1)
|
|
def start(self): return self._index
|
|
def end(self): return self._index+1
|
|
def length(self): return 1
|
|
def rhs(self): return ()
|
|
def dot(self): return 0
|
|
def is_complete(self): return True
|
|
def is_incomplete(self): return False
|
|
def nextsym(self): return None
|
|
|
|
# String representations
|
|
def __str__(self):
|
|
return '[%s:%s] %s' % (self._index, self._index+1, unicode_repr(self._leaf))
|
|
def __repr__(self):
|
|
return '[Edge: %s]' % (self)
|
|
|
|
########################################################################
|
|
## Chart
|
|
########################################################################
|
|
|
|
class Chart(object):
|
|
"""
|
|
A blackboard for hypotheses about the syntactic constituents of a
|
|
sentence. A chart contains a set of edges, and each edge encodes
|
|
a single hypothesis about the structure of some portion of the
|
|
sentence.
|
|
|
|
The ``select`` method can be used to select a specific collection
|
|
of edges. For example ``chart.select(is_complete=True, start=0)``
|
|
yields all complete edges whose start indices are 0. To ensure
|
|
the efficiency of these selection operations, ``Chart`` dynamically
|
|
creates and maintains an index for each set of attributes that
|
|
have been selected on.
|
|
|
|
In order to reconstruct the trees that are represented by an edge,
|
|
the chart associates each edge with a set of child pointer lists.
|
|
A child pointer list is a list of the edges that license an
|
|
edge's right-hand side.
|
|
|
|
:ivar _tokens: The sentence that the chart covers.
|
|
:ivar _num_leaves: The number of tokens.
|
|
:ivar _edges: A list of the edges in the chart
|
|
:ivar _edge_to_cpls: A dictionary mapping each edge to a set
|
|
of child pointer lists that are associated with that edge.
|
|
:ivar _indexes: A dictionary mapping tuples of edge attributes
|
|
to indices, where each index maps the corresponding edge
|
|
attribute values to lists of edges.
|
|
"""
|
|
def __init__(self, tokens):
|
|
"""
|
|
Construct a new chart. The chart is initialized with the
|
|
leaf edges corresponding to the terminal leaves.
|
|
|
|
:type tokens: list
|
|
:param tokens: The sentence that this chart will be used to parse.
|
|
"""
|
|
# Record the sentence token and the sentence length.
|
|
self._tokens = tuple(tokens)
|
|
self._num_leaves = len(self._tokens)
|
|
|
|
# Initialise the chart.
|
|
self.initialize()
|
|
|
|
def initialize(self):
|
|
"""
|
|
Clear the chart.
|
|
"""
|
|
# A list of edges contained in this chart.
|
|
self._edges = []
|
|
|
|
# The set of child pointer lists associated with each edge.
|
|
self._edge_to_cpls = {}
|
|
|
|
# Indexes mapping attribute values to lists of edges
|
|
# (used by select()).
|
|
self._indexes = {}
|
|
|
|
#////////////////////////////////////////////////////////////
|
|
# Sentence Access
|
|
#////////////////////////////////////////////////////////////
|
|
|
|
def num_leaves(self):
|
|
"""
|
|
Return the number of words in this chart's sentence.
|
|
|
|
:rtype: int
|
|
"""
|
|
return self._num_leaves
|
|
|
|
def leaf(self, index):
|
|
"""
|
|
Return the leaf value of the word at the given index.
|
|
|
|
:rtype: str
|
|
"""
|
|
return self._tokens[index]
|
|
|
|
def leaves(self):
|
|
"""
|
|
Return a list of the leaf values of each word in the
|
|
chart's sentence.
|
|
|
|
:rtype: list(str)
|
|
"""
|
|
return self._tokens
|
|
|
|
#////////////////////////////////////////////////////////////
|
|
# Edge access
|
|
#////////////////////////////////////////////////////////////
|
|
|
|
def edges(self):
|
|
"""
|
|
Return a list of all edges in this chart. New edges
|
|
that are added to the chart after the call to edges()
|
|
will *not* be contained in this list.
|
|
|
|
:rtype: list(EdgeI)
|
|
:see: ``iteredges``, ``select``
|
|
"""
|
|
return self._edges[:]
|
|
|
|
def iteredges(self):
|
|
"""
|
|
Return an iterator over the edges in this chart. It is
|
|
not guaranteed that new edges which are added to the
|
|
chart before the iterator is exhausted will also be generated.
|
|
|
|
:rtype: iter(EdgeI)
|
|
:see: ``edges``, ``select``
|
|
"""
|
|
return iter(self._edges)
|
|
|
|
# Iterating over the chart yields its edges.
|
|
__iter__ = iteredges
|
|
|
|
def num_edges(self):
|
|
"""
|
|
Return the number of edges contained in this chart.
|
|
|
|
:rtype: int
|
|
"""
|
|
return len(self._edge_to_cpls)
|
|
|
|
def select(self, **restrictions):
|
|
"""
|
|
Return an iterator over the edges in this chart. Any
|
|
new edges that are added to the chart before the iterator
|
|
is exahusted will also be generated. ``restrictions``
|
|
can be used to restrict the set of edges that will be
|
|
generated.
|
|
|
|
:param span: Only generate edges ``e`` where ``e.span()==span``
|
|
:param start: Only generate edges ``e`` where ``e.start()==start``
|
|
:param end: Only generate edges ``e`` where ``e.end()==end``
|
|
:param length: Only generate edges ``e`` where ``e.length()==length``
|
|
:param lhs: Only generate edges ``e`` where ``e.lhs()==lhs``
|
|
:param rhs: Only generate edges ``e`` where ``e.rhs()==rhs``
|
|
:param nextsym: Only generate edges ``e`` where
|
|
``e.nextsym()==nextsym``
|
|
:param dot: Only generate edges ``e`` where ``e.dot()==dot``
|
|
:param is_complete: Only generate edges ``e`` where
|
|
``e.is_complete()==is_complete``
|
|
:param is_incomplete: Only generate edges ``e`` where
|
|
``e.is_incomplete()==is_incomplete``
|
|
:rtype: iter(EdgeI)
|
|
"""
|
|
# If there are no restrictions, then return all edges.
|
|
if restrictions=={}: return iter(self._edges)
|
|
|
|
# Find the index corresponding to the given restrictions.
|
|
restr_keys = sorted(restrictions.keys())
|
|
restr_keys = tuple(restr_keys)
|
|
|
|
# If it doesn't exist, then create it.
|
|
if restr_keys not in self._indexes:
|
|
self._add_index(restr_keys)
|
|
|
|
vals = tuple(restrictions[key] for key in restr_keys)
|
|
return iter(self._indexes[restr_keys].get(vals, []))
|
|
|
|
def _add_index(self, restr_keys):
|
|
"""
|
|
A helper function for ``select``, which creates a new index for
|
|
a given set of attributes (aka restriction keys).
|
|
"""
|
|
# Make sure it's a valid index.
|
|
for key in restr_keys:
|
|
if not hasattr(EdgeI, key):
|
|
raise ValueError('Bad restriction: %s' % key)
|
|
|
|
# Create the index.
|
|
index = self._indexes[restr_keys] = {}
|
|
|
|
# Add all existing edges to the index.
|
|
for edge in self._edges:
|
|
vals = tuple(getattr(edge, key)() for key in restr_keys)
|
|
index.setdefault(vals, []).append(edge)
|
|
|
|
def _register_with_indexes(self, edge):
|
|
"""
|
|
A helper function for ``insert``, which registers the new
|
|
edge with all existing indexes.
|
|
"""
|
|
for (restr_keys, index) in self._indexes.items():
|
|
vals = tuple(getattr(edge, key)() for key in restr_keys)
|
|
index.setdefault(vals, []).append(edge)
|
|
|
|
#////////////////////////////////////////////////////////////
|
|
# Edge Insertion
|
|
#////////////////////////////////////////////////////////////
|
|
|
|
def insert_with_backpointer(self, new_edge, previous_edge, child_edge):
|
|
"""
|
|
Add a new edge to the chart, using a pointer to the previous edge.
|
|
"""
|
|
cpls = self.child_pointer_lists(previous_edge)
|
|
new_cpls = [cpl+(child_edge,) for cpl in cpls]
|
|
return self.insert(new_edge, *new_cpls)
|
|
|
|
def insert(self, edge, *child_pointer_lists):
|
|
"""
|
|
Add a new edge to the chart, and return True if this operation
|
|
modified the chart. In particular, return true iff the chart
|
|
did not already contain ``edge``, or if it did not already associate
|
|
``child_pointer_lists`` with ``edge``.
|
|
|
|
:type edge: EdgeI
|
|
:param edge: The new edge
|
|
:type child_pointer_lists: sequence of tuple(EdgeI)
|
|
:param child_pointer_lists: A sequence of lists of the edges that
|
|
were used to form this edge. This list is used to reconstruct
|
|
the trees (or partial trees) that are associated with ``edge``.
|
|
:rtype: bool
|
|
"""
|
|
# Is it a new edge?
|
|
if edge not in self._edge_to_cpls:
|
|
# Add it to the list of edges.
|
|
self._append_edge(edge)
|
|
# Register with indexes.
|
|
self._register_with_indexes(edge)
|
|
|
|
# Get the set of child pointer lists for this edge.
|
|
cpls = self._edge_to_cpls.setdefault(edge, OrderedDict())
|
|
chart_was_modified = False
|
|
for child_pointer_list in child_pointer_lists:
|
|
child_pointer_list = tuple(child_pointer_list)
|
|
if child_pointer_list not in cpls:
|
|
# It's a new CPL; register it, and return true.
|
|
cpls[child_pointer_list] = True
|
|
chart_was_modified = True
|
|
return chart_was_modified
|
|
|
|
def _append_edge(self, edge):
|
|
self._edges.append(edge)
|
|
|
|
#////////////////////////////////////////////////////////////
|
|
# Tree extraction & child pointer lists
|
|
#////////////////////////////////////////////////////////////
|
|
|
|
def parses(self, root, tree_class=Tree):
|
|
"""
|
|
Return an iterator of the complete tree structures that span
|
|
the entire chart, and whose root node is ``root``.
|
|
"""
|
|
for edge in self.select(start=0, end=self._num_leaves, lhs=root):
|
|
for tree in self.trees(edge, tree_class=tree_class, complete=True):
|
|
yield tree
|
|
|
|
def trees(self, edge, tree_class=Tree, complete=False):
|
|
"""
|
|
Return an iterator of the tree structures that are associated
|
|
with ``edge``.
|
|
|
|
If ``edge`` is incomplete, then the unexpanded children will be
|
|
encoded as childless subtrees, whose node value is the
|
|
corresponding terminal or nonterminal.
|
|
|
|
:rtype: list(Tree)
|
|
:note: If two trees share a common subtree, then the same
|
|
Tree may be used to encode that subtree in
|
|
both trees. If you need to eliminate this subtree
|
|
sharing, then create a deep copy of each tree.
|
|
"""
|
|
return iter(self._trees(edge, complete, memo={}, tree_class=tree_class))
|
|
|
|
def _trees(self, edge, complete, memo, tree_class):
|
|
"""
|
|
A helper function for ``trees``.
|
|
|
|
:param memo: A dictionary used to record the trees that we've
|
|
generated for each edge, so that when we see an edge more
|
|
than once, we can reuse the same trees.
|
|
"""
|
|
# If we've seen this edge before, then reuse our old answer.
|
|
if edge in memo:
|
|
return memo[edge]
|
|
|
|
# when we're reading trees off the chart, don't use incomplete edges
|
|
if complete and edge.is_incomplete():
|
|
return []
|
|
|
|
# Leaf edges.
|
|
if isinstance(edge, LeafEdge):
|
|
leaf = self._tokens[edge.start()]
|
|
memo[edge] = [leaf]
|
|
return [leaf]
|
|
|
|
# Until we're done computing the trees for edge, set
|
|
# memo[edge] to be empty. This has the effect of filtering
|
|
# out any cyclic trees (i.e., trees that contain themselves as
|
|
# descendants), because if we reach this edge via a cycle,
|
|
# then it will appear that the edge doesn't generate any trees.
|
|
memo[edge] = []
|
|
trees = []
|
|
lhs = edge.lhs().symbol()
|
|
|
|
# Each child pointer list can be used to form trees.
|
|
for cpl in self.child_pointer_lists(edge):
|
|
# Get the set of child choices for each child pointer.
|
|
# child_choices[i] is the set of choices for the tree's
|
|
# ith child.
|
|
child_choices = [self._trees(cp, complete, memo, tree_class)
|
|
for cp in cpl]
|
|
|
|
# For each combination of children, add a tree.
|
|
for children in itertools.product(*child_choices):
|
|
trees.append(tree_class(lhs, children))
|
|
|
|
# If the edge is incomplete, then extend it with "partial trees":
|
|
if edge.is_incomplete():
|
|
unexpanded = [tree_class(elt,[])
|
|
for elt in edge.rhs()[edge.dot():]]
|
|
for tree in trees:
|
|
tree.extend(unexpanded)
|
|
|
|
# Update the memoization dictionary.
|
|
memo[edge] = trees
|
|
|
|
# Return the list of trees.
|
|
return trees
|
|
|
|
def child_pointer_lists(self, edge):
|
|
"""
|
|
Return the set of child pointer lists for the given edge.
|
|
Each child pointer list is a list of edges that have
|
|
been used to form this edge.
|
|
|
|
:rtype: list(list(EdgeI))
|
|
"""
|
|
# Make a copy, in case they modify it.
|
|
return self._edge_to_cpls.get(edge, {}).keys()
|
|
|
|
#////////////////////////////////////////////////////////////
|
|
# Display
|
|
#////////////////////////////////////////////////////////////
|
|
def pretty_format_edge(self, edge, width=None):
|
|
"""
|
|
Return a pretty-printed string representation of a given edge
|
|
in this chart.
|
|
|
|
:rtype: str
|
|
:param width: The number of characters allotted to each
|
|
index in the sentence.
|
|
"""
|
|
if width is None: width = 50 // (self.num_leaves()+1)
|
|
(start, end) = (edge.start(), edge.end())
|
|
|
|
str = '|' + ('.'+' '*(width-1))*start
|
|
|
|
# Zero-width edges are "#" if complete, ">" if incomplete
|
|
if start == end:
|
|
if edge.is_complete(): str += '#'
|
|
else: str += '>'
|
|
|
|
# Spanning complete edges are "[===]"; Other edges are
|
|
# "[---]" if complete, "[--->" if incomplete
|
|
elif edge.is_complete() and edge.span() == (0,self._num_leaves):
|
|
str += '['+('='*width)*(end-start-1) + '='*(width-1)+']'
|
|
elif edge.is_complete():
|
|
str += '['+('-'*width)*(end-start-1) + '-'*(width-1)+']'
|
|
else:
|
|
str += '['+('-'*width)*(end-start-1) + '-'*(width-1)+'>'
|
|
|
|
str += (' '*(width-1)+'.')*(self._num_leaves-end)
|
|
return str + '| %s' % edge
|
|
|
|
def pretty_format_leaves(self, width=None):
|
|
"""
|
|
Return a pretty-printed string representation of this
|
|
chart's leaves. This string can be used as a header
|
|
for calls to ``pretty_format_edge``.
|
|
"""
|
|
if width is None: width = 50 // (self.num_leaves()+1)
|
|
|
|
if self._tokens is not None and width>1:
|
|
header = '|.'
|
|
for tok in self._tokens:
|
|
header += tok[:width-1].center(width-1)+'.'
|
|
header += '|'
|
|
else:
|
|
header = ''
|
|
|
|
return header
|
|
|
|
def pretty_format(self, width=None):
|
|
"""
|
|
Return a pretty-printed string representation of this chart.
|
|
|
|
:param width: The number of characters allotted to each
|
|
index in the sentence.
|
|
:rtype: str
|
|
"""
|
|
if width is None: width = 50 // (self.num_leaves()+1)
|
|
# sort edges: primary key=length, secondary key=start index.
|
|
# (and filter out the token edges)
|
|
edges = sorted([(e.length(), e.start(), e) for e in self])
|
|
edges = [e for (_,_,e) in edges]
|
|
|
|
return (self.pretty_format_leaves(width) + '\n' +
|
|
'\n'.join(self.pretty_format_edge(edge, width) for edge in edges))
|
|
|
|
#////////////////////////////////////////////////////////////
|
|
# Display: Dot (AT&T Graphviz)
|
|
#////////////////////////////////////////////////////////////
|
|
|
|
def dot_digraph(self):
|
|
# Header
|
|
s = 'digraph nltk_chart {\n'
|
|
#s += ' size="5,5";\n'
|
|
s += ' rankdir=LR;\n'
|
|
s += ' node [height=0.1,width=0.1];\n'
|
|
s += ' node [style=filled, color="lightgray"];\n'
|
|
|
|
# Set up the nodes
|
|
for y in range(self.num_edges(), -1, -1):
|
|
if y == 0:
|
|
s += ' node [style=filled, color="black"];\n'
|
|
for x in range(self.num_leaves()+1):
|
|
if y == 0 or (x <= self._edges[y-1].start() or
|
|
x >= self._edges[y-1].end()):
|
|
s += ' %04d.%04d [label=""];\n' % (x,y)
|
|
|
|
# Add a spacer
|
|
s += ' x [style=invis]; x->0000.0000 [style=invis];\n'
|
|
|
|
# Declare ranks.
|
|
for x in range(self.num_leaves()+1):
|
|
s += ' {rank=same;'
|
|
for y in range(self.num_edges()+1):
|
|
if y == 0 or (x <= self._edges[y-1].start() or
|
|
x >= self._edges[y-1].end()):
|
|
s += ' %04d.%04d' % (x,y)
|
|
s += '}\n'
|
|
|
|
# Add the leaves
|
|
s += ' edge [style=invis, weight=100];\n'
|
|
s += ' node [shape=plaintext]\n'
|
|
s += ' 0000.0000'
|
|
for x in range(self.num_leaves()):
|
|
s += '->%s->%04d.0000' % (self.leaf(x), x+1)
|
|
s += ';\n\n'
|
|
|
|
# Add the edges
|
|
s += ' edge [style=solid, weight=1];\n'
|
|
for y, edge in enumerate(self):
|
|
for x in range(edge.start()):
|
|
s += (' %04d.%04d -> %04d.%04d [style="invis"];\n' %
|
|
(x, y+1, x+1, y+1))
|
|
s += (' %04d.%04d -> %04d.%04d [label="%s"];\n' %
|
|
(edge.start(), y+1, edge.end(), y+1, edge))
|
|
for x in range(edge.end(), self.num_leaves()):
|
|
s += (' %04d.%04d -> %04d.%04d [style="invis"];\n' %
|
|
(x, y+1, x+1, y+1))
|
|
s += '}\n'
|
|
return s
|
|
|
|
########################################################################
|
|
## Chart Rules
|
|
########################################################################
|
|
|
|
class ChartRuleI(object):
|
|
"""
|
|
A rule that specifies what new edges are licensed by any given set
|
|
of existing edges. Each chart rule expects a fixed number of
|
|
edges, as indicated by the class variable ``NUM_EDGES``. In
|
|
particular:
|
|
|
|
- A chart rule with ``NUM_EDGES=0`` specifies what new edges are
|
|
licensed, regardless of existing edges.
|
|
- A chart rule with ``NUM_EDGES=1`` specifies what new edges are
|
|
licensed by a single existing edge.
|
|
- A chart rule with ``NUM_EDGES=2`` specifies what new edges are
|
|
licensed by a pair of existing edges.
|
|
|
|
:type NUM_EDGES: int
|
|
:cvar NUM_EDGES: The number of existing edges that this rule uses
|
|
to license new edges. Typically, this number ranges from zero
|
|
to two.
|
|
"""
|
|
def apply(self, chart, grammar, *edges):
|
|
"""
|
|
Return a generator that will add edges licensed by this rule
|
|
and the given edges to the chart, one at a time. Each
|
|
time the generator is resumed, it will either add a new
|
|
edge and yield that edge; or return.
|
|
|
|
:type edges: list(EdgeI)
|
|
:param edges: A set of existing edges. The number of edges
|
|
that should be passed to ``apply()`` is specified by the
|
|
``NUM_EDGES`` class variable.
|
|
:rtype: iter(EdgeI)
|
|
"""
|
|
raise NotImplementedError()
|
|
|
|
def apply_everywhere(self, chart, grammar):
|
|
"""
|
|
Return a generator that will add all edges licensed by
|
|
this rule, given the edges that are currently in the
|
|
chart, one at a time. Each time the generator is resumed,
|
|
it will either add a new edge and yield that edge; or return.
|
|
|
|
:rtype: iter(EdgeI)
|
|
"""
|
|
raise NotImplementedError()
|
|
|
|
|
|
@python_2_unicode_compatible
|
|
class AbstractChartRule(ChartRuleI):
|
|
"""
|
|
An abstract base class for chart rules. ``AbstractChartRule``
|
|
provides:
|
|
|
|
- A default implementation for ``apply``.
|
|
- A default implementation for ``apply_everywhere``,
|
|
(Currently, this implementation assumes that ``NUM_EDGES``<=3.)
|
|
- A default implementation for ``__str__``, which returns a
|
|
name based on the rule's class name.
|
|
"""
|
|
|
|
# Subclasses must define apply.
|
|
def apply(self, chart, grammar, *edges):
|
|
raise NotImplementedError()
|
|
|
|
# Default: loop through the given number of edges, and call
|
|
# self.apply() for each set of edges.
|
|
def apply_everywhere(self, chart, grammar):
|
|
if self.NUM_EDGES == 0:
|
|
for new_edge in self.apply(chart, grammar):
|
|
yield new_edge
|
|
|
|
elif self.NUM_EDGES == 1:
|
|
for e1 in chart:
|
|
for new_edge in self.apply(chart, grammar, e1):
|
|
yield new_edge
|
|
|
|
elif self.NUM_EDGES == 2:
|
|
for e1 in chart:
|
|
for e2 in chart:
|
|
for new_edge in self.apply(chart, grammar, e1, e2):
|
|
yield new_edge
|
|
|
|
elif self.NUM_EDGES == 3:
|
|
for e1 in chart:
|
|
for e2 in chart:
|
|
for e3 in chart:
|
|
for new_edge in self.apply(chart,grammar,e1,e2,e3):
|
|
yield new_edge
|
|
|
|
else:
|
|
raise AssertionError('NUM_EDGES>3 is not currently supported')
|
|
|
|
# Default: return a name based on the class name.
|
|
def __str__(self):
|
|
# Add spaces between InitialCapsWords.
|
|
return re.sub('([a-z])([A-Z])', r'\1 \2', self.__class__.__name__)
|
|
|
|
#////////////////////////////////////////////////////////////
|
|
# Fundamental Rule
|
|
#////////////////////////////////////////////////////////////
|
|
|
|
class FundamentalRule(AbstractChartRule):
|
|
"""
|
|
A rule that joins two adjacent edges to form a single combined
|
|
edge. In particular, this rule specifies that any pair of edges
|
|
|
|
- ``[A -> alpha \* B beta][i:j]``
|
|
- ``[B -> gamma \*][j:k]``
|
|
|
|
licenses the edge:
|
|
|
|
- ``[A -> alpha B * beta][i:j]``
|
|
"""
|
|
NUM_EDGES = 2
|
|
def apply(self, chart, grammar, left_edge, right_edge):
|
|
# Make sure the rule is applicable.
|
|
if not (left_edge.is_incomplete() and
|
|
right_edge.is_complete() and
|
|
left_edge.end() == right_edge.start() and
|
|
left_edge.nextsym() == right_edge.lhs()):
|
|
return
|
|
|
|
# Construct the new edge.
|
|
new_edge = left_edge.move_dot_forward(right_edge.end())
|
|
|
|
# Insert it into the chart.
|
|
if chart.insert_with_backpointer(new_edge, left_edge, right_edge):
|
|
yield new_edge
|
|
|
|
class SingleEdgeFundamentalRule(FundamentalRule):
|
|
"""
|
|
A rule that joins a given edge with adjacent edges in the chart,
|
|
to form combined edges. In particular, this rule specifies that
|
|
either of the edges:
|
|
|
|
- ``[A -> alpha \* B beta][i:j]``
|
|
- ``[B -> gamma \*][j:k]``
|
|
|
|
licenses the edge:
|
|
|
|
- ``[A -> alpha B * beta][i:j]``
|
|
|
|
if the other edge is already in the chart.
|
|
|
|
:note: This is basically ``FundamentalRule``, with one edge left
|
|
unspecified.
|
|
"""
|
|
NUM_EDGES = 1
|
|
|
|
def apply(self, chart, grammar, edge):
|
|
if edge.is_incomplete():
|
|
for new_edge in self._apply_incomplete(chart, grammar, edge):
|
|
yield new_edge
|
|
else:
|
|
for new_edge in self._apply_complete(chart, grammar, edge):
|
|
yield new_edge
|
|
|
|
def _apply_complete(self, chart, grammar, right_edge):
|
|
for left_edge in chart.select(end=right_edge.start(),
|
|
is_complete=False,
|
|
nextsym=right_edge.lhs()):
|
|
new_edge = left_edge.move_dot_forward(right_edge.end())
|
|
if chart.insert_with_backpointer(new_edge, left_edge, right_edge):
|
|
yield new_edge
|
|
|
|
def _apply_incomplete(self, chart, grammar, left_edge):
|
|
for right_edge in chart.select(start=left_edge.end(),
|
|
is_complete=True,
|
|
lhs=left_edge.nextsym()):
|
|
new_edge = left_edge.move_dot_forward(right_edge.end())
|
|
if chart.insert_with_backpointer(new_edge, left_edge, right_edge):
|
|
yield new_edge
|
|
|
|
#////////////////////////////////////////////////////////////
|
|
# Inserting Terminal Leafs
|
|
#////////////////////////////////////////////////////////////
|
|
|
|
class LeafInitRule(AbstractChartRule):
|
|
NUM_EDGES=0
|
|
def apply(self, chart, grammar):
|
|
for index in range(chart.num_leaves()):
|
|
new_edge = LeafEdge(chart.leaf(index), index)
|
|
if chart.insert(new_edge, ()):
|
|
yield new_edge
|
|
|
|
#////////////////////////////////////////////////////////////
|
|
# Top-Down Prediction
|
|
#////////////////////////////////////////////////////////////
|
|
|
|
class TopDownInitRule(AbstractChartRule):
|
|
"""
|
|
A rule licensing edges corresponding to the grammar productions for
|
|
the grammar's start symbol. In particular, this rule specifies that
|
|
``[S -> \* alpha][0:i]`` is licensed for each grammar production
|
|
``S -> alpha``, where ``S`` is the grammar's start symbol.
|
|
"""
|
|
NUM_EDGES = 0
|
|
def apply(self, chart, grammar):
|
|
for prod in grammar.productions(lhs=grammar.start()):
|
|
new_edge = TreeEdge.from_production(prod, 0)
|
|
if chart.insert(new_edge, ()):
|
|
yield new_edge
|
|
|
|
class TopDownPredictRule(AbstractChartRule):
|
|
"""
|
|
A rule licensing edges corresponding to the grammar productions
|
|
for the nonterminal following an incomplete edge's dot. In
|
|
particular, this rule specifies that
|
|
``[A -> alpha \* B beta][i:j]`` licenses the edge
|
|
``[B -> \* gamma][j:j]`` for each grammar production ``B -> gamma``.
|
|
|
|
:note: This rule corresponds to the Predictor Rule in Earley parsing.
|
|
"""
|
|
NUM_EDGES = 1
|
|
def apply(self, chart, grammar, edge):
|
|
if edge.is_complete(): return
|
|
for prod in grammar.productions(lhs=edge.nextsym()):
|
|
new_edge = TreeEdge.from_production(prod, edge.end())
|
|
if chart.insert(new_edge, ()):
|
|
yield new_edge
|
|
|
|
class CachedTopDownPredictRule(TopDownPredictRule):
|
|
"""
|
|
A cached version of ``TopDownPredictRule``. After the first time
|
|
this rule is applied to an edge with a given ``end`` and ``next``,
|
|
it will not generate any more edges for edges with that ``end`` and
|
|
``next``.
|
|
|
|
If ``chart`` or ``grammar`` are changed, then the cache is flushed.
|
|
"""
|
|
def __init__(self):
|
|
TopDownPredictRule.__init__(self)
|
|
self._done = {}
|
|
|
|
def apply(self, chart, grammar, edge):
|
|
if edge.is_complete(): return
|
|
nextsym, index = edge.nextsym(), edge.end()
|
|
if not is_nonterminal(nextsym): return
|
|
|
|
# If we've already applied this rule to an edge with the same
|
|
# next & end, and the chart & grammar have not changed, then
|
|
# just return (no new edges to add).
|
|
done = self._done.get((nextsym, index), (None,None))
|
|
if done[0] is chart and done[1] is grammar: return
|
|
|
|
# Add all the edges indicated by the top down expand rule.
|
|
for prod in grammar.productions(lhs=nextsym):
|
|
# If the left corner in the predicted production is
|
|
# leaf, it must match with the input.
|
|
if prod.rhs():
|
|
first = prod.rhs()[0]
|
|
if is_terminal(first):
|
|
if index >= chart.num_leaves() or first != chart.leaf(index): continue
|
|
|
|
new_edge = TreeEdge.from_production(prod, index)
|
|
if chart.insert(new_edge, ()):
|
|
yield new_edge
|
|
|
|
# Record the fact that we've applied this rule.
|
|
self._done[nextsym, index] = (chart, grammar)
|
|
|
|
#////////////////////////////////////////////////////////////
|
|
# Bottom-Up Prediction
|
|
#////////////////////////////////////////////////////////////
|
|
|
|
class BottomUpPredictRule(AbstractChartRule):
|
|
"""
|
|
A rule licensing any edge corresponding to a production whose
|
|
right-hand side begins with a complete edge's left-hand side. In
|
|
particular, this rule specifies that ``[A -> alpha \*]`` licenses
|
|
the edge ``[B -> \* A beta]`` for each grammar production ``B -> A beta``.
|
|
"""
|
|
NUM_EDGES = 1
|
|
def apply(self, chart, grammar, edge):
|
|
if edge.is_incomplete(): return
|
|
for prod in grammar.productions(rhs=edge.lhs()):
|
|
new_edge = TreeEdge.from_production(prod, edge.start())
|
|
if chart.insert(new_edge, ()):
|
|
yield new_edge
|
|
|
|
class BottomUpPredictCombineRule(BottomUpPredictRule):
|
|
"""
|
|
A rule licensing any edge corresponding to a production whose
|
|
right-hand side begins with a complete edge's left-hand side. In
|
|
particular, this rule specifies that ``[A -> alpha \*]``
|
|
licenses the edge ``[B -> A \* beta]`` for each grammar
|
|
production ``B -> A beta``.
|
|
|
|
:note: This is like ``BottomUpPredictRule``, but it also applies
|
|
the ``FundamentalRule`` to the resulting edge.
|
|
"""
|
|
NUM_EDGES = 1
|
|
def apply(self, chart, grammar, edge):
|
|
if edge.is_incomplete(): return
|
|
for prod in grammar.productions(rhs=edge.lhs()):
|
|
new_edge = TreeEdge(edge.span(), prod.lhs(), prod.rhs(), 1)
|
|
if chart.insert(new_edge, (edge,)):
|
|
yield new_edge
|
|
|
|
class EmptyPredictRule(AbstractChartRule):
|
|
"""
|
|
A rule that inserts all empty productions as passive edges,
|
|
in every position in the chart.
|
|
"""
|
|
NUM_EDGES = 0
|
|
def apply(self, chart, grammar):
|
|
for prod in grammar.productions(empty=True):
|
|
for index in range(chart.num_leaves() + 1):
|
|
new_edge = TreeEdge.from_production(prod, index)
|
|
if chart.insert(new_edge, ()):
|
|
yield new_edge
|
|
|
|
|
|
########################################################################
|
|
## Filtered Bottom Up
|
|
########################################################################
|
|
|
|
class FilteredSingleEdgeFundamentalRule(SingleEdgeFundamentalRule):
|
|
def _apply_complete(self, chart, grammar, right_edge):
|
|
end = right_edge.end()
|
|
nexttoken = end < chart.num_leaves() and chart.leaf(end)
|
|
for left_edge in chart.select(end=right_edge.start(),
|
|
is_complete=False,
|
|
nextsym=right_edge.lhs()):
|
|
if _bottomup_filter(grammar, nexttoken, left_edge.rhs(), left_edge.dot()):
|
|
new_edge = left_edge.move_dot_forward(right_edge.end())
|
|
if chart.insert_with_backpointer(new_edge, left_edge, right_edge):
|
|
yield new_edge
|
|
|
|
def _apply_incomplete(self, chart, grammar, left_edge):
|
|
for right_edge in chart.select(start=left_edge.end(),
|
|
is_complete=True,
|
|
lhs=left_edge.nextsym()):
|
|
end = right_edge.end()
|
|
nexttoken = end < chart.num_leaves() and chart.leaf(end)
|
|
if _bottomup_filter(grammar, nexttoken, left_edge.rhs(), left_edge.dot()):
|
|
new_edge = left_edge.move_dot_forward(right_edge.end())
|
|
if chart.insert_with_backpointer(new_edge, left_edge, right_edge):
|
|
yield new_edge
|
|
|
|
class FilteredBottomUpPredictCombineRule(BottomUpPredictCombineRule):
|
|
def apply(self, chart, grammar, edge):
|
|
if edge.is_incomplete():
|
|
return
|
|
|
|
end = edge.end()
|
|
nexttoken = end < chart.num_leaves() and chart.leaf(end)
|
|
for prod in grammar.productions(rhs=edge.lhs()):
|
|
if _bottomup_filter(grammar, nexttoken, prod.rhs()):
|
|
new_edge = TreeEdge(edge.span(), prod.lhs(), prod.rhs(), 1)
|
|
if chart.insert(new_edge, (edge,)):
|
|
yield new_edge
|
|
|
|
def _bottomup_filter(grammar, nexttoken, rhs, dot=0):
|
|
if len(rhs) <= dot + 1:
|
|
return True
|
|
_next = rhs[dot + 1]
|
|
if is_terminal(_next):
|
|
return nexttoken == _next
|
|
else:
|
|
return grammar.is_leftcorner(_next, nexttoken)
|
|
|
|
|
|
########################################################################
|
|
## Generic Chart Parser
|
|
########################################################################
|
|
|
|
TD_STRATEGY = [LeafInitRule(),
|
|
TopDownInitRule(),
|
|
CachedTopDownPredictRule(),
|
|
SingleEdgeFundamentalRule()]
|
|
BU_STRATEGY = [LeafInitRule(),
|
|
EmptyPredictRule(),
|
|
BottomUpPredictRule(),
|
|
SingleEdgeFundamentalRule()]
|
|
BU_LC_STRATEGY = [LeafInitRule(),
|
|
EmptyPredictRule(),
|
|
BottomUpPredictCombineRule(),
|
|
SingleEdgeFundamentalRule()]
|
|
|
|
LC_STRATEGY = [LeafInitRule(),
|
|
FilteredBottomUpPredictCombineRule(),
|
|
FilteredSingleEdgeFundamentalRule()]
|
|
|
|
class ChartParser(ParserI):
|
|
"""
|
|
A generic chart parser. A "strategy", or list of
|
|
``ChartRuleI`` instances, is used to decide what edges to add to
|
|
the chart. In particular, ``ChartParser`` uses the following
|
|
algorithm to parse texts:
|
|
|
|
| Until no new edges are added:
|
|
| For each *rule* in *strategy*:
|
|
| Apply *rule* to any applicable edges in the chart.
|
|
| Return any complete parses in the chart
|
|
"""
|
|
def __init__(self, grammar, strategy=BU_LC_STRATEGY, trace=0,
|
|
trace_chart_width=50, use_agenda=True, chart_class=Chart):
|
|
"""
|
|
Create a new chart parser, that uses ``grammar`` to parse
|
|
texts.
|
|
|
|
:type grammar: CFG
|
|
:param grammar: The grammar used to parse texts.
|
|
:type strategy: list(ChartRuleI)
|
|
:param strategy: A list of rules that should be used to decide
|
|
what edges to add to the chart (top-down strategy by default).
|
|
:type trace: int
|
|
:param trace: The level of tracing that should be used when
|
|
parsing a text. ``0`` will generate no tracing output;
|
|
and higher numbers will produce more verbose tracing
|
|
output.
|
|
:type trace_chart_width: int
|
|
:param trace_chart_width: The default total width reserved for
|
|
the chart in trace output. The remainder of each line will
|
|
be used to display edges.
|
|
:type use_agenda: bool
|
|
:param use_agenda: Use an optimized agenda-based algorithm,
|
|
if possible.
|
|
:param chart_class: The class that should be used to create
|
|
the parse charts.
|
|
"""
|
|
self._grammar = grammar
|
|
self._strategy = strategy
|
|
self._trace = trace
|
|
self._trace_chart_width = trace_chart_width
|
|
# If the strategy only consists of axioms (NUM_EDGES==0) and
|
|
# inference rules (NUM_EDGES==1), we can use an agenda-based algorithm:
|
|
self._use_agenda = use_agenda
|
|
self._chart_class = chart_class
|
|
|
|
self._axioms = []
|
|
self._inference_rules = []
|
|
for rule in strategy:
|
|
if rule.NUM_EDGES == 0:
|
|
self._axioms.append(rule)
|
|
elif rule.NUM_EDGES == 1:
|
|
self._inference_rules.append(rule)
|
|
else:
|
|
self._use_agenda = False
|
|
|
|
def grammar(self):
|
|
return self._grammar
|
|
|
|
def _trace_new_edges(self, chart, rule, new_edges, trace, edge_width):
|
|
if not trace: return
|
|
print_rule_header = trace > 1
|
|
for edge in new_edges:
|
|
if print_rule_header:
|
|
print('%s:' % rule)
|
|
print_rule_header = False
|
|
print(chart.pretty_format_edge(edge, edge_width))
|
|
|
|
def chart_parse(self, tokens, trace=None):
|
|
"""
|
|
Return the final parse ``Chart`` from which all possible
|
|
parse trees can be extracted.
|
|
|
|
:param tokens: The sentence to be parsed
|
|
:type tokens: list(str)
|
|
:rtype: Chart
|
|
"""
|
|
if trace is None: trace = self._trace
|
|
trace_new_edges = self._trace_new_edges
|
|
|
|
tokens = list(tokens)
|
|
self._grammar.check_coverage(tokens)
|
|
chart = self._chart_class(tokens)
|
|
grammar = self._grammar
|
|
|
|
# Width, for printing trace edges.
|
|
trace_edge_width = self._trace_chart_width // (chart.num_leaves() + 1)
|
|
if trace: print(chart.pretty_format_leaves(trace_edge_width))
|
|
|
|
if self._use_agenda:
|
|
# Use an agenda-based algorithm.
|
|
for axiom in self._axioms:
|
|
new_edges = list(axiom.apply(chart, grammar))
|
|
trace_new_edges(chart, axiom, new_edges, trace, trace_edge_width)
|
|
|
|
inference_rules = self._inference_rules
|
|
agenda = chart.edges()
|
|
# We reverse the initial agenda, since it is a stack
|
|
# but chart.edges() functions as a queue.
|
|
agenda.reverse()
|
|
while agenda:
|
|
edge = agenda.pop()
|
|
for rule in inference_rules:
|
|
new_edges = list(rule.apply(chart, grammar, edge))
|
|
if trace:
|
|
trace_new_edges(chart, rule, new_edges, trace, trace_edge_width)
|
|
agenda += new_edges
|
|
|
|
else:
|
|
# Do not use an agenda-based algorithm.
|
|
edges_added = True
|
|
while edges_added:
|
|
edges_added = False
|
|
for rule in self._strategy:
|
|
new_edges = list(rule.apply_everywhere(chart, grammar))
|
|
edges_added = len(new_edges)
|
|
trace_new_edges(chart, rule, new_edges, trace, trace_edge_width)
|
|
|
|
# Return the final chart.
|
|
return chart
|
|
|
|
def parse(self, tokens, tree_class=Tree):
|
|
chart = self.chart_parse(tokens)
|
|
return iter(chart.parses(self._grammar.start(), tree_class=tree_class))
|
|
|
|
class TopDownChartParser(ChartParser):
|
|
"""
|
|
A ``ChartParser`` using a top-down parsing strategy.
|
|
See ``ChartParser`` for more information.
|
|
"""
|
|
def __init__(self, grammar, **parser_args):
|
|
ChartParser.__init__(self, grammar, TD_STRATEGY, **parser_args)
|
|
|
|
class BottomUpChartParser(ChartParser):
|
|
"""
|
|
A ``ChartParser`` using a bottom-up parsing strategy.
|
|
See ``ChartParser`` for more information.
|
|
"""
|
|
def __init__(self, grammar, **parser_args):
|
|
if isinstance(grammar, PCFG):
|
|
warnings.warn("BottomUpChartParser only works for CFG, "
|
|
"use BottomUpProbabilisticChartParser instead",
|
|
category=DeprecationWarning)
|
|
ChartParser.__init__(self, grammar, BU_STRATEGY, **parser_args)
|
|
|
|
class BottomUpLeftCornerChartParser(ChartParser):
|
|
"""
|
|
A ``ChartParser`` using a bottom-up left-corner parsing strategy.
|
|
This strategy is often more efficient than standard bottom-up.
|
|
See ``ChartParser`` for more information.
|
|
"""
|
|
def __init__(self, grammar, **parser_args):
|
|
ChartParser.__init__(self, grammar, BU_LC_STRATEGY, **parser_args)
|
|
|
|
class LeftCornerChartParser(ChartParser):
|
|
def __init__(self, grammar, **parser_args):
|
|
if not grammar.is_nonempty():
|
|
raise ValueError("LeftCornerParser only works for grammars "
|
|
"without empty productions.")
|
|
ChartParser.__init__(self, grammar, LC_STRATEGY, **parser_args)
|
|
|
|
########################################################################
|
|
## Stepping Chart Parser
|
|
########################################################################
|
|
|
|
class SteppingChartParser(ChartParser):
|
|
"""
|
|
A ``ChartParser`` that allows you to step through the parsing
|
|
process, adding a single edge at a time. It also allows you to
|
|
change the parser's strategy or grammar midway through parsing a
|
|
text.
|
|
|
|
The ``initialize`` method is used to start parsing a text. ``step``
|
|
adds a single edge to the chart. ``set_strategy`` changes the
|
|
strategy used by the chart parser. ``parses`` returns the set of
|
|
parses that has been found by the chart parser.
|
|
|
|
:ivar _restart: Records whether the parser's strategy, grammar,
|
|
or chart has been changed. If so, then ``step`` must restart
|
|
the parsing algorithm.
|
|
"""
|
|
def __init__(self, grammar, strategy=[], trace=0):
|
|
self._chart = None
|
|
self._current_chartrule = None
|
|
self._restart = False
|
|
ChartParser.__init__(self, grammar, strategy, trace)
|
|
|
|
#////////////////////////////////////////////////////////////
|
|
# Initialization
|
|
#////////////////////////////////////////////////////////////
|
|
|
|
def initialize(self, tokens):
|
|
"Begin parsing the given tokens."
|
|
self._chart = Chart(list(tokens))
|
|
self._restart = True
|
|
|
|
#////////////////////////////////////////////////////////////
|
|
# Stepping
|
|
#////////////////////////////////////////////////////////////
|
|
|
|
def step(self):
|
|
"""
|
|
Return a generator that adds edges to the chart, one at a
|
|
time. Each time the generator is resumed, it adds a single
|
|
edge and yields that edge. If no more edges can be added,
|
|
then it yields None.
|
|
|
|
If the parser's strategy, grammar, or chart is changed, then
|
|
the generator will continue adding edges using the new
|
|
strategy, grammar, or chart.
|
|
|
|
Note that this generator never terminates, since the grammar
|
|
or strategy might be changed to values that would add new
|
|
edges. Instead, it yields None when no more edges can be
|
|
added with the current strategy and grammar.
|
|
"""
|
|
if self._chart is None:
|
|
raise ValueError('Parser must be initialized first')
|
|
while True:
|
|
self._restart = False
|
|
w = 50 // (self._chart.num_leaves()+1)
|
|
|
|
for e in self._parse():
|
|
if self._trace > 1: print(self._current_chartrule)
|
|
if self._trace > 0: print(self._chart.pretty_format_edge(e,w))
|
|
yield e
|
|
if self._restart: break
|
|
else:
|
|
yield None # No more edges.
|
|
|
|
def _parse(self):
|
|
"""
|
|
A generator that implements the actual parsing algorithm.
|
|
``step`` iterates through this generator, and restarts it
|
|
whenever the parser's strategy, grammar, or chart is modified.
|
|
"""
|
|
chart = self._chart
|
|
grammar = self._grammar
|
|
edges_added = 1
|
|
while edges_added > 0:
|
|
edges_added = 0
|
|
for rule in self._strategy:
|
|
self._current_chartrule = rule
|
|
for e in rule.apply_everywhere(chart, grammar):
|
|
edges_added += 1
|
|
yield e
|
|
|
|
#////////////////////////////////////////////////////////////
|
|
# Accessors
|
|
#////////////////////////////////////////////////////////////
|
|
|
|
def strategy(self):
|
|
"Return the strategy used by this parser."
|
|
return self._strategy
|
|
|
|
def grammar(self):
|
|
"Return the grammar used by this parser."
|
|
return self._grammar
|
|
|
|
def chart(self):
|
|
"Return the chart that is used by this parser."
|
|
return self._chart
|
|
|
|
def current_chartrule(self):
|
|
"Return the chart rule used to generate the most recent edge."
|
|
return self._current_chartrule
|
|
|
|
def parses(self, tree_class=Tree):
|
|
"Return the parse trees currently contained in the chart."
|
|
return self._chart.parses(self._grammar.start(), tree_class)
|
|
|
|
#////////////////////////////////////////////////////////////
|
|
# Parser modification
|
|
#////////////////////////////////////////////////////////////
|
|
|
|
def set_strategy(self, strategy):
|
|
"""
|
|
Change the strategy that the parser uses to decide which edges
|
|
to add to the chart.
|
|
|
|
:type strategy: list(ChartRuleI)
|
|
:param strategy: A list of rules that should be used to decide
|
|
what edges to add to the chart.
|
|
"""
|
|
if strategy == self._strategy: return
|
|
self._strategy = strategy[:] # Make a copy.
|
|
self._restart = True
|
|
|
|
def set_grammar(self, grammar):
|
|
"Change the grammar used by the parser."
|
|
if grammar is self._grammar: return
|
|
self._grammar = grammar
|
|
self._restart = True
|
|
|
|
def set_chart(self, chart):
|
|
"Load a given chart into the chart parser."
|
|
if chart is self._chart: return
|
|
self._chart = chart
|
|
self._restart = True
|
|
|
|
#////////////////////////////////////////////////////////////
|
|
# Standard parser methods
|
|
#////////////////////////////////////////////////////////////
|
|
|
|
def parse(self, tokens, tree_class=Tree):
|
|
tokens = list(tokens)
|
|
self._grammar.check_coverage(tokens)
|
|
|
|
# Initialize ourselves.
|
|
self.initialize(tokens)
|
|
|
|
# Step until no more edges are generated.
|
|
for e in self.step():
|
|
if e is None: break
|
|
|
|
# Return an iterator of complete parses.
|
|
return self.parses(tree_class=tree_class)
|
|
|
|
########################################################################
|
|
## Demo Code
|
|
########################################################################
|
|
|
|
def demo_grammar():
|
|
from nltk.grammar import CFG
|
|
return CFG.fromstring("""
|
|
S -> NP VP
|
|
PP -> "with" NP
|
|
NP -> NP PP
|
|
VP -> VP PP
|
|
VP -> Verb NP
|
|
VP -> Verb
|
|
NP -> Det Noun
|
|
NP -> "John"
|
|
NP -> "I"
|
|
Det -> "the"
|
|
Det -> "my"
|
|
Det -> "a"
|
|
Noun -> "dog"
|
|
Noun -> "cookie"
|
|
Verb -> "ate"
|
|
Verb -> "saw"
|
|
Prep -> "with"
|
|
Prep -> "under"
|
|
""")
|
|
|
|
def demo(choice=None,
|
|
print_times=True, print_grammar=False,
|
|
print_trees=True, trace=2,
|
|
sent='I saw John with a dog with my cookie', numparses=5):
|
|
"""
|
|
A demonstration of the chart parsers.
|
|
"""
|
|
import sys, time
|
|
from nltk import nonterminals, Production, CFG
|
|
|
|
# The grammar for ChartParser and SteppingChartParser:
|
|
grammar = demo_grammar()
|
|
if print_grammar:
|
|
print("* Grammar")
|
|
print(grammar)
|
|
|
|
# Tokenize the sample sentence.
|
|
print("* Sentence:")
|
|
print(sent)
|
|
tokens = sent.split()
|
|
print(tokens)
|
|
print()
|
|
|
|
# Ask the user which parser to test,
|
|
# if the parser wasn't provided as an argument
|
|
if choice is None:
|
|
print(' 1: Top-down chart parser')
|
|
print(' 2: Bottom-up chart parser')
|
|
print(' 3: Bottom-up left-corner chart parser')
|
|
print(' 4: Left-corner chart parser with bottom-up filter')
|
|
print(' 5: Stepping chart parser (alternating top-down & bottom-up)')
|
|
print(' 6: All parsers')
|
|
print('\nWhich parser (1-6)? ', end=' ')
|
|
choice = sys.stdin.readline().strip()
|
|
print()
|
|
|
|
choice = str(choice)
|
|
if choice not in "123456":
|
|
print('Bad parser number')
|
|
return
|
|
|
|
# Keep track of how long each parser takes.
|
|
times = {}
|
|
|
|
strategies = {'1': ('Top-down', TD_STRATEGY),
|
|
'2': ('Bottom-up', BU_STRATEGY),
|
|
'3': ('Bottom-up left-corner', BU_LC_STRATEGY),
|
|
'4': ('Filtered left-corner', LC_STRATEGY)}
|
|
choices = []
|
|
if choice in strategies: choices = [choice]
|
|
if choice=='6': choices = "1234"
|
|
|
|
# Run the requested chart parser(s), except the stepping parser.
|
|
for strategy in choices:
|
|
print("* Strategy: " + strategies[strategy][0])
|
|
print()
|
|
cp = ChartParser(grammar, strategies[strategy][1], trace=trace)
|
|
t = time.time()
|
|
chart = cp.chart_parse(tokens)
|
|
parses = list(chart.parses(grammar.start()))
|
|
|
|
times[strategies[strategy][0]] = time.time()-t
|
|
print("Nr edges in chart:", len(chart.edges()))
|
|
if numparses:
|
|
assert len(parses)==numparses, 'Not all parses found'
|
|
if print_trees:
|
|
for tree in parses: print(tree)
|
|
else:
|
|
print("Nr trees:", len(parses))
|
|
print()
|
|
|
|
# Run the stepping parser, if requested.
|
|
if choice in "56":
|
|
print("* Strategy: Stepping (top-down vs bottom-up)")
|
|
print()
|
|
t = time.time()
|
|
cp = SteppingChartParser(grammar, trace=trace)
|
|
cp.initialize(tokens)
|
|
for i in range(5):
|
|
print('*** SWITCH TO TOP DOWN')
|
|
cp.set_strategy(TD_STRATEGY)
|
|
for j, e in enumerate(cp.step()):
|
|
if j>20 or e is None: break
|
|
print('*** SWITCH TO BOTTOM UP')
|
|
cp.set_strategy(BU_STRATEGY)
|
|
for j, e in enumerate(cp.step()):
|
|
if j>20 or e is None: break
|
|
times['Stepping'] = time.time()-t
|
|
print("Nr edges in chart:", len(cp.chart().edges()))
|
|
if numparses:
|
|
assert len(list(cp.parses()))==numparses, 'Not all parses found'
|
|
if print_trees:
|
|
for tree in cp.parses(): print(tree)
|
|
else:
|
|
print("Nr trees:", len(list(cp.parses())))
|
|
print()
|
|
|
|
# Print the times of all parsers:
|
|
if not (print_times and times): return
|
|
print("* Parsing times")
|
|
print()
|
|
maxlen = max(len(key) for key in times)
|
|
format = '%' + repr(maxlen) + 's parser: %6.3fsec'
|
|
times_items = times.items()
|
|
for (parser, t) in sorted(times_items, key=lambda a:a[1]):
|
|
print(format % (parser, t))
|
|
|
|
if __name__ == '__main__':
|
|
demo()
|