629 lines
30 KiB
Python
629 lines
30 KiB
Python
|
#!/usr/bin/env python
|
||
|
# -*- coding: utf-8 -*-
|
||
|
#
|
||
|
# Natural Language Toolkit: TGrep search
|
||
|
#
|
||
|
# Copyright (C) 2001-2018 NLTK Project
|
||
|
# Author: Will Roberts <wildwilhelm@gmail.com>
|
||
|
# URL: <http://nltk.org/>
|
||
|
# For license information, see LICENSE.TXT
|
||
|
|
||
|
'''
|
||
|
Unit tests for nltk.tgrep.
|
||
|
'''
|
||
|
|
||
|
from __future__ import absolute_import, print_function, unicode_literals
|
||
|
|
||
|
from six import b
|
||
|
|
||
|
from nltk.tree import ParentedTree
|
||
|
from nltk import tgrep
|
||
|
import unittest
|
||
|
|
||
|
class TestSequenceFunctions(unittest.TestCase):
|
||
|
|
||
|
'''
|
||
|
Class containing unit tests for nltk.tgrep.
|
||
|
'''
|
||
|
|
||
|
def test_tokenize_simple(self):
|
||
|
'''
|
||
|
Simple test of tokenization.
|
||
|
'''
|
||
|
tokens = tgrep.tgrep_tokenize('A .. (B !< C . D) | ![<< (E , F) $ G]')
|
||
|
self.assertEqual(tokens,
|
||
|
['A', '..', '(', 'B', '!', '<', 'C', '.', 'D', ')',
|
||
|
'|', '!', '[', '<<', '(', 'E', ',', 'F', ')', '$',
|
||
|
'G', ']'])
|
||
|
|
||
|
def test_tokenize_encoding(self):
|
||
|
'''
|
||
|
Test that tokenization handles bytes and strs the same way.
|
||
|
'''
|
||
|
self.assertEqual(
|
||
|
tgrep.tgrep_tokenize(b('A .. (B !< C . D) | ![<< (E , F) $ G]')),
|
||
|
tgrep.tgrep_tokenize('A .. (B !< C . D) | ![<< (E , F) $ G]'))
|
||
|
|
||
|
def test_tokenize_link_types(self):
|
||
|
'''
|
||
|
Test tokenization of basic link types.
|
||
|
'''
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A<B'), ['A', '<', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A>B'), ['A', '>', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A<3B'), ['A', '<3', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A>3B'), ['A', '>3', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A<,B'), ['A', '<,', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A>,B'), ['A', '>,', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A<-3B'), ['A', '<-3', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A>-3B'), ['A', '>-3', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A<-B'), ['A', '<-', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A>-B'), ['A', '>-', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A<\'B'), ['A', '<\'', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A>\'B'), ['A', '>\'', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A<:B'), ['A', '<:', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A>:B'), ['A', '>:', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A<<B'), ['A', '<<', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A>>B'), ['A', '>>', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A<<,B'), ['A', '<<,', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A>>,B'), ['A', '>>,', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A<<\'B'), ['A', '<<\'', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A>>\'B'), ['A', '>>\'', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A<<:B'), ['A', '<<:', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A>>:B'), ['A', '>>:', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A.B'), ['A', '.', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A,B'), ['A', ',', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A..B'), ['A', '..', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A,,B'), ['A', ',,', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A$B'), ['A', '$', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A$.B'), ['A', '$.', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A$,B'), ['A', '$,', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A$..B'), ['A', '$..', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A$,,B'), ['A', '$,,', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A!<B'), ['A', '!', '<', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A!>B'), ['A', '!', '>', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A!<3B'), ['A', '!', '<3', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A!>3B'), ['A', '!', '>3', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A!<,B'), ['A', '!', '<,', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A!>,B'), ['A', '!', '>,', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A!<-3B'),
|
||
|
['A', '!', '<-3', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A!>-3B'),
|
||
|
['A', '!', '>-3', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A!<-B'), ['A', '!', '<-', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A!>-B'), ['A', '!', '>-', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A!<\'B'),
|
||
|
['A', '!', '<\'', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A!>\'B'),
|
||
|
['A', '!', '>\'', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A!<:B'), ['A', '!', '<:', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A!>:B'), ['A', '!', '>:', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A!<<B'), ['A', '!', '<<', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A!>>B'), ['A', '!', '>>', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A!<<,B'),
|
||
|
['A', '!', '<<,', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A!>>,B'),
|
||
|
['A', '!', '>>,', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A!<<\'B'),
|
||
|
['A', '!', '<<\'', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A!>>\'B'),
|
||
|
['A', '!', '>>\'', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A!<<:B'),
|
||
|
['A', '!', '<<:', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A!>>:B'),
|
||
|
['A', '!', '>>:', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A!.B'), ['A', '!', '.', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A!,B'), ['A', '!', ',', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A!..B'), ['A', '!', '..', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A!,,B'), ['A', '!', ',,', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A!$B'), ['A', '!', '$', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A!$.B'), ['A', '!', '$.', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A!$,B'), ['A', '!', '$,', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A!$..B'),
|
||
|
['A', '!', '$..', 'B'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A!$,,B'),
|
||
|
['A', '!', '$,,', 'B'])
|
||
|
|
||
|
def test_tokenize_examples(self):
|
||
|
'''
|
||
|
Test tokenization of the TGrep2 manual example patterns.
|
||
|
'''
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('NP < PP'),
|
||
|
['NP', '<', 'PP'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('/^NP/'),
|
||
|
['/^NP/'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('NP << PP . VP'),
|
||
|
['NP', '<<', 'PP', '.', 'VP'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('NP << PP | . VP'),
|
||
|
['NP', '<<', 'PP', '|', '.', 'VP'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('NP !<< PP [> NP | >> VP]'),
|
||
|
['NP', '!', '<<', 'PP', '[', '>', 'NP', '|',
|
||
|
'>>', 'VP', ']'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('NP << (PP . VP)'),
|
||
|
['NP', '<<', '(', 'PP', '.', 'VP', ')'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('NP <\' (PP <, (IN < on))'),
|
||
|
['NP', '<\'', '(', 'PP', '<,', '(', 'IN', '<',
|
||
|
'on', ')', ')'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('S < (A < B) < C'),
|
||
|
['S', '<', '(', 'A', '<', 'B', ')', '<', 'C'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('S < ((A < B) < C)'),
|
||
|
['S', '<', '(', '(', 'A', '<', 'B', ')',
|
||
|
'<', 'C', ')'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('S < (A < B < C)'),
|
||
|
['S', '<', '(', 'A', '<', 'B', '<', 'C', ')'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('A<B&.C'),
|
||
|
['A', '<', 'B', '&', '.', 'C'])
|
||
|
|
||
|
def test_tokenize_quoting(self):
|
||
|
'''
|
||
|
Test tokenization of quoting.
|
||
|
'''
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('"A<<:B"<<:"A $.. B"<"A>3B"<C'),
|
||
|
['"A<<:B"', '<<:', '"A $.. B"', '<', '"A>3B"',
|
||
|
'<', 'C'])
|
||
|
|
||
|
def test_tokenize_nodenames(self):
|
||
|
'''
|
||
|
Test tokenization of node names.
|
||
|
'''
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('Robert'), ['Robert'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('/^[Bb]ob/'), ['/^[Bb]ob/'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('*'), ['*'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('__'), ['__'])
|
||
|
# test tokenization of NLTK tree position syntax
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('N()'),
|
||
|
['N(', ')'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('N(0,)'),
|
||
|
['N(', '0', ',', ')'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('N(0,0)'),
|
||
|
['N(', '0', ',', '0', ')'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize('N(0,0,)'),
|
||
|
['N(', '0', ',', '0', ',', ')'])
|
||
|
|
||
|
def test_tokenize_macros(self):
|
||
|
'''
|
||
|
Test tokenization of macro definitions.
|
||
|
'''
|
||
|
self.assertEqual(tgrep.tgrep_tokenize(
|
||
|
'@ NP /^NP/;\n@ NN /^NN/;\n@NP [!< NP | < @NN] !$.. @NN'),
|
||
|
['@', 'NP', '/^NP/', ';', '@', 'NN', '/^NN/', ';',
|
||
|
'@NP', '[', '!', '<', 'NP', '|', '<', '@NN', ']',
|
||
|
'!', '$..', '@NN'])
|
||
|
|
||
|
def test_node_simple(self):
|
||
|
'''
|
||
|
Test a simple use of tgrep for finding nodes matching a given
|
||
|
pattern.
|
||
|
'''
|
||
|
tree = ParentedTree.fromstring(
|
||
|
'(S (NP (DT the) (JJ big) (NN dog)) '
|
||
|
'(VP bit) (NP (DT a) (NN cat)))')
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('NN', [tree])),
|
||
|
[[(0,2), (2,1)]])
|
||
|
self.assertEqual(list(tgrep.tgrep_nodes('NN', [tree])),
|
||
|
[[tree[0,2], tree[2,1]]])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('NN|JJ', [tree])),
|
||
|
[[(0, 1), (0, 2), (2, 1)]])
|
||
|
|
||
|
def test_node_printing(self):
|
||
|
'''Test that the tgrep print operator ' is properly ignored.'''
|
||
|
tree = ParentedTree.fromstring('(S (n x) (N x))')
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('N', [tree])),
|
||
|
list(tgrep.tgrep_positions('\'N', [tree])))
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('/[Nn]/', [tree])),
|
||
|
list(tgrep.tgrep_positions('\'/[Nn]/', [tree])))
|
||
|
|
||
|
def test_node_encoding(self):
|
||
|
'''
|
||
|
Test that tgrep search strings handles bytes and strs the same
|
||
|
way.
|
||
|
'''
|
||
|
tree = ParentedTree.fromstring(
|
||
|
'(S (NP (DT the) (JJ big) (NN dog)) '
|
||
|
'(VP bit) (NP (DT a) (NN cat)))')
|
||
|
self.assertEqual(list(tgrep.tgrep_positions(b('NN'), [tree])),
|
||
|
list(tgrep.tgrep_positions('NN', [tree])))
|
||
|
self.assertEqual(list(tgrep.tgrep_nodes(b('NN'), [tree])),
|
||
|
list(tgrep.tgrep_nodes('NN', [tree])))
|
||
|
self.assertEqual(list(tgrep.tgrep_positions(b('NN|JJ'), [tree])),
|
||
|
list(tgrep.tgrep_positions('NN|JJ', [tree])))
|
||
|
|
||
|
def test_node_nocase(self):
|
||
|
'''
|
||
|
Test selecting nodes using case insensitive node names.
|
||
|
'''
|
||
|
tree = ParentedTree.fromstring('(S (n x) (N x))')
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('"N"', [tree])), [[(1,)]])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('i@"N"', [tree])), [[(0,), (1,)]])
|
||
|
|
||
|
def test_node_quoted(self):
|
||
|
'''
|
||
|
Test selecting nodes using quoted node names.
|
||
|
'''
|
||
|
tree = ParentedTree.fromstring('(N ("N" x) (N" x) ("\\" x))')
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('"N"', [tree])), [[()]])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('"\\"N\\""', [tree])), [[(0,)]])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('"N\\""', [tree])), [[(1,)]])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('"\\"\\\\\\""', [tree])), [[(2,)]])
|
||
|
|
||
|
def test_node_regex(self):
|
||
|
'''
|
||
|
Test regex matching on nodes.
|
||
|
'''
|
||
|
tree = ParentedTree.fromstring('(S (NP-SBJ x) (NP x) (NNP x) (VP x))')
|
||
|
# This is a regular expression that matches any node whose
|
||
|
# name starts with NP, including NP-SBJ:
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('/^NP/', [tree])),
|
||
|
[[(0,), (1,)]])
|
||
|
|
||
|
def test_node_regex_2(self):
|
||
|
'''
|
||
|
Test regex matching on nodes.
|
||
|
'''
|
||
|
tree = ParentedTree.fromstring('(S (SBJ x) (SBJ1 x) (NP-SBJ x))')
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('/^SBJ/', [tree])),
|
||
|
[[(0,), (1,)]])
|
||
|
# This is a regular expression that matches any node whose
|
||
|
# name includes SBJ, including NP-SBJ:
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('/SBJ/', [tree])),
|
||
|
[[(0,), (1,), (2,)]])
|
||
|
|
||
|
def test_node_tree_position(self):
|
||
|
'''
|
||
|
Test matching on nodes based on NLTK tree position.
|
||
|
'''
|
||
|
tree = ParentedTree.fromstring('(S (NP-SBJ x) (NP x) (NNP x) (VP x))')
|
||
|
# test all tree positions that are not leaves
|
||
|
leaf_positions = set([tree.leaf_treeposition(x)
|
||
|
for x in range(len(tree.leaves()))])
|
||
|
tree_positions = [x for x in tree.treepositions()
|
||
|
if x not in leaf_positions]
|
||
|
for position in tree_positions:
|
||
|
node_id = 'N{0}'.format(position)
|
||
|
tgrep_positions = list(tgrep.tgrep_positions(node_id, [tree]))
|
||
|
self.assertEqual(len(tgrep_positions[0]), 1)
|
||
|
self.assertEqual(tgrep_positions[0][0], position)
|
||
|
|
||
|
def test_node_noleaves(self):
|
||
|
'''
|
||
|
Test node name matching with the search_leaves flag set to False.
|
||
|
'''
|
||
|
tree = ParentedTree.fromstring('(S (A (T x)) (B (N x)))')
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('x', [tree])),
|
||
|
[[(0, 0, 0), (1, 0, 0)]])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('x', [tree], False)),
|
||
|
[[]])
|
||
|
|
||
|
def tests_rel_dominance(self):
|
||
|
'''
|
||
|
Test matching nodes based on dominance relations.
|
||
|
'''
|
||
|
tree = ParentedTree.fromstring('(S (A (T x)) (B (N x)))')
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('* < T', [tree])),
|
||
|
[[(0,)]])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('* < T > S', [tree])),
|
||
|
[[(0,)]])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('* !< T', [tree])),
|
||
|
[[(), (0, 0), (0, 0, 0), (1,), (1, 0), (1, 0, 0)]])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('* !< T > S', [tree])),
|
||
|
[[(1,)]])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('* > A', [tree])),
|
||
|
[[(0, 0)]])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('* > B', [tree])),
|
||
|
[[(1, 0)]])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('* !> B', [tree])),
|
||
|
[[(), (0,), (0, 0), (0, 0, 0), (1,), (1, 0, 0)]])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('* !> B >> S', [tree])),
|
||
|
[[(0,), (0, 0), (1,)]])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('* >> S', [tree])),
|
||
|
[[(0,), (0, 0), (1,), (1, 0)]])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('* >>, S', [tree])),
|
||
|
[[(0,), (0, 0)]])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('* >>\' S', [tree])),
|
||
|
[[(1,), (1, 0)]])
|
||
|
# Known issue:
|
||
|
#self.assertEqual(list(tgrep.tgrep_positions('* !>> S', [tree])),
|
||
|
# [[()]])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('* << T', [tree])),
|
||
|
[[(), (0,)]])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('* <<\' T', [tree])),
|
||
|
[[(0,)]])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('* <<1 N', [tree])),
|
||
|
[[(1,)]])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('* !<< T', [tree])),
|
||
|
[[(0, 0), (0, 0, 0), (1,), (1, 0), (1, 0, 0)]])
|
||
|
tree = ParentedTree.fromstring('(S (A (T x)) (B (T x) (N x )))')
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('* <: T', [tree])),
|
||
|
[[(0,)]])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('* < T', [tree])),
|
||
|
[[(0,), (1,)]])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('* !<: T', [tree])),
|
||
|
[[(), (0, 0), (0, 0, 0), (1,), (1, 0), (1, 0, 0),
|
||
|
(1, 1), (1, 1, 0)]])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('* !<: T > S', [tree])),
|
||
|
[[(1,)]])
|
||
|
tree = ParentedTree.fromstring('(S (T (A x) (B x)) (T (C x)))')
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('* >: T', [tree])),
|
||
|
[[(1, 0)]])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('* !>: T', [tree])),
|
||
|
[[(), (0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0),
|
||
|
(1,), (1, 0, 0)]])
|
||
|
tree = ParentedTree.fromstring('(S (A (B (C (D (E (T x))))))'
|
||
|
' (A (B (C (D (E (T x))) (N x)))))')
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('* <<: T', [tree])),
|
||
|
[[(0,), (0, 0), (0, 0, 0), (0, 0, 0, 0),
|
||
|
(0, 0, 0, 0, 0), (1, 0, 0, 0), (1, 0, 0, 0, 0)]])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('* >>: A', [tree])),
|
||
|
[[(0, 0), (0, 0, 0), (0, 0, 0, 0), (0, 0, 0, 0, 0),
|
||
|
(0, 0, 0, 0, 0, 0), (1, 0), (1, 0, 0)]])
|
||
|
|
||
|
def test_bad_operator(self):
|
||
|
'''
|
||
|
Test error handling of undefined tgrep operators.
|
||
|
'''
|
||
|
tree = ParentedTree.fromstring('(S (A (T x)) (B (N x)))')
|
||
|
self.assertRaises(
|
||
|
tgrep.TgrepException,
|
||
|
list,
|
||
|
tgrep.tgrep_positions('* >>> S', [tree]))
|
||
|
|
||
|
def test_comments(self):
|
||
|
'''
|
||
|
Test that comments are correctly filtered out of tgrep search
|
||
|
strings.
|
||
|
'''
|
||
|
tree = ParentedTree.fromstring('(S (NN x) (NP x) (NN x))')
|
||
|
search1 = '''
|
||
|
@ NP /^NP/;
|
||
|
@ NN /^NN/;
|
||
|
@NN
|
||
|
'''
|
||
|
self.assertEqual(list(tgrep.tgrep_positions(search1, [tree])),
|
||
|
[[(0,), (2,)]])
|
||
|
search2 = '''
|
||
|
# macros
|
||
|
@ NP /^NP/;
|
||
|
@ NN /^NN/;
|
||
|
|
||
|
# search string
|
||
|
@NN
|
||
|
'''
|
||
|
self.assertEqual(list(tgrep.tgrep_positions(search2, [tree])),
|
||
|
[[(0,), (2,)]])
|
||
|
|
||
|
def test_rel_sister_nodes(self):
|
||
|
'''
|
||
|
Test matching sister nodes in a tree.
|
||
|
'''
|
||
|
tree = ParentedTree.fromstring('(S (A x) (B x) (C x))')
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('* $. B', [tree])), [[(0,)]])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('* $.. B', [tree])), [[(0,)]])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('* $, B', [tree])), [[(2,)]])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('* $,, B', [tree])), [[(2,)]])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('* $ B', [tree])), [[(0,), (2,)]])
|
||
|
|
||
|
def tests_rel_indexed_children(self):
|
||
|
'''
|
||
|
Test matching nodes based on their index in their parent node.
|
||
|
'''
|
||
|
tree = ParentedTree.fromstring('(S (A x) (B x) (C x))')
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('* >, S', [tree])), [[(0,)]])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('* >1 S', [tree])), [[(0,)]])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('* >2 S', [tree])), [[(1,)]])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('* >3 S', [tree])), [[(2,)]])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('* >\' S', [tree])), [[(2,)]])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('* >-1 S', [tree])), [[(2,)]])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('* >-2 S', [tree])), [[(1,)]])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('* >-3 S', [tree])), [[(0,)]])
|
||
|
tree = ParentedTree.fromstring(
|
||
|
'(S (D (A x) (B x) (C x)) (E (B x) (C x) (A x)) '
|
||
|
'(F (C x) (A x) (B x)))')
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('* <, A', [tree])), [[(0,)]])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('* <1 A', [tree])), [[(0,)]])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('* <2 A', [tree])), [[(2,)]])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('* <3 A', [tree])), [[(1,)]])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('* <\' A', [tree])), [[(1,)]])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('* <-1 A', [tree])), [[(1,)]])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('* <-2 A', [tree])), [[(2,)]])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('* <-3 A', [tree])), [[(0,)]])
|
||
|
|
||
|
def test_rel_precedence(self):
|
||
|
'''
|
||
|
Test matching nodes based on precedence relations.
|
||
|
'''
|
||
|
tree = ParentedTree.fromstring('(S (NP (NP (PP x)) (NP (AP x)))'
|
||
|
' (VP (AP (X (PP x)) (Y (AP x))))'
|
||
|
' (NP (RC (NP (AP x)))))')
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('* . X', [tree])),
|
||
|
[[(0,), (0, 1), (0, 1, 0)]])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('* . Y', [tree])),
|
||
|
[[(1, 0, 0), (1, 0, 0, 0)]])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('* .. X', [tree])),
|
||
|
[[(0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0)]])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('* .. Y', [tree])),
|
||
|
[[(0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0),
|
||
|
(1, 0, 0), (1, 0, 0, 0)]])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('* , X', [tree])),
|
||
|
[[(1, 0, 1), (1, 0, 1, 0)]])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('* , Y', [tree])),
|
||
|
[[(2,), (2, 0), (2, 0, 0), (2, 0, 0, 0)]])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('* ,, X', [tree])),
|
||
|
[[(1, 0, 1), (1, 0, 1, 0), (2,), (2, 0), (2, 0, 0),
|
||
|
(2, 0, 0, 0)]])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('* ,, Y', [tree])),
|
||
|
[[(2,), (2, 0), (2, 0, 0), (2, 0, 0, 0)]])
|
||
|
|
||
|
def test_examples(self):
|
||
|
'''
|
||
|
Test the Basic Examples from the TGrep2 manual.
|
||
|
'''
|
||
|
tree = ParentedTree.fromstring('(S (NP (AP x)) (NP (PP x)))')
|
||
|
# This matches any NP node that immediately dominates a PP:
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('NP < PP', [tree])),
|
||
|
[[(1,)]])
|
||
|
|
||
|
tree = ParentedTree.fromstring('(S (NP x) (VP x) (NP (PP x)) (VP x))')
|
||
|
# This matches an NP that dominates a PP and is immediately
|
||
|
# followed by a VP:
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('NP << PP . VP', [tree])),
|
||
|
[[(2,)]])
|
||
|
|
||
|
tree = ParentedTree.fromstring('(S (NP (AP x)) (NP (PP x)) '
|
||
|
'(NP (DET x) (NN x)) (VP x))')
|
||
|
# This matches an NP that dominates a PP or is immediately
|
||
|
# followed by a VP:
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('NP << PP | . VP', [tree])),
|
||
|
[[(1,), (2,)]])
|
||
|
|
||
|
tree = ParentedTree.fromstring('(S (NP (NP (PP x)) (NP (AP x)))'
|
||
|
' (VP (AP (NP (PP x)) (NP (AP x))))'
|
||
|
' (NP (RC (NP (AP x)))))')
|
||
|
# This matches an NP that does not dominate a PP. Also, the NP
|
||
|
# must either have a parent that is an NP or be dominated by a
|
||
|
# VP:
|
||
|
self.assertEqual(list(tgrep.tgrep_positions(
|
||
|
'NP !<< PP [> NP | >> VP]', [tree])),
|
||
|
[[(0, 1), (1, 0, 1)]])
|
||
|
|
||
|
tree = ParentedTree.fromstring('(S (NP (AP (PP x) (VP x))) '
|
||
|
'(NP (AP (PP x) (NP x))) (NP x))')
|
||
|
# This matches an NP that dominates a PP which itself is
|
||
|
# immediately followed by a VP. Note the use of parentheses to
|
||
|
# group ". VP" with the PP rather than with the NP:
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('NP << (PP . VP)', [tree])),
|
||
|
[[(0,)]])
|
||
|
|
||
|
tree = ParentedTree.fromstring(
|
||
|
'(S (NP (DET a) (NN cat) (PP (IN on) (NP x)))'
|
||
|
' (NP (DET a) (NN cat) (PP (IN on) (NP x)) (PP x))'
|
||
|
' (NP x))')
|
||
|
# This matches an NP whose last child is a PP that begins with
|
||
|
# the preposition "on":
|
||
|
self.assertEqual(list(tgrep.tgrep_positions(
|
||
|
'NP <\' (PP <, (IN < on))', [tree])),
|
||
|
[[(0,)]])
|
||
|
|
||
|
tree = ParentedTree.fromstring(
|
||
|
'(S (S (C x) (A (B x))) (S (C x) (A x)) '
|
||
|
'(S (D x) (A (B x))))')
|
||
|
# The following pattern matches an S which has a child A and
|
||
|
# another child that is a C and that the A has a child B:
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('S < (A < B) < C', [tree])),
|
||
|
[[(0,)]])
|
||
|
|
||
|
tree = ParentedTree.fromstring(
|
||
|
'(S (S (A (B x) (C x))) (S (S (C x) (A (B x)))))')
|
||
|
# However, this pattern means that S has child A and that A
|
||
|
# has children B and C:
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('S < ((A < B) < C)', [tree])),
|
||
|
[[(0,)]])
|
||
|
|
||
|
# It is equivalent to this:
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('S < (A < B < C)', [tree])),
|
||
|
[[(0,)]])
|
||
|
|
||
|
def test_use_macros(self):
|
||
|
'''
|
||
|
Test defining and using tgrep2 macros.
|
||
|
'''
|
||
|
tree = ParentedTree.fromstring(
|
||
|
'(VP (VB sold) (NP (DET the) '
|
||
|
'(NN heiress)) (NP (NN deed) (PREP to) '
|
||
|
'(NP (DET the) (NN school) (NN house))))')
|
||
|
self.assertEqual(list(tgrep.tgrep_positions(
|
||
|
'@ NP /^NP/;\n@ NN /^NN/;\n@NP !< @NP !$.. @NN',
|
||
|
[tree])),
|
||
|
[[(1,), (2, 2)]])
|
||
|
# use undefined macro @CNP
|
||
|
self.assertRaises(
|
||
|
tgrep.TgrepException,
|
||
|
list,
|
||
|
tgrep.tgrep_positions(
|
||
|
'@ NP /^NP/;\n@ NN /^NN/;\n@CNP !< @NP !$.. @NN', [tree]))
|
||
|
|
||
|
def test_tokenize_node_labels(self):
|
||
|
'''Test tokenization of labeled nodes.'''
|
||
|
self.assertEqual(tgrep.tgrep_tokenize(
|
||
|
'S < @SBJ < (@VP < (@VB $.. @OBJ))'),
|
||
|
['S', '<', '@SBJ', '<', '(', '@VP', '<', '(',
|
||
|
'@VB', '$..', '@OBJ', ')', ')'])
|
||
|
self.assertEqual(tgrep.tgrep_tokenize(
|
||
|
'S < @SBJ=s < (@VP=v < (@VB $.. @OBJ))'),
|
||
|
['S', '<', '@SBJ', '=', 's', '<', '(', '@VP',
|
||
|
'=', 'v', '<', '(', '@VB', '$..', '@OBJ', ')',
|
||
|
')'])
|
||
|
|
||
|
def test_tokenize_segmented_patterns(self):
|
||
|
'''Test tokenization of segmented patterns.'''
|
||
|
self.assertEqual(tgrep.tgrep_tokenize(
|
||
|
'S < @SBJ=s < (@VP=v < (@VB $.. @OBJ)) : =s .. =v'),
|
||
|
['S', '<', '@SBJ', '=', 's', '<', '(', '@VP',
|
||
|
'=', 'v', '<', '(', '@VB', '$..', '@OBJ', ')',
|
||
|
')', ':', '=s', '..', '=v'])
|
||
|
|
||
|
def test_labeled_nodes(self):
|
||
|
'''
|
||
|
Test labeled nodes.
|
||
|
|
||
|
Test case from Emily M. Bender.
|
||
|
'''
|
||
|
search = '''
|
||
|
# macros
|
||
|
@ SBJ /SBJ/;
|
||
|
@ VP /VP/;
|
||
|
@ VB /VB/;
|
||
|
@ VPoB /V[PB]/;
|
||
|
@ OBJ /OBJ/;
|
||
|
|
||
|
# 1 svo
|
||
|
S < @SBJ=s < (@VP=v < (@VB $.. @OBJ)) : =s .. =v'''
|
||
|
sent1 = ParentedTree.fromstring(
|
||
|
'(S (NP-SBJ I) (VP (VB eat) (NP-OBJ (NNS apples))))')
|
||
|
sent2 = ParentedTree.fromstring(
|
||
|
'(S (VP (VB eat) (NP-OBJ (NNS apples))) (NP-SBJ I))')
|
||
|
search_firsthalf = (search.split('\n\n')[0] +
|
||
|
'S < @SBJ < (@VP < (@VB $.. @OBJ))')
|
||
|
search_rewrite = 'S < (/.*SBJ/ $.. (/VP/ < (/VB/ $.. /.*OBJ/)))'
|
||
|
|
||
|
self.assertTrue(list(tgrep.tgrep_positions(search_firsthalf, [sent1]))[0])
|
||
|
self.assertTrue(list(tgrep.tgrep_positions(search, [sent1]))[0])
|
||
|
self.assertTrue(list(tgrep.tgrep_positions(search_rewrite, [sent1]))[0])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions(search, [sent1])),
|
||
|
list(tgrep.tgrep_positions(search_rewrite, [sent1])))
|
||
|
self.assertTrue(list(tgrep.tgrep_positions(search_firsthalf, [sent2]))[0])
|
||
|
self.assertFalse(list(tgrep.tgrep_positions(search, [sent2]))[0])
|
||
|
self.assertFalse(list(tgrep.tgrep_positions(search_rewrite, [sent2]))[0])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions(search, [sent2])),
|
||
|
list(tgrep.tgrep_positions(search_rewrite, [sent2])))
|
||
|
|
||
|
def test_multiple_conjs(self):
|
||
|
'''
|
||
|
Test that multiple (3 or more) conjunctions of node relations are
|
||
|
handled properly.
|
||
|
'''
|
||
|
sent = ParentedTree.fromstring(
|
||
|
'((A (B b) (C c)) (A (B b) (C c) (D d)))')
|
||
|
# search = '(A < B < C < D)'
|
||
|
# search_tworels = '(A < B < C)'
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('(A < B < C < D)', [sent])),
|
||
|
[[(1,)]])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('(A < B < C)', [sent])),
|
||
|
[[(0,), (1,)]])
|
||
|
|
||
|
def test_trailing_semicolon(self):
|
||
|
'''
|
||
|
Test that semicolons at the end of a tgrep2 search string won't
|
||
|
cause a parse failure.
|
||
|
'''
|
||
|
tree = ParentedTree.fromstring(
|
||
|
'(S (NP (DT the) (JJ big) (NN dog)) '
|
||
|
'(VP bit) (NP (DT a) (NN cat)))')
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('NN', [tree])),
|
||
|
[[(0,2), (2,1)]])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('NN;', [tree])),
|
||
|
[[(0,2), (2,1)]])
|
||
|
self.assertEqual(list(tgrep.tgrep_positions('NN;;', [tree])),
|
||
|
[[(0,2), (2,1)]])
|
||
|
|
||
|
if __name__ == '__main__':
|
||
|
unittest.main()
|