241 lines
7.3 KiB
Text
Executable file
241 lines
7.3 KiB
Text
Executable file
.. Copyright (C) 2001-2018 NLTK Project
|
|
.. For license information, see LICENSE.TXT
|
|
|
|
===================
|
|
Dependency Grammars
|
|
===================
|
|
|
|
>>> from nltk.grammar import DependencyGrammar
|
|
>>> from nltk.parse import (
|
|
... DependencyGraph,
|
|
... ProjectiveDependencyParser,
|
|
... NonprojectiveDependencyParser,
|
|
... )
|
|
|
|
CoNLL Data
|
|
----------
|
|
|
|
>>> treebank_data = """Pierre NNP 2 NMOD
|
|
... Vinken NNP 8 SUB
|
|
... , , 2 P
|
|
... 61 CD 5 NMOD
|
|
... years NNS 6 AMOD
|
|
... old JJ 2 NMOD
|
|
... , , 2 P
|
|
... will MD 0 ROOT
|
|
... join VB 8 VC
|
|
... the DT 11 NMOD
|
|
... board NN 9 OBJ
|
|
... as IN 9 VMOD
|
|
... a DT 15 NMOD
|
|
... nonexecutive JJ 15 NMOD
|
|
... director NN 12 PMOD
|
|
... Nov. NNP 9 VMOD
|
|
... 29 CD 16 NMOD
|
|
... . . 9 VMOD
|
|
... """
|
|
|
|
>>> dg = DependencyGraph(treebank_data)
|
|
>>> dg.tree().pprint()
|
|
(will
|
|
(Vinken Pierre , (old (years 61)) ,)
|
|
(join (board the) (as (director a nonexecutive)) (Nov. 29) .))
|
|
>>> for head, rel, dep in dg.triples():
|
|
... print(
|
|
... '({h[0]}, {h[1]}), {r}, ({d[0]}, {d[1]})'
|
|
... .format(h=head, r=rel, d=dep)
|
|
... )
|
|
(will, MD), SUB, (Vinken, NNP)
|
|
(Vinken, NNP), NMOD, (Pierre, NNP)
|
|
(Vinken, NNP), P, (,, ,)
|
|
(Vinken, NNP), NMOD, (old, JJ)
|
|
(old, JJ), AMOD, (years, NNS)
|
|
(years, NNS), NMOD, (61, CD)
|
|
(Vinken, NNP), P, (,, ,)
|
|
(will, MD), VC, (join, VB)
|
|
(join, VB), OBJ, (board, NN)
|
|
(board, NN), NMOD, (the, DT)
|
|
(join, VB), VMOD, (as, IN)
|
|
(as, IN), PMOD, (director, NN)
|
|
(director, NN), NMOD, (a, DT)
|
|
(director, NN), NMOD, (nonexecutive, JJ)
|
|
(join, VB), VMOD, (Nov., NNP)
|
|
(Nov., NNP), NMOD, (29, CD)
|
|
(join, VB), VMOD, (., .)
|
|
|
|
Using a custom cell extractor.
|
|
|
|
>>> def custom_extractor(cells):
|
|
... _, tag, head, rel = cells
|
|
... return 'spam', 'spam', tag, tag, '', head, rel
|
|
>>> dg = DependencyGraph(treebank_data, cell_extractor=custom_extractor)
|
|
>>> dg.tree().pprint()
|
|
(spam
|
|
(spam spam spam (spam (spam spam)) spam)
|
|
(spam (spam spam) (spam (spam spam spam)) (spam spam) spam))
|
|
|
|
A custom cell extractor can also accept the token index as a second argument and return it as the first element of its result.
|
|
|
|
>>> def custom_extractor(cells, index):
|
|
... word, tag, head, rel = cells
|
|
... return (index, '{}-{}'.format(word, index), word,
|
|
... tag, tag, '', head, rel)
|
|
>>> dg = DependencyGraph(treebank_data, cell_extractor=custom_extractor)
|
|
>>> dg.tree().pprint()
|
|
(will-8
|
|
(Vinken-2 Pierre-1 ,-3 (old-6 (years-5 61-4)) ,-7)
|
|
(join-9
|
|
(board-11 the-10)
|
|
(as-12 (director-15 a-13 nonexecutive-14))
|
|
(Nov.-16 29-17)
|
|
.-18))
|
|
|
|
Using the dependency-parsed version of the Penn Treebank corpus sample.
|
|
|
|
>>> from nltk.corpus import dependency_treebank
|
|
>>> t = dependency_treebank.parsed_sents()[0]
|
|
>>> print(t.to_conll(3)) # doctest: +NORMALIZE_WHITESPACE
|
|
Pierre NNP 2
|
|
Vinken NNP 8
|
|
, , 2
|
|
61 CD 5
|
|
years NNS 6
|
|
old JJ 2
|
|
, , 2
|
|
will MD 0
|
|
join VB 8
|
|
the DT 11
|
|
board NN 9
|
|
as IN 9
|
|
a DT 15
|
|
nonexecutive JJ 15
|
|
director NN 12
|
|
Nov. NNP 9
|
|
29 CD 16
|
|
. . 8
|
|
|
|
Using the output of zpar (like Malt-TAB, but with zero-based indexing, so the head of the root is -1):
|
|
|
|
>>> zpar_data = """
|
|
... Pierre NNP 1 NMOD
|
|
... Vinken NNP 7 SUB
|
|
... , , 1 P
|
|
... 61 CD 4 NMOD
|
|
... years NNS 5 AMOD
|
|
... old JJ 1 NMOD
|
|
... , , 1 P
|
|
... will MD -1 ROOT
|
|
... join VB 7 VC
|
|
... the DT 10 NMOD
|
|
... board NN 8 OBJ
|
|
... as IN 8 VMOD
|
|
... a DT 14 NMOD
|
|
... nonexecutive JJ 14 NMOD
|
|
... director NN 11 PMOD
|
|
... Nov. NNP 8 VMOD
|
|
... 29 CD 15 NMOD
|
|
... . . 7 P
|
|
... """
|
|
|
|
>>> zdg = DependencyGraph(zpar_data, zero_based=True)
|
|
>>> print(zdg.tree())
|
|
(will
|
|
(Vinken Pierre , (old (years 61)) ,)
|
|
(join (board the) (as (director a nonexecutive)) (Nov. 29))
|
|
.)
|
|
|
|
|
|
Projective Dependency Parsing
|
|
-----------------------------
|
|
|
|
>>> grammar = DependencyGrammar.fromstring("""
|
|
... 'fell' -> 'price' | 'stock'
|
|
... 'price' -> 'of' 'the'
|
|
... 'of' -> 'stock'
|
|
... 'stock' -> 'the'
|
|
... """)
|
|
>>> print(grammar)
|
|
Dependency grammar with 5 productions
|
|
'fell' -> 'price'
|
|
'fell' -> 'stock'
|
|
'price' -> 'of' 'the'
|
|
'of' -> 'stock'
|
|
'stock' -> 'the'
|
|
|
|
>>> dp = ProjectiveDependencyParser(grammar)
|
|
>>> for t in sorted(dp.parse(['the', 'price', 'of', 'the', 'stock', 'fell'])):
|
|
... print(t)
|
|
(fell (price the (of (stock the))))
|
|
(fell (price the of) (stock the))
|
|
(fell (price the of the) stock)
|
|
|
|
Non-Projective Dependency Parsing
|
|
---------------------------------
|
|
|
|
>>> grammar = DependencyGrammar.fromstring("""
|
|
... 'taught' -> 'play' | 'man'
|
|
... 'man' -> 'the'
|
|
... 'play' -> 'golf' | 'dog' | 'to'
|
|
... 'dog' -> 'his'
|
|
... """)
|
|
>>> print(grammar)
|
|
Dependency grammar with 7 productions
|
|
'taught' -> 'play'
|
|
'taught' -> 'man'
|
|
'man' -> 'the'
|
|
'play' -> 'golf'
|
|
'play' -> 'dog'
|
|
'play' -> 'to'
|
|
'dog' -> 'his'
|
|
|
|
>>> dp = NonprojectiveDependencyParser(grammar)
|
|
>>> g, = dp.parse(['the', 'man', 'taught', 'his', 'dog', 'to', 'play', 'golf'])
|
|
|
|
>>> print(g.root['word'])
|
|
taught
|
|
|
|
>>> for _, node in sorted(g.nodes.items()):
|
|
... if node['word'] is not None:
|
|
... print('{address} {word}: {d}'.format(d=node['deps'][''], **node))
|
|
1 the: []
|
|
2 man: [1]
|
|
3 taught: [2, 7]
|
|
4 his: []
|
|
5 dog: [4]
|
|
6 to: []
|
|
7 play: [5, 6, 8]
|
|
8 golf: []
|
|
|
|
>>> print(g.tree())
|
|
(taught (man the) (play (dog his) to golf))
|
|
|
|
Integration with MALT parser
|
|
----------------------------
|
|
|
|
If the top relation label differs from the default, we can set it explicitly.
|
|
For the MALT parser, it is `'null'`.
|
|
|
|
>>> dg_str = """1 I _ NN NN _ 2 nn _ _
|
|
... 2 shot _ NN NN _ 0 null _ _
|
|
... 3 an _ AT AT _ 2 dep _ _
|
|
... 4 elephant _ NN NN _ 7 nn _ _
|
|
... 5 in _ NN NN _ 7 nn _ _
|
|
... 6 my _ NN NN _ 7 nn _ _
|
|
... 7 pajamas _ NNS NNS _ 3 dobj _ _
|
|
... """
|
|
>>> dg = DependencyGraph(dg_str, top_relation_label='null')
|
|
|
|
>>> len(dg.nodes)
|
|
8
|
|
|
|
>>> dg.root['word'], dg.root['address']
|
|
('shot', 2)
|
|
|
|
>>> print(dg.to_conll(10)) # doctest: +NORMALIZE_WHITESPACE
|
|
1 I _ NN NN _ 2 nn _ _
|
|
2 shot _ NN NN _ 0 null _ _
|
|
3 an _ AT AT _ 2 dep _ _
|
|
4 elephant _ NN NN _ 7 nn _ _
|
|
5 in _ NN NN _ 7 nn _ _
|
|
6 my _ NN NN _ 7 nn _ _
|
|
7 pajamas _ NNS NNS _ 3 dobj _ _
|