# -*- coding: utf-8 -*-
# Natural Language Toolkit: Interface to the CoreNLP REST API.
#
# Copyright (C) 2001-2016 NLTK Project
# Author: Dmitrijs Milajevs <dimazest@gmail.com>
#
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

from __future__ import unicode_literals

import re
import json
import time
import socket

from nltk.internals import find_jar_iter, config_java, java, _java_options

from nltk.tag.api import TaggerI
from nltk.parse.api import ParserI
from nltk.tokenize.api import TokenizerI
from nltk.parse.dependencygraph import DependencyGraph
from nltk.tree import Tree

_stanford_url = 'http://stanfordnlp.github.io/CoreNLP/'


class CoreNLPServerError(EnvironmentError):
    """Exceptions associated with the Core NLP server."""


def try_port(port=0):
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.bind(('', port))

    p = sock.getsockname()[1]
    sock.close()

    return p


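# ``try_port`` is a small availability probe: binding to port 0 asks the OS for
# any free port, while binding to an explicit port raises ``socket.error`` if
# that port is already taken.  A rough usage sketch, mirroring what
# ``CoreNLPServer.__init__`` does below (illustrative only):
#
#     try:
#         port = try_port(9000)   # the preferred CoreNLP port
#     except socket.error:
#         port = try_port()       # fall back to an OS-assigned free port

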
class CoreNLPServer(object):

    _MODEL_JAR_PATTERN = r'stanford-corenlp-(\d+)\.(\d+)\.(\d+)-models\.jar'
    _JAR = r'stanford-corenlp-(\d+)\.(\d+)\.(\d+)\.jar'

    def __init__(
        self, path_to_jar=None, path_to_models_jar=None, verbose=False,
        java_options=None, corenlp_options=None, port=None,
    ):

        if corenlp_options is None:
            corenlp_options = [
                '-preload', 'tokenize,ssplit,pos,lemma,parse,depparse',
            ]

        jars = list(find_jar_iter(
            self._JAR,
            path_to_jar,
            env_vars=('CORENLP', ),
            searchpath=(),
            url=_stanford_url,
            verbose=verbose,
            is_regex=True,
        ))

        # Find the most recent code and model jar.
        stanford_jar = max(
            jars,
            key=lambda model_name: re.match(self._JAR, model_name)
        )

        if port is None:
            try:
                port = try_port(9000)
            except socket.error:
                port = try_port()
                corenlp_options.append(str(port))
        else:
            try_port(port)

        self.url = 'http://localhost:{}'.format(port)

        model_jar = max(
            find_jar_iter(
                self._MODEL_JAR_PATTERN,
                path_to_models_jar,
                env_vars=('CORENLP_MODELS', ),
                searchpath=(),
                url=_stanford_url,
                verbose=verbose,
                is_regex=True,
            ),
            key=lambda model_name: re.match(self._MODEL_JAR_PATTERN, model_name)
        )

        self.verbose = verbose

        self._classpath = stanford_jar, model_jar

        self.corenlp_options = corenlp_options
        self.java_options = java_options or ['-mx2g']

    def start(self):
        import requests

        cmd = ['edu.stanford.nlp.pipeline.StanfordCoreNLPServer']

        if self.corenlp_options:
            cmd.extend(self.corenlp_options)

        # Configure java.
        default_options = ' '.join(_java_options)
        config_java(options=self.java_options, verbose=self.verbose)

        try:
            # TODO: it's probably a bad idea to pipe stdout, as it will
            # accumulate when lots of text is being parsed.
            self.popen = java(
                cmd,
                classpath=self._classpath,
                blocking=False,
                stdout='pipe',
                stderr='pipe',
            )
        finally:
            # Return java configurations to their default values.
            config_java(options=default_options, verbose=self.verbose)

        # Check that the server is still running.
        returncode = self.popen.poll()
        if returncode is not None:
            _, stderrdata = self.popen.communicate()
            raise CoreNLPServerError(
                returncode,
                'Could not start the server. '
                'The error was: {}'.format(stderrdata.decode('ascii'))
            )

        for i in range(30):
            try:
                response = requests.get(requests.compat.urljoin(self.url, 'live'))
            except requests.exceptions.ConnectionError:
                time.sleep(1)
            else:
                if response.ok:
                    break
        else:
            raise CoreNLPServerError(
                'Could not connect to the server.'
            )

        for i in range(60):
            try:
                response = requests.get(requests.compat.urljoin(self.url, 'ready'))
            except requests.exceptions.ConnectionError:
                time.sleep(1)
            else:
                if response.ok:
                    break
        else:
            raise CoreNLPServerError(
                'The server is not ready.'
            )

    def stop(self):
        self.popen.terminate()
        self.popen.wait()

    def __enter__(self):
        self.start()

        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.stop()
        return False


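# A minimal usage sketch for ``CoreNLPServer`` (it requires a local CoreNLP
# distribution discoverable through the CORENLP and CORENLP_MODELS environment
# variables; the sentence below is purely illustrative):
#
#     with CoreNLPServer(port=9000) as server:
#         parser = CoreNLPParser(url=server.url)
#         next(parser.raw_parse('The quick brown fox jumps over the lazy dog.'))
#
# The context-manager form calls ``start()`` on entry and ``stop()`` on exit,
# so the Java subprocess is terminated even if parsing raises.

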
class GenericCoreNLPParser(ParserI, TokenizerI, TaggerI):
    """Interface to the CoreNLP Parser."""

    def __init__(self, url='http://localhost:9000', encoding='utf8', tagtype=None):
        import requests

        self.url = url
        self.encoding = encoding

        if tagtype not in ['pos', 'ner', None]:
            raise ValueError("tagtype must be either 'pos', 'ner' or None")

        self.tagtype = tagtype

        self.session = requests.Session()

    def parse_sents(self, sentences, *args, **kwargs):
        """Parse multiple sentences.

        Takes multiple sentences as a list where each sentence is a list of
        words. Each sentence will be automatically tagged with this
        CoreNLPParser instance's tagger.

        If a token contains any whitespace, it will be split into several
        tokens.

        :param sentences: Input sentences to parse
        :type sentences: list(list(str))
        :rtype: iter(iter(Tree))
        """
        # Converting list(list(str)) -> list(str)
        sentences = (' '.join(words) for words in sentences)
        return self.raw_parse_sents(sentences, *args, **kwargs)

    def raw_parse(self, sentence, properties=None, *args, **kwargs):
        """Parse a sentence.

        Takes a sentence as a string; before parsing, it will be automatically
        tokenized and tagged by the CoreNLP Parser.

        :param sentence: Input sentence to parse
        :type sentence: str
        :rtype: iter(Tree)
        """
        default_properties = {
            'tokenize.whitespace': 'false',
        }
        default_properties.update(properties or {})

        return next(
            self.raw_parse_sents(
                [sentence],
                properties=default_properties,
                *args,
                **kwargs
            )
        )

    def api_call(self, data, properties=None):
        default_properties = {
            'outputFormat': 'json',
            'annotators': 'tokenize,pos,lemma,ssplit,{parser_annotator}'.format(
                parser_annotator=self.parser_annotator,
            ),
        }

        default_properties.update(properties or {})

        response = self.session.post(
            self.url,
            params={
                'properties': json.dumps(default_properties),
            },
            data=data.encode(self.encoding),
            timeout=60,
        )

        response.raise_for_status()

        return response.json()

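    # ``api_call`` is the single HTTP entry point that every other method here
    # builds on.  A rough sketch of the equivalent raw request (URL and
    # annotator list are illustrative; ``parser_annotator`` is supplied by the
    # concrete subclasses below):
    #
    #     requests.post(
    #         'http://localhost:9000',
    #         params={'properties': json.dumps({
    #             'annotators': 'tokenize,pos,lemma,ssplit,parse',
    #             'outputFormat': 'json',
    #         })},
    #         data='The quick brown fox jumps over the lazy dog.'.encode('utf8'),
    #         timeout=60,
    #     ).json()
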
    def raw_parse_sents(
        self,
        sentences,
        verbose=False,
        properties=None,
        *args,
        **kwargs
    ):
        """Parse multiple sentences.

        Takes multiple sentences as a list of strings. Each sentence will be
        automatically tokenized and tagged.

        :param sentences: Input sentences to parse.
        :type sentences: list(str)
        :rtype: iter(iter(Tree))

        """
        default_properties = {
            # Only splits on '\n', never inside the sentence.
            'ssplit.eolonly': 'true',
        }

        default_properties.update(properties or {})

        # An alternative implementation (one request per sentence), kept for
        # reference:
        #
        #     for sentence in sentences:
        #         parsed_data = self.api_call(sentence, properties=default_properties)
        #         assert len(parsed_data['sentences']) == 1
        #         for parse in parsed_data['sentences']:
        #             tree = self.make_tree(parse)
        #             yield iter([tree])

        parsed_data = self.api_call('\n'.join(sentences), properties=default_properties)
        for parsed_sent in parsed_data['sentences']:
            tree = self.make_tree(parsed_sent)
            yield iter([tree])

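    # The batching strategy above sends all sentences in one request, joined by
    # newlines, and relies on the end-of-line-only sentence split so the server
    # preserves the original sentence boundaries.  A sketch of how the generator
    # is typically consumed (sentences are illustrative):
    #
    #     (fox_parse, ), (wolf_parse, ) = parser.raw_parse_sents([
    #         'The quick brown fox jumps over the lazy dog.',
    #         'The quick grey wolf jumps over the lazy fox.',
    #     ])
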
    def parse_text(self, text, *args, **kwargs):
        """Parse a piece of text.

        The text might contain several sentences which will be split by CoreNLP.

        :param str text: text to be split.
        :returns: an iterable of syntactic structures.  # TODO: should it be an iterable of iterables?

        """
        parsed_data = self.api_call(text, *args, **kwargs)

        for parse in parsed_data['sentences']:
            yield self.make_tree(parse)

    def tokenize(self, text, properties=None):
        """Tokenize a string of text.

        >>> parser = CoreNLPParser(url='http://localhost:9000')

        >>> text = 'Good muffins cost $3.88\\nin New York. Please buy me\\ntwo of them.\\nThanks.'
        >>> list(parser.tokenize(text))
        ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']

        >>> list(
        ...     parser.tokenize(
        ...         'The colour of the wall is blue.',
        ...         properties={'tokenize.options': 'americanize=true'},
        ...     )
        ... )
        ['The', 'color', 'of', 'the', 'wall', 'is', 'blue', '.']

        """
        default_properties = {
            'annotators': 'tokenize,ssplit',
        }

        default_properties.update(properties or {})

        result = self.api_call(text, properties=default_properties)

        for sentence in result['sentences']:
            for token in sentence['tokens']:
                yield token['originalText'] or token['word']

    def tag_sents(self, sentences):
        """
        Tag multiple sentences.

        Takes multiple sentences as a list where each sentence is a list of
        tokens.

        :param sentences: Input sentences to tag
        :type sentences: list(list(str))
        :rtype: list(list(tuple(str, str)))
        """
        # Converting list(list(str)) -> list(str)
        sentences = (' '.join(words) for words in sentences)
        return [tagged_sents[0] for tagged_sents in self.raw_tag_sents(sentences)]

    def tag(self, sentence):
        """
        Tag a list of tokens.

        :rtype: list(tuple(str, str))

        >>> parser = CoreNLPParser(url='http://localhost:9000', tagtype='ner')
        >>> tokens = 'Rami Eid is studying at Stony Brook University in NY'.split()
        >>> parser.tag(tokens)
        [('Rami', 'PERSON'), ('Eid', 'PERSON'), ('is', 'O'), ('studying', 'O'), ('at', 'O'), ('Stony', 'ORGANIZATION'),
        ('Brook', 'ORGANIZATION'), ('University', 'ORGANIZATION'), ('in', 'O'), ('NY', 'O')]

        >>> parser = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
        >>> tokens = "What is the airspeed of an unladen swallow ?".split()
        >>> parser.tag(tokens)
        [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'),
        ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'),
        ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')]
        """
        return self.tag_sents([sentence])[0]

    def raw_tag_sents(self, sentences):
        """
        Tag multiple sentences.

        Takes multiple sentences as a list where each sentence is a string.

        :param sentences: Input sentences to tag
        :type sentences: list(str)
        :rtype: list(list(list(tuple(str, str))))
        """
        default_properties = {
            'ssplit.isOneSentence': 'true',
            'annotators': 'tokenize,ssplit,',
        }

        # Supports only 'pos' or 'ner' tags.
        assert self.tagtype in ['pos', 'ner']
        default_properties['annotators'] += self.tagtype

        for sentence in sentences:
            tagged_data = self.api_call(sentence, properties=default_properties)
            yield [
                [(token['word'], token[self.tagtype]) for token in tagged_sentence['tokens']]
                for tagged_sentence in tagged_data['sentences']
            ]
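
    # ``raw_tag_sents`` yields one item per input string; each item is a list
    # of sentences, and each sentence a list of ``(word, tag)`` pairs.  Because
    # ``ssplit.isOneSentence`` is set above, every yielded item holds exactly
    # one sentence, which is why ``tag_sents`` simply takes element ``[0]``.
    # A sketch of the shape (tags are illustrative and depend on the models):
    #
    #     >>> list(parser.raw_tag_sents(['John loves Mary.']))  # doctest: +SKIP
    #     [[[('John', 'NNP'), ('loves', 'VBZ'), ('Mary', 'NNP'), ('.', '.')]]]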


class CoreNLPParser(GenericCoreNLPParser):
    """
    >>> parser = CoreNLPParser(url='http://localhost:9000')

    >>> next(
    ...     parser.raw_parse('The quick brown fox jumps over the lazy dog.')
    ... ).pretty_print()  # doctest: +NORMALIZE_WHITESPACE
                         ROOT
                          |
                          S
           _______________|__________________________
          |                          VP               |
          |                 _________|___             |
          |                |             PP           |
          |                |     ________|___         |
          NP               |    |            NP       |
      ____|__________      |    |     _______|____    |
     DT   JJ    JJ   NN   VBZ   IN   DT      JJ   NN  .
     |    |     |    |     |    |    |       |    |   |
     The quick brown fox jumps over the     lazy dog  .

    >>> (parse_fox, ), (parse_wolf, ) = parser.raw_parse_sents(
    ...     [
    ...         'The quick brown fox jumps over the lazy dog.',
    ...         'The quick grey wolf jumps over the lazy fox.',
    ...     ]
    ... )

    >>> parse_fox.pretty_print()  # doctest: +NORMALIZE_WHITESPACE
                         ROOT
                          |
                          S
           _______________|__________________________
          |                          VP               |
          |                 _________|___             |
          |                |             PP           |
          |                |     ________|___         |
          NP               |    |            NP       |
      ____|__________      |    |     _______|____    |
     DT   JJ    JJ   NN   VBZ   IN   DT      JJ   NN  .
     |    |     |    |     |    |    |       |    |   |
     The quick brown fox jumps over the     lazy dog  .

    >>> parse_wolf.pretty_print()  # doctest: +NORMALIZE_WHITESPACE
                         ROOT
                          |
                          S
           _______________|__________________________
          |                          VP               |
          |                 _________|___             |
          |                |             PP           |
          |                |     ________|___         |
          NP               |    |            NP       |
      ____|_________       |    |     _______|____    |
     DT   JJ   JJ   NN    VBZ   IN   DT      JJ   NN  .
     |    |    |    |      |    |    |       |    |   |
     The quick grey wolf jumps over the     lazy fox  .

    >>> (parse_dog, ), (parse_friends, ) = parser.parse_sents(
    ...     [
    ...         "I 'm a dog".split(),
    ...         "This is my friends ' cat ( the tabby )".split(),
    ...     ]
    ... )

    >>> parse_dog.pretty_print()  # doctest: +NORMALIZE_WHITESPACE
            ROOT
             |
             S
      _______|____
     |            VP
     |    ________|___
     NP  |            NP
     |   |         ___|___
    PRP VBP       DT      NN
     |   |        |       |
     I   'm       a      dog

    >>> parse_friends.pretty_print()  # doctest: +NORMALIZE_WHITESPACE
             ROOT
              |
              S
         ____|___________
        |                VP
        |     ___________|_____________
        |    |                         NP
        |    |              ___________|__________
        |    |             NP                    PRN
        |    |      _______|_____        ________|___________
        NP   |     NP            |      |        NP           |
        |    |   ___|______      |      |     ___|____        |
        DT  VBZ PRP$      NNS   POS     NN  -LRB- DT    NN  -RRB-
        |    |   |         |     |      |     |   |     |     |
       This  is  my     friends  '     cat -LRB- the  tabby -RRB-

    >>> parse_john, parse_mary, = parser.parse_text(
    ...     'John loves Mary. Mary walks.'
    ... )

    >>> parse_john.pretty_print()  # doctest: +NORMALIZE_WHITESPACE
          ROOT
           |
           S
      _____|_____________
     |          VP        |
     |      ____|___      |
     NP    |        NP    |
     |     |        |     |
    NNP   VBZ      NNP    .
     |     |        |     |
    John loves     Mary   .

    >>> parse_mary.pretty_print()  # doctest: +NORMALIZE_WHITESPACE
          ROOT
           |
           S
      _____|____
     NP    VP   |
     |     |    |
    NNP   VBZ   .
     |     |    |
    Mary walks  .

    Special cases
    -------------

    >>> next(
    ...     parser.raw_parse(
    ...         'NASIRIYA, Iraq—Iraqi doctors who treated former prisoner of war '
    ...         'Jessica Lynch have angrily dismissed claims made in her biography '
    ...         'that she was raped by her Iraqi captors.'
    ...     )
    ... ).height()
    20

    >>> next(
    ...     parser.raw_parse(
    ...         "The broader Standard & Poor's 500 Index <.SPX> was 0.46 points lower, or "
    ...         '0.05 percent, at 997.02.'
    ...     )
    ... ).height()
    9

    """

    _OUTPUT_FORMAT = 'penn'
    parser_annotator = 'parse'

    def make_tree(self, result):
        return Tree.fromstring(result['parse'])


class CoreNLPDependencyParser(GenericCoreNLPParser):
    """Dependency parser.

    >>> dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')

    >>> parse, = dep_parser.raw_parse(
    ...     'The quick brown fox jumps over the lazy dog.'
    ... )
    >>> print(parse.to_conll(4))  # doctest: +NORMALIZE_WHITESPACE
    The     DT      4       det
    quick   JJ      4       amod
    brown   JJ      4       amod
    fox     NN      5       nsubj
    jumps   VBZ     0       ROOT
    over    IN      9       case
    the     DT      9       det
    lazy    JJ      9       amod
    dog     NN      5       nmod
    .       .       5       punct

    >>> print(parse.tree())  # doctest: +NORMALIZE_WHITESPACE
    (jumps (fox The quick brown) (dog over the lazy) .)

    >>> for governor, dep, dependent in parse.triples():
    ...     print(governor, dep, dependent)  # doctest: +NORMALIZE_WHITESPACE
    ('jumps', 'VBZ') nsubj ('fox', 'NN')
    ('fox', 'NN') det ('The', 'DT')
    ('fox', 'NN') amod ('quick', 'JJ')
    ('fox', 'NN') amod ('brown', 'JJ')
    ('jumps', 'VBZ') nmod ('dog', 'NN')
    ('dog', 'NN') case ('over', 'IN')
    ('dog', 'NN') det ('the', 'DT')
    ('dog', 'NN') amod ('lazy', 'JJ')
    ('jumps', 'VBZ') punct ('.', '.')

    >>> (parse_fox, ), (parse_dog, ) = dep_parser.raw_parse_sents(
    ...     [
    ...         'The quick brown fox jumps over the lazy dog.',
    ...         'The quick grey wolf jumps over the lazy fox.',
    ...     ]
    ... )
    >>> print(parse_fox.to_conll(4))  # doctest: +NORMALIZE_WHITESPACE
    The     DT      4       det
    quick   JJ      4       amod
    brown   JJ      4       amod
    fox     NN      5       nsubj
    jumps   VBZ     0       ROOT
    over    IN      9       case
    the     DT      9       det
    lazy    JJ      9       amod
    dog     NN      5       nmod
    .       .       5       punct

    >>> print(parse_dog.to_conll(4))  # doctest: +NORMALIZE_WHITESPACE
    The     DT      4       det
    quick   JJ      4       amod
    grey    JJ      4       amod
    wolf    NN      5       nsubj
    jumps   VBZ     0       ROOT
    over    IN      9       case
    the     DT      9       det
    lazy    JJ      9       amod
    fox     NN      5       nmod
    .       .       5       punct

    >>> (parse_dog, ), (parse_friends, ) = dep_parser.parse_sents(
    ...     [
    ...         "I 'm a dog".split(),
    ...         "This is my friends ' cat ( the tabby )".split(),
    ...     ]
    ... )
    >>> print(parse_dog.to_conll(4))  # doctest: +NORMALIZE_WHITESPACE
    I       PRP     4       nsubj
    'm      VBP     4       cop
    a       DT      4       det
    dog     NN      0       ROOT

    >>> print(parse_friends.to_conll(4))  # doctest: +NORMALIZE_WHITESPACE
    This    DT      6       nsubj
    is      VBZ     6       cop
    my      PRP$    4       nmod:poss
    friends NNS     6       nmod:poss
    '       POS     4       case
    cat     NN      0       ROOT
    -LRB-   -LRB-   9       punct
    the     DT      9       det
    tabby   NN      6       appos
    -RRB-   -RRB-   9       punct

    >>> parse_john, parse_mary, = dep_parser.parse_text(
    ...     'John loves Mary. Mary walks.'
    ... )

    >>> print(parse_john.to_conll(4))  # doctest: +NORMALIZE_WHITESPACE
    John    NNP     2       nsubj
    loves   VBZ     0       ROOT
    Mary    NNP     2       dobj
    .       .       2       punct

    >>> print(parse_mary.to_conll(4))  # doctest: +NORMALIZE_WHITESPACE
    Mary    NNP     2       nsubj
    walks   VBZ     0       ROOT
    .       .       2       punct

    Special cases
    -------------

    Non-breaking space inside of a token.

    >>> len(
    ...     next(
    ...         dep_parser.raw_parse(
    ...             'Anhalt said children typically treat a 20-ounce soda bottle as one '
    ...             'serving, while it actually contains 2 1/2 servings.'
    ...         )
    ...     ).nodes
    ... )
    21

    Phone numbers.

    >>> len(
    ...     next(
    ...         dep_parser.raw_parse('This is not going to crash: 01 111 555.')
    ...     ).nodes
    ... )
    10

    >>> print(
    ...     next(
    ...         dep_parser.raw_parse('The underscore _ should not simply disappear.')
    ...     ).to_conll(4)
    ... )  # doctest: +NORMALIZE_WHITESPACE
    The         DT      3       det
    underscore  VBP     3       amod
    _           NN      7       nsubj
    should      MD      7       aux
    not         RB      7       neg
    simply      RB      7       advmod
    disappear   VB      0       ROOT
    .           .       7       punct

    >>> print(
    ...     '\\n'.join(
    ...         next(
    ...             dep_parser.raw_parse(
    ...                 'for all of its insights into the dream world of teen life , and its electronic expression through '
    ...                 'cyber culture , the film gives no quarter to anyone seeking to pull a cohesive story out of its 2 '
    ...                 '1/2-hour running time .'
    ...             )
    ...         ).to_conll(4).split('\\n')[-8:]
    ...     )
    ... )
    its     PRP$    40      nmod:poss
    2 1/2   CD      40      nummod
    -       :       40      punct
    hour    NN      31      nmod
    running VBG     42      amod
    time    NN      40      dep
    .       .       24      punct
    <BLANKLINE>

    """

    _OUTPUT_FORMAT = 'conll2007'
    parser_annotator = 'depparse'

    def make_tree(self, result):

        return DependencyGraph(
            (
                ' '.join(n_items[1:])  # NLTK expects an iterable of strings...
                for n_items in sorted(transform(result))
            ),
            cell_separator=' ',  # To make sure that a non-breaking space is kept inside of a token.
        )


def transform(sentence):
    for dependency in sentence['basicDependencies']:

        dependent_index = dependency['dependent']
        token = sentence['tokens'][dependent_index - 1]

        # Return values that we don't know as '_'. Also, consider tag and ctag
        # to be equal.
        yield (
            dependent_index,
            '_',
            token['word'],
            token['lemma'],
            token['pos'],
            token['pos'],
            '_',
            str(dependency['governor']),
            dependency['dep'],
            '_',
            '_',
        )
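
# Each tuple yielded by ``transform`` is one CoNLL-style row, which
# ``CoreNLPDependencyParser.make_tree`` joins into the lines a
# ``DependencyGraph`` expects.  For example, a dependency such as
# {'dep': 'nsubj', 'governor': 2, 'dependent': 1} over the token 'John'
# (lemma 'John', POS 'NNP') would come out roughly as (values illustrative):
#
#     (1, '_', 'John', 'John', 'NNP', 'NNP', '_', '2', 'nsubj', '_', '_')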


def setup_module(module):
    from nose import SkipTest

    # The early raise below intentionally disables the whole CoreNLP test
    # suite; the remaining set-up code is kept for when it is re-enabled.
    raise SkipTest('Skipping all CoreNLP tests.')

    global server

    try:
        server = CoreNLPServer(port=9000)
    except LookupError as e:
        raise SkipTest('Could not instantiate CoreNLPServer.')

    try:
        server.start()
    except CoreNLPServerError as e:
        raise SkipTest(
            'Skipping CoreNLP tests because the server could not be started. '
            'Make sure that the 9000 port is free. '
            '{}'.format(e.strerror)
        )


def teardown_module(module):
    # Mirrors the disabled set-up above: return immediately instead of
    # stopping a server that was never started.
    return
    server.stop()