# -*- coding: utf-8 -*-
# Natural Language Toolkit: Interface to the Stanford Parser
#
# Copyright (C) 2001-2018 NLTK Project
# Author: Steven Xu
#
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

from __future__ import unicode_literals

import tempfile
import os
import warnings
from subprocess import PIPE

from six import text_type

from nltk.internals import (
    find_jar_iter,
    config_java,
    java,
    _java_options,
    find_jars_within_path,
)
from nltk.parse.api import ParserI
from nltk.parse.dependencygraph import DependencyGraph
from nltk.tree import Tree

_stanford_url = 'https://nlp.stanford.edu/software/lex-parser.shtml'


class GenericStanfordParser(ParserI):
    """Interface to the Stanford Parser"""

    _MODEL_JAR_PATTERN = r'stanford-parser-(\d+)(\.(\d+))+-models\.jar'
    _JAR = r'stanford-parser\.jar'
    _MAIN_CLASS = 'edu.stanford.nlp.parser.lexparser.LexicalizedParser'

    _USE_STDIN = False
    _DOUBLE_SPACED_OUTPUT = False

    def __init__(self, path_to_jar=None, path_to_models_jar=None,
                 model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',
                 encoding='utf8', verbose=False,
                 java_options='-mx1000m', corenlp_options=''):

        # Find the most recent code and model jar.
        stanford_jar = max(
            find_jar_iter(
                self._JAR, path_to_jar,
                env_vars=('STANFORD_PARSER', 'STANFORD_CORENLP'),
                searchpath=(), url=_stanford_url,
                verbose=verbose, is_regex=True
            ),
            key=lambda jar_path: os.path.dirname(jar_path)
        )

        model_jar = max(
            find_jar_iter(
                self._MODEL_JAR_PATTERN, path_to_models_jar,
                env_vars=('STANFORD_MODELS', 'STANFORD_CORENLP'),
                searchpath=(), url=_stanford_url,
                verbose=verbose, is_regex=True
            ),
            key=lambda jar_path: os.path.dirname(jar_path)
        )

        # self._classpath = (stanford_jar, model_jar)

        # Add the logging jar files shipped alongside the parser jar to the
        # classpath.
        stanford_dir = os.path.split(stanford_jar)[0]
        self._classpath = tuple([model_jar] + find_jars_within_path(stanford_dir))

        self.model_path = model_path
        self._encoding = encoding
        self.corenlp_options = corenlp_options
        self.java_options = java_options

    def _parse_trees_output(self, output_):
        res = []
        cur_lines = []
        cur_trees = []
        blank = False
        for line in output_.splitlines(False):
            if line == '':
                if blank:
                    res.append(iter(cur_trees))
                    cur_trees = []
                    blank = False
                elif self._DOUBLE_SPACED_OUTPUT:
                    cur_trees.append(self._make_tree('\n'.join(cur_lines)))
                    cur_lines = []
                    blank = True
                else:
                    res.append(iter([self._make_tree('\n'.join(cur_lines))]))
                    cur_lines = []
            else:
                cur_lines.append(line)
                blank = False
        return iter(res)

    def parse_sents(self, sentences, verbose=False):
        """
        Use StanfordParser to parse multiple sentences. Takes multiple
        sentences as a list where each sentence is a list of words. Each
        sentence will be automatically tagged with this StanfordParser
        instance's tagger. If whitespace exists inside a token, the token
        will be treated as separate tokens.

        :param sentences: Input sentences to parse
        :type sentences: list(list(str))
        :rtype: iter(iter(Tree))
        """
        cmd = [
            self._MAIN_CLASS,
            '-model', self.model_path,
            '-sentences', 'newline',
            '-outputFormat', self._OUTPUT_FORMAT,
            '-tokenized',
            '-escaper', 'edu.stanford.nlp.process.PTBEscapingProcessor',
        ]
        return self._parse_trees_output(self._execute(
            cmd, '\n'.join(' '.join(sentence) for sentence in sentences),
            verbose))
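    # A sketch of the wire format parse_sents() hands to the Java process
    # (restating the code above for clarity): one sentence per line, tokens
    # joined by single spaces, e.g.
    #
    #     sentences = [['the', 'dog', 'barks'], ['it', 'runs']]
    #     '\n'.join(' '.join(s) for s in sentences)
    #     # -> 'the dog barks\nit runs'
    #
    # which is why '-sentences newline' and '-tokenized' are passed to the
    # LexicalizedParser main class.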
    def raw_parse(self, sentence, verbose=False):
        """
        Use StanfordParser to parse a sentence. Takes a sentence as a string;
        before parsing, it will be automatically tokenized and tagged by the
        Stanford Parser.

        :param sentence: Input sentence to parse
        :type sentence: str
        :rtype: iter(Tree)
        """
        return next(self.raw_parse_sents([sentence], verbose))

    def raw_parse_sents(self, sentences, verbose=False):
        """
        Use StanfordParser to parse multiple sentences. Takes multiple
        sentences as a list of strings. Each sentence will be automatically
        tokenized and tagged by the Stanford Parser.

        :param sentences: Input sentences to parse
        :type sentences: list(str)
        :rtype: iter(iter(Tree))
        """
        cmd = [
            self._MAIN_CLASS,
            '-model', self.model_path,
            '-sentences', 'newline',
            '-outputFormat', self._OUTPUT_FORMAT,
        ]
        return self._parse_trees_output(self._execute(
            cmd, '\n'.join(sentences), verbose))

    def tagged_parse(self, sentence, verbose=False):
        """
        Use StanfordParser to parse a sentence. Takes a sentence as a list of
        (word, tag) tuples; the sentence must have already been tokenized and
        tagged.

        :param sentence: Input sentence to parse
        :type sentence: list(tuple(str, str))
        :rtype: iter(Tree)
        """
        return next(self.tagged_parse_sents([sentence], verbose))

    def tagged_parse_sents(self, sentences, verbose=False):
        """
        Use StanfordParser to parse multiple sentences. Takes multiple
        sentences where each sentence is a list of (word, tag) tuples.
        The sentences must have already been tokenized and tagged.

        :param sentences: Input sentences to parse
        :type sentences: list(list(tuple(str, str)))
        :rtype: iter(iter(Tree))
        """
        tag_separator = '/'
        cmd = [
            self._MAIN_CLASS,
            '-model', self.model_path,
            '-sentences', 'newline',
            '-outputFormat', self._OUTPUT_FORMAT,
            '-tokenized',
            '-tagSeparator', tag_separator,
            '-tokenizerFactory', 'edu.stanford.nlp.process.WhitespaceTokenizer',
            '-tokenizerMethod', 'newCoreLabelTokenizerFactory',
        ]
        # We don't need to escape slashes as "splitting is done on the last
        # instance of the character in the token".
        return self._parse_trees_output(self._execute(
            cmd,
            '\n'.join(
                ' '.join(tag_separator.join(tagged) for tagged in sentence)
                for sentence in sentences),
            verbose))

    def _execute(self, cmd, input_, verbose=False):
        encoding = self._encoding
        cmd.extend(['-encoding', encoding])
        if self.corenlp_options:
            # Split the option string so each flag and value becomes its own
            # argv element; appending it as a single string would hand Java
            # one unparseable argument.
            cmd.extend(self.corenlp_options.split())

        default_options = ' '.join(_java_options)

        # Configure java.
        config_java(options=self.java_options, verbose=verbose)

        # Windows is incompatible with NamedTemporaryFile() without passing
        # in delete=False.
        with tempfile.NamedTemporaryFile(mode='wb', delete=False) as input_file:
            # Write the actual sentences to the temporary input file.
            if isinstance(input_, text_type) and encoding:
                input_ = input_.encode(encoding)
            input_file.write(input_)
            input_file.flush()

            # Run the tagger and get the output.
            if self._USE_STDIN:
                input_file.seek(0)
                stdout, stderr = java(cmd, classpath=self._classpath,
                                      stdin=input_file, stdout=PIPE, stderr=PIPE)
            else:
                cmd.append(input_file.name)
                stdout, stderr = java(cmd, classpath=self._classpath,
                                      stdout=PIPE, stderr=PIPE)

            # Replace non-breaking spaces (and a mangled variant) before
            # decoding.
            stdout = stdout.replace(b'\xc2\xa0', b' ')
            stdout = stdout.replace(b'\x00\xa0', b' ')
            stdout = stdout.decode(encoding)

        os.unlink(input_file.name)

        # Return java configurations to their default values.
        config_java(options=default_options, verbose=False)

        return stdout
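# A sketch of the output framing that _parse_trees_output() consumes, shown
# here for clarity. With single-spaced output (e.g. the 'penn' format used by
# StanfordParser below), each parse ends with one blank line, so this stream
# yields two one-tree groups:
#
#     (ROOT (NP (DT the) (NN dog)))
#     <blank line>
#     (ROOT (NP (DT a) (NN cat)))
#     <blank line>
#
# With double-spaced output (_DOUBLE_SPACED_OUTPUT = True, used by the
# CoreNLP pipeline further below), one blank line ends a tree and two
# consecutive blank lines end the group of trees for a sentence.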
class StanfordParser(GenericStanfordParser):
    """
    >>> parser=StanfordParser(
    ...     model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
    ... )

    >>> list(parser.raw_parse("the quick brown fox jumps over the lazy dog")) # doctest: +NORMALIZE_WHITESPACE
    [Tree('ROOT', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['brown']),
    Tree('NN', ['fox'])]), Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']),
    Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])])])])]

    >>> sum([list(dep_graphs) for dep_graphs in parser.raw_parse_sents((
    ...     "the quick brown fox jumps over the lazy dog",
    ...     "the quick grey wolf jumps over the lazy fox"
    ... ))], []) # doctest: +NORMALIZE_WHITESPACE
    [Tree('ROOT', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['brown']),
    Tree('NN', ['fox'])]), Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']),
    Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])])])]), Tree('ROOT', [Tree('NP',
    [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['grey']), Tree('NN', ['wolf'])]),
    Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']), Tree('NP',
    [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['fox'])])])])])])]

    >>> sum([list(dep_graphs) for dep_graphs in parser.parse_sents((
    ...     "I 'm a dog".split(),
    ...     "This is my friends ' cat ( the tabby )".split(),
    ... ))], []) # doctest: +NORMALIZE_WHITESPACE
    [Tree('ROOT', [Tree('S', [Tree('NP', [Tree('PRP', ['I'])]), Tree('VP', [Tree('VBP', ["'m"]),
    Tree('NP', [Tree('DT', ['a']), Tree('NN', ['dog'])])])])]), Tree('ROOT', [Tree('S', [Tree('NP',
    [Tree('DT', ['This'])]), Tree('VP', [Tree('VBZ', ['is']), Tree('NP', [Tree('NP', [Tree('NP',
    [Tree('PRP$', ['my']), Tree('NNS', ['friends']), Tree('POS', ["'"])]), Tree('NN', ['cat'])]),
    Tree('PRN', [Tree('-LRB-', [Tree('', []), Tree('NP', [Tree('DT', ['the']), Tree('NN', ['tabby'])]),
    Tree('-RRB-', [])])])])])])])]

    >>> sum([list(dep_graphs) for dep_graphs in parser.tagged_parse_sents((
    ...     (
    ...         ("The", "DT"),
    ...         ("quick", "JJ"),
    ...         ("brown", "JJ"),
    ...         ("fox", "NN"),
    ...         ("jumped", "VBD"),
    ...         ("over", "IN"),
    ...         ("the", "DT"),
    ...         ("lazy", "JJ"),
    ...         ("dog", "NN"),
    ...         (".", "."),
    ...     ),
    ... ))],[]) # doctest: +NORMALIZE_WHITESPACE
    [Tree('ROOT', [Tree('S', [Tree('NP', [Tree('DT', ['The']), Tree('JJ', ['quick']), Tree('JJ', ['brown']),
    Tree('NN', ['fox'])]), Tree('VP', [Tree('VBD', ['jumped']), Tree('PP', [Tree('IN', ['over']), Tree('NP',
    [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])]), Tree('.', ['.'])])])]
    """

    _OUTPUT_FORMAT = 'penn'

    def __init__(self, *args, **kwargs):
        warnings.warn(
            "The StanfordParser will be deprecated\n"
            "Please use \033[91mnltk.parse.corenlp.CoreNLPParser\033[0m instead.",
            DeprecationWarning, stacklevel=2)

        super(StanfordParser, self).__init__(*args, **kwargs)

    def _make_tree(self, result):
        return Tree.fromstring(result)
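# A minimal usage sketch, not part of the original module: it assumes the
# stanford-parser jars are discoverable, e.g. via the STANFORD_PARSER and
# STANFORD_MODELS environment variables checked in GenericStanfordParser.
def _demo_constituency_parse():
    parser = StanfordParser()
    # raw_parse() yields one Tree per parse of the single input sentence.
    for tree in parser.raw_parse('the quick brown fox jumps over the lazy dog'):
        tree.pretty_print()  # ASCII-art rendering provided by nltk.tree.Tree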
class StanfordDependencyParser(GenericStanfordParser):
    """
    >>> dep_parser=StanfordDependencyParser(
    ...     model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
    ... )

    >>> [parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE
    [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy'])])]

    >>> [list(parse.triples()) for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE
    [[((u'jumps', u'VBZ'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det', (u'The', u'DT')),
    ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'), u'amod', (u'brown', u'JJ')),
    ((u'jumps', u'VBZ'), u'nmod', (u'dog', u'NN')), ((u'dog', u'NN'), u'case', (u'over', u'IN')),
    ((u'dog', u'NN'), u'det', (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ'))]]

    >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.raw_parse_sents((
    ...     "The quick brown fox jumps over the lazy dog.",
    ...     "The quick grey wolf jumps over the lazy fox."
    ... ))], []) # doctest: +NORMALIZE_WHITESPACE
    [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy'])]),
    Tree('jumps', [Tree('wolf', ['The', 'quick', 'grey']), Tree('fox', ['over', 'the', 'lazy'])])]

    >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.parse_sents((
    ...     "I 'm a dog".split(),
    ...     "This is my friends ' cat ( the tabby )".split(),
    ... ))], []) # doctest: +NORMALIZE_WHITESPACE
    [Tree('dog', ['I', "'m", 'a']), Tree('cat', ['This', 'is', Tree('friends', ['my', "'"]), Tree('tabby', ['the'])])]

    >>> sum([[list(parse.triples()) for parse in dep_graphs] for dep_graphs in dep_parser.tagged_parse_sents((
    ...     (
    ...         ("The", "DT"),
    ...         ("quick", "JJ"),
    ...         ("brown", "JJ"),
    ...         ("fox", "NN"),
    ...         ("jumped", "VBD"),
    ...         ("over", "IN"),
    ...         ("the", "DT"),
    ...         ("lazy", "JJ"),
    ...         ("dog", "NN"),
    ...         (".", "."),
    ...     ),
    ... ))],[]) # doctest: +NORMALIZE_WHITESPACE
    [[((u'jumped', u'VBD'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det', (u'The', u'DT')),
    ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'), u'amod', (u'brown', u'JJ')),
    ((u'jumped', u'VBD'), u'nmod', (u'dog', u'NN')), ((u'dog', u'NN'), u'case', (u'over', u'IN')),
    ((u'dog', u'NN'), u'det', (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ'))]]
    """

    _OUTPUT_FORMAT = 'conll2007'

    def __init__(self, *args, **kwargs):
        warnings.warn(
            "The StanfordDependencyParser will be deprecated\n"
            "Please use \033[91mnltk.parse.corenlp.CoreNLPDependencyParser\033[0m instead.",
            DeprecationWarning, stacklevel=2)

        super(StanfordDependencyParser, self).__init__(*args, **kwargs)

    def _make_tree(self, result):
        return DependencyGraph(result, top_relation_label='root')
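# A minimal usage sketch, not part of the original module: each parse yielded
# by StanfordDependencyParser is a DependencyGraph, so besides the .tree() and
# .triples() views shown in the doctests above, a CoNLL serialization is also
# available.
def _demo_dependency_parse():
    dep_parser = StanfordDependencyParser()
    parse = next(dep_parser.raw_parse('The quick brown fox jumps over the lazy dog.'))
    print(parse.to_conll(4))  # columns: word, tag, head index, relation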
class StanfordNeuralDependencyParser(GenericStanfordParser):
    '''
    >>> from nltk.parse.stanford import StanfordNeuralDependencyParser
    >>> dep_parser=StanfordNeuralDependencyParser(java_options='-mx4g')

    >>> [parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE
    [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy']), '.'])]

    >>> [list(parse.triples()) for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE
    [[((u'jumps', u'VBZ'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det', (u'The', u'DT')),
    ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'), u'amod', (u'brown', u'JJ')),
    ((u'jumps', u'VBZ'), u'nmod', (u'dog', u'NN')), ((u'dog', u'NN'), u'case', (u'over', u'IN')),
    ((u'dog', u'NN'), u'det', (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ')),
    ((u'jumps', u'VBZ'), u'punct', (u'.', u'.'))]]

    >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.raw_parse_sents((
    ...     "The quick brown fox jumps over the lazy dog.",
    ...     "The quick grey wolf jumps over the lazy fox."
    ... ))], []) # doctest: +NORMALIZE_WHITESPACE
    [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy']), '.']),
    Tree('jumps', [Tree('wolf', ['The', 'quick', 'grey']), Tree('fox', ['over', 'the', 'lazy']), '.'])]

    >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.parse_sents((
    ...     "I 'm a dog".split(),
    ...     "This is my friends ' cat ( the tabby )".split(),
    ... ))], []) # doctest: +NORMALIZE_WHITESPACE
    [Tree('dog', ['I', "'m", 'a']), Tree('cat', ['This', 'is', Tree('friends', ['my', "'"]),
    Tree('tabby', ['-LRB-', 'the', '-RRB-'])])]
    '''

    _OUTPUT_FORMAT = 'conll'
    _MAIN_CLASS = 'edu.stanford.nlp.pipeline.StanfordCoreNLP'
    _JAR = r'stanford-corenlp-(\d+)(\.(\d+))+\.jar'
    _MODEL_JAR_PATTERN = r'stanford-corenlp-(\d+)(\.(\d+))+-models\.jar'
    _USE_STDIN = True
    _DOUBLE_SPACED_OUTPUT = True

    def __init__(self, *args, **kwargs):
        warnings.warn(
            "The StanfordNeuralDependencyParser will be deprecated\n"
            "Please use \033[91mnltk.parse.corenlp.CoreNLPDependencyParser\033[0m instead.",
            DeprecationWarning, stacklevel=2)

        super(StanfordNeuralDependencyParser, self).__init__(*args, **kwargs)
        # A leading space keeps this from fusing with any options the caller
        # supplied; _execute() splits the string into individual arguments.
        self.corenlp_options += ' -annotators tokenize,ssplit,pos,depparse'
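    # Pipeline note, added for clarity: CoreNLP's 'depparse' annotator depends
    # on 'tokenize', 'ssplit' and 'pos' having run before it, which is why
    # __init__ requests all four annotators.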
    def tagged_parse_sents(self, sentences, verbose=False):
        '''
        Currently unimplemented because the neural dependency parser (and
        the StanfordCoreNLP pipeline class) doesn't support passing in pre-
        tagged tokens.
        '''
        raise NotImplementedError(
            'tagged_parse[_sents] is not supported by '
            'StanfordNeuralDependencyParser; use '
            'parse[_sents] or raw_parse[_sents] instead.'
        )

    def _make_tree(self, result):
        return DependencyGraph(result, top_relation_label='ROOT')


def setup_module(module):
    from nose import SkipTest

    try:
        StanfordParser(
            model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz'
        )
        StanfordNeuralDependencyParser()
    except LookupError:
        raise SkipTest(
            'doctests from nltk.parse.stanford are skipped because one of the '
            "stanford parser or CoreNLP jars doesn't exist"
        )
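if __name__ == '__main__':
    # A hedged smoke test, not part of the original module: running these
    # doctests requires the Stanford Parser / CoreNLP jars to be installed
    # and discoverable (see _stanford_url).
    import doctest

    doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)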