laywerrobot/lib/python3.6/site-packages/nltk/classify/weka.py

# Natural Language Toolkit: Interface to Weka Classifiers
#
# Copyright (C) 2001-2018 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

"""
Classifiers that make use of the external 'Weka' package.
"""
from __future__ import print_function
import time
import tempfile
import os
import subprocess
import re
import zipfile
from sys import stdin

from six import integer_types, string_types

from nltk.probability import DictionaryProbDist
from nltk.internals import java, config_java

from nltk.classify.api import ClassifierI

# Path to weka.jar once it has been configured or discovered; None until
# config_weka() has set it.
_weka_classpath = None

# Directories that config_weka() probes for weka.jar when no explicit
# classpath has been supplied.
_weka_search = [
    '.',
    '/usr/share/weka',
    '/usr/local/share/weka',
    '/usr/lib/weka',
    '/usr/local/lib/weka',
]
def config_weka(classpath=None):
    """
    Configure the location of ``weka.jar`` used by this module.

    The result is stored in the module-level ``_weka_classpath``.  If a
    classpath has already been configured (by an earlier call), the
    search is skipped.

    :param classpath: Explicit value to use as the Weka classpath.  When
        given, it replaces any previously configured value and no
        directory search is performed.
    :raise LookupError: If no classpath is configured and ``weka.jar``
        cannot be found in ``$WEKAHOME`` or in any ``_weka_search``
        directory.
    """
    global _weka_classpath

    # Make sure java's configured first.
    config_java()

    if classpath is not None:
        _weka_classpath = classpath

    if _weka_classpath is None:
        # Work on a copy: inserting WEKAHOME into the module-level list
        # would add another duplicate entry on every unsuccessful call.
        searchpath = list(_weka_search)
        if 'WEKAHOME' in os.environ:
            searchpath.insert(0, os.environ['WEKAHOME'])

        for path in searchpath:
            candidate = os.path.join(path, 'weka.jar')
            if os.path.exists(candidate):
                _weka_classpath = candidate
                version = _check_weka_version(_weka_classpath)
                if version:
                    print(('[Found Weka: %s (version %s)]' %
                           (_weka_classpath, version)))
                else:
                    print('[Found Weka: %s]' % _weka_classpath)
                # Stop at the first hit so that earlier entries (WEKAHOME
                # in particular) take precedence over later ones.
                break

    if _weka_classpath is None:
        raise LookupError('Unable to find weka.jar!  Use config_weka() '
                          'or set the WEKAHOME environment variable. '
                          'For more information about Weka, please see '
                          'http://www.cs.waikato.ac.nz/ml/weka/')

def _check_weka_version(jar):
    """
    Read the Weka version string stored inside a ``weka.jar`` archive.

    :param jar: Path to (or file object for) the jar file.
    :return: The contents of ``weka/core/version.txt`` as text with
        surrounding whitespace removed, or ``None`` if the archive cannot
        be opened or does not contain that entry.
    """
    try:
        zf = zipfile.ZipFile(jar)
    except Exception:
        # Best effort: an unreadable or invalid jar just means "version
        # unknown".  SystemExit/KeyboardInterrupt are not Exception
        # subclasses, so they still propagate.
        return None

    with zf:
        try:
            version = zf.read('weka/core/version.txt')
        except KeyError:
            # The archive has no version file.
            return None

    # ZipFile.read() returns bytes on Python 3; decode so callers can
    # format the version without getting a b'...' repr.
    if not isinstance(version, str):
        version = version.decode('utf-8', 'replace')
    return version.strip()

class WekaClassifier(ClassifierI):
    """
    A classifier that delegates to the external Weka package.

    Featuresets are written to a temporary ARFF file, Weka is run as a
    java subprocess against a previously saved model file, and Weka's
    textual output is parsed back into labels or probability
    distributions.  Use ``train`` to build a model file and obtain a
    classifier for it.
    """

    def __init__(self, formatter, model_filename):
        """
        :param formatter: The ``ARFF_Formatter`` used to write featuresets
            for Weka and to supply the label list when parsing
            distributions.
        :param model_filename: Path of the Weka model file that is passed
            to Weka's ``-l`` option when classifying.
        """
        self._formatter = formatter
        self._model = model_filename

    def prob_classify_many(self, featuresets):
        """
        Classify the featuresets, asking Weka for distribution output
        (``-p 0 -distribution``).

        :raise ValueError: If Weka rejects the ``-distribution`` option
            or produces output that cannot be parsed.
        """
        return self._classify_many(featuresets, ['-p', '0', '-distribution'])

    def classify_many(self, featuresets):
        """
        Classify the featuresets, asking Weka for prediction output
        (``-p 0``).

        :raise ValueError: If Weka produces no usable output or output
            that cannot be parsed.
        """
        return self._classify_many(featuresets, ['-p', '0'])

    @staticmethod
    def _decode_output(data, errors='strict'):
        """
        Convert raw subprocess output to text.

        Bytes are decoded with ``stdin``'s encoding, falling back to
        UTF-8 when ``stdin`` is missing or reports no encoding.  ``None``
        becomes the empty string; text is returned unchanged.
        """
        if data is None:
            return ''
        if isinstance(data, bytes):
            encoding = getattr(stdin, 'encoding', None) or 'utf-8'
            return data.decode(encoding, errors)
        return data

    @staticmethod
    def _remove_temp_dir(temp_dir):
        """Delete the files directly inside ``temp_dir``, then the
        directory itself."""
        for f in os.listdir(temp_dir):
            os.remove(os.path.join(temp_dir, f))
        os.rmdir(temp_dir)

    def _classify_many(self, featuresets, options):
        """
        Run Weka on ``featuresets`` with the saved model and parse the
        result.

        :param featuresets: The featuresets to classify; they are written
            to a temporary ARFF file by the formatter.
        :param options: Extra command-line options appended to the Weka
            invocation (a list of strings).
        :return: Whatever ``parse_weka_output`` returns for Weka's stdout.
        :raise ValueError: If Weka writes to stderr without producing any
            stdout, or if its output format is not recognised.
        """
        # Make sure we can find java & weka.
        config_weka()

        temp_dir = tempfile.mkdtemp()
        try:
            # Write the test data file.
            test_filename = os.path.join(temp_dir, 'test.arff')
            self._formatter.write(test_filename, featuresets)

            # Call weka to classify the data.
            cmd = ['weka.classifiers.bayes.NaiveBayes',
                   '-l', self._model, '-T', test_filename] + options
            (stdout, stderr) = java(cmd, classpath=_weka_classpath,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE)

            # Check if something went wrong:
            if stderr and not stdout:
                # The pipes deliver bytes; decode before doing a text
                # search (a str-in-bytes test raises TypeError on
                # Python 3).  Undecodable bytes are replaced since this
                # text is only used for diagnostics.
                message = self._decode_output(stderr, errors='replace')
                if 'Illegal options: -distribution' in message:
                    raise ValueError('The installed version of weka does '
                                     'not support probability distribution '
                                     'output.')
                else:
                    raise ValueError('Weka failed to generate output:\n%s'
                                     % message)

            # Parse weka's output.
            return self.parse_weka_output(
                self._decode_output(stdout).split('\n'))

        finally:
            self._remove_temp_dir(temp_dir)

    def parse_weka_distribution(self, s):
        """
        Parse one distribution field from Weka's output.

        :param s: A string of numbers separated by ``,`` and/or ``*``
            characters.
        :return: A ``DictionaryProbDist`` pairing the formatter's labels,
            in order, with the parsed numbers.
        """
        probs = [float(v) for v in re.split('[*,]+', s) if v.strip()]
        probs = dict(zip(self._formatter.labels(), probs))
        return DictionaryProbDist(probs)

    def parse_weka_output(self, lines):
        """
        Parse the lines of Weka's stdout.

        Three layouts are recognised: a table whose header ends in
        ``prediction`` (returns a list of predicted labels), a table
        whose header ends in ``distribution`` (returns a list of
        ``DictionaryProbDist``), and a headerless layout whose first line
        looks like ``0 <label> <number> ?`` (returns the second column of
        every line).

        :param lines: Weka's stdout, split into lines.
        :raise ValueError: If the output matches none of these layouts;
            the first ten lines are printed beforehand.
        """
        # Strip unwanted text from stdout
        for i, line in enumerate(lines):
            if line.strip().startswith("inst#"):
                lines = lines[i:]
                break

        header = lines[0].split()
        if header == ['inst#', 'actual', 'predicted',
                      'error', 'prediction']:
            # The third column has the form "<index>:<label>".
            return [line.split()[2].split(':')[1]
                    for line in lines[1:] if line.strip()]
        elif header == ['inst#', 'actual', 'predicted',
                        'error', 'distribution']:
            return [self.parse_weka_distribution(line.split()[-1])
                    for line in lines[1:] if line.strip()]

        # is this safe:?
        elif re.match(r'^0 \w+ [01]\.[0-9]* \?\s*$', lines[0]):
            return [line.split()[1] for line in lines if line.strip()]

        else:
            for line in lines[:10]:
                print(line)
            raise ValueError('Unhandled output format -- your version '
                             'of weka may not be supported.\n'
                             '  Header: %s' % lines[0])


    # [xx] full list of classifiers (some may be abstract?):
    # ADTree, AODE, BayesNet, ComplementNaiveBayes, ConjunctiveRule,
    # DecisionStump, DecisionTable, HyperPipes, IB1, IBk, Id3, J48,
    # JRip, KStar, LBR, LeastMedSq, LinearRegression, LMT, Logistic,
    # LogisticBase, M5Base, MultilayerPerceptron,
    # MultipleClassifiersCombiner, NaiveBayes, NaiveBayesMultinomial,
    # NaiveBayesSimple, NBTree, NNge, OneR, PaceRegression, PART,
    # PreConstructedLinearModel, Prism, RandomForest,
    # RandomizableClassifier, RandomTree, RBFNetwork, REPTree, Ridor,
    # RuleNode, SimpleLinearRegression, SimpleLogistic,
    # SingleClassifierEnhancer, SMO, SMOreg, UserClassifier, VFI,
    # VotedPerceptron, Winnow, ZeroR

    # Short names accepted by train(), mapped to Weka java class names.
    _CLASSIFIER_CLASS = {
        'naivebayes': 'weka.classifiers.bayes.NaiveBayes',
        'C4.5': 'weka.classifiers.trees.J48',
        'log_regression': 'weka.classifiers.functions.Logistic',
        'svm': 'weka.classifiers.functions.SMO',
        'kstar': 'weka.classifiers.lazy.KStar',
        'ripper': 'weka.classifiers.rules.JRip',
        }

    @classmethod
    def train(cls, model_filename, featuresets,
              classifier='naivebayes', options=[], quiet=True):
        """
        Train a Weka model and return a classifier that uses it.

        :param model_filename: Path passed to Weka's ``-d`` option; the
            returned classifier reads the model from the same path.
        :param featuresets: Labeled featuresets, i.e. ``(featureset,
            label)`` pairs, used both to infer the ARFF schema and as
            training data.
        :param classifier: Either a key of ``_CLASSIFIER_CLASS`` or one
            of its java class names.
        :param options: Extra command-line options appended to the Weka
            invocation.  The default list is never modified.
        :param quiet: If true, Weka's stdout is captured through a pipe;
            otherwise no stdout redirection is requested.
        :raise ValueError: If ``classifier`` is not recognised.
        """
        # Make sure we can find java & weka.
        config_weka()

        # Build an ARFF formatter.
        formatter = ARFF_Formatter.from_train(featuresets)

        temp_dir = tempfile.mkdtemp()
        try:
            # Write the training data file.
            train_filename = os.path.join(temp_dir, 'train.arff')
            formatter.write(train_filename, featuresets)

            if classifier in cls._CLASSIFIER_CLASS:
                javaclass = cls._CLASSIFIER_CLASS[classifier]
            elif classifier in cls._CLASSIFIER_CLASS.values():
                javaclass = classifier
            else:
                raise ValueError('Unknown classifier %s' % classifier)

            # Train the weka model.
            cmd = [javaclass, '-d', model_filename, '-t', train_filename]
            cmd += list(options)
            stdout = subprocess.PIPE if quiet else None
            java(cmd, classpath=_weka_classpath, stdout=stdout)

            # Return the new classifier.
            return WekaClassifier(formatter, model_filename)

        finally:
            cls._remove_temp_dir(temp_dir)


class ARFF_Formatter:
    """
    Converts featuresets and labeled featuresets to ARFF-formatted
    strings, appropriate for input into Weka.

    Features and classes can be specified manually in the constructor, or may
    be determined from data using ``from_train``.
    """

    def __init__(self, labels, features):
        """
        :param labels: A list of all class labels that can be generated.
        :param features: A list of feature specifications, where
            each feature specification is a tuple (fname, ftype);
            and ftype is an ARFF type string such as NUMERIC or
            STRING.
        """
        self._labels = labels
        self._features = features

    def format(self, tokens):
        """Returns a string representation of ARFF output for the given data."""
        return self.header_section() + self.data_section(tokens)

    def labels(self):
        """Returns the list of classes."""
        return list(self._labels)

    def write(self, outfile, tokens):
        """
        Writes ARFF data to a file for the given data.

        :param outfile: Either an object with a ``write`` method or a
            filename to open for writing.  In both cases the file is
            closed after a successful write.
        :param tokens: The data to format; see ``data_section``.
        """
        if hasattr(outfile, 'write'):
            outfile.write(self.format(tokens))
            outfile.close()
        else:
            # Use a context manager so a file opened here is closed even
            # if formatting or writing raises.
            with open(outfile, 'w') as stream:
                stream.write(self.format(tokens))

    @staticmethod
    def from_train(tokens):
        """
        Constructs an ARFF_Formatter instance with class labels and feature
        types determined from the given data. Handles boolean, numeric and
        string (note: not nominal) types.

        :param tokens: An iterable of ``(featureset, label)`` pairs.  It
            is traversed once, so an iterator is acceptable.
        :raise ValueError: If a feature value has an unsupported type, or
            if a feature has different types in different featuresets.
        """
        labels = set()
        features = {}
        for tok, label in tokens:
            # Collect the set of all attested labels.
            labels.add(label)

            # Determine the types of all features.
            for (fname, fval) in tok.items():
                # bool must be tested first: it is a subclass of int.
                if isinstance(fval, bool):
                    ftype = '{True, False}'
                elif isinstance(fval, (integer_types, float)):
                    ftype = 'NUMERIC'
                elif isinstance(fval, string_types):
                    ftype = 'STRING'
                elif fval is None:
                    continue # can't tell the type.
                else:
                    # Report the offending value's type (ftype is not
                    # meaningful on this path).
                    raise ValueError('Unsupported value type %r'
                                     % type(fval))

                if features.get(fname, ftype) != ftype:
                    raise ValueError('Inconsistent type for %s' % fname)
                features[fname] = ftype
        features = sorted(features.items())

        return ARFF_Formatter(labels, features)

    def header_section(self):
        """Returns an ARFF header as a string."""
        parts = [
            # Header comment.
            '% Weka ARFF file\n',
            '% Generated automatically by NLTK\n',
            '%% %s\n\n' % time.ctime(),
            # Relation name
            '@RELATION rel\n\n',
        ]

        # Input attribute specifications
        parts.extend('@ATTRIBUTE %-30r %s\n' % (fname, ftype)
                     for fname, ftype in self._features)

        # Label attribute specification
        parts.append('@ATTRIBUTE %-30r {%s}\n'
                     % ('-label-', ','.join(self._labels)))

        return ''.join(parts)

    def data_section(self, tokens, labeled=None):
        """
        Returns the ARFF data section for the given data.

        :param tokens: a list of featuresets (dicts) or labelled featuresets
            which are tuples (featureset, label).
        :param labeled: Indicates whether the given tokens are labeled
            or not.  If None, then the tokens will be assumed to be
            labeled if the first token's value is a tuple or list.
        """
        # Check if the tokens are labeled or unlabeled.  If unlabeled,
        # then use 'None'
        if labeled is None:
            labeled = tokens and isinstance(tokens[0], (tuple, list))
        if not labeled:
            tokens = [(tok, None) for tok in tokens]

        # Data section: one comma-separated row per token, with the
        # label as the last column.
        rows = ['\n@DATA\n']
        for (tok, label) in tokens:
            values = [self._fmt_arff_val(tok.get(fname))
                      for fname, ftype in self._features]
            values.append(self._fmt_arff_val(label))
            rows.append(','.join(values) + '\n')

        return ''.join(rows)

    def _fmt_arff_val(self, fval):
        """
        Formats a single value for the data section: ``?`` for None,
        ``str()`` form for booleans and integers, ``repr()`` form for
        everything else.
        """
        if fval is None:
            return '?'
        elif isinstance(fval, (bool, integer_types)):
            return '%s' % fval
        else:
            return '%r' % fval


if __name__ == '__main__':
    # Demo: train a Weka C4.5 (J48) classifier through the names demo
    # helper from nltk.classify.util.
    from nltk.classify.util import names_demo, binary_names_demo_features

    def make_classifier(featuresets):
        """Train a 'C4.5' Weka model on the given featuresets."""
        # Store the model in the platform's temp directory instead of a
        # hard-coded /tmp, which does not exist on every system.
        model_path = os.path.join(tempfile.gettempdir(), 'name.model')
        return WekaClassifier.train(model_path, featuresets, 'C4.5')

    classifier = names_demo(make_classifier, binary_names_demo_features)
first commit 2020-08-27 21:55:39 +02:00			`# Natural Language Toolkit: Interface to Weka Classsifiers`
			`#`
			`# Copyright (C) 2001-2018 NLTK Project`
			`# Author: Edward Loper <edloper@gmail.com>`
			`# URL: <http://nltk.org/>`
			`# For license information, see LICENSE.TXT`

			`"""`
			`Classifiers that make use of the external 'Weka' package.`
			`"""`
			`from __future__ import print_function`
			`import time`
			`import tempfile`
			`import os`
			`import subprocess`
			`import re`
			`import zipfile`
			`from sys import stdin`

			`from six import integer_types, string_types`

			`from nltk.probability import DictionaryProbDist`
			`from nltk.internals import java, config_java`

			`from nltk.classify.api import ClassifierI`

			`_weka_classpath = None`
			`_weka_search = ['.',`
			`'/usr/share/weka',`
			`'/usr/local/share/weka',`
			`'/usr/lib/weka',`
			`'/usr/local/lib/weka',]`
			`def config_weka(classpath=None):`
			`global _weka_classpath`

			`# Make sure java's configured first.`
			`config_java()`

			`if classpath is not None:`
			`_weka_classpath = classpath`

			`if _weka_classpath is None:`
			`searchpath = _weka_search`
			`if 'WEKAHOME' in os.environ:`
			`searchpath.insert(0, os.environ['WEKAHOME'])`

			`for path in searchpath:`
			`if os.path.exists(os.path.join(path, 'weka.jar')):`
			`_weka_classpath = os.path.join(path, 'weka.jar')`
			`version = _check_weka_version(_weka_classpath)`
			`if version:`
			`print(('[Found Weka: %s (version %s)]' %`
			`(_weka_classpath, version)))`
			`else:`
			`print('[Found Weka: %s]' % _weka_classpath)`
			`_check_weka_version(_weka_classpath)`

			`if _weka_classpath is None:`
			`raise LookupError('Unable to find weka.jar! Use config_weka() '`
			`'or set the WEKAHOME environment variable. '`
			`'For more information about Weka, please see '`
			`'http://www.cs.waikato.ac.nz/ml/weka/')`

			`def _check_weka_version(jar):`
			`try:`
			`zf = zipfile.ZipFile(jar)`
			`except (SystemExit, KeyboardInterrupt):`
			`raise`
			`except:`
			`return None`
			`try:`
			`try:`
			`return zf.read('weka/core/version.txt')`
			`except KeyError:`
			`return None`
			`finally:`
			`zf.close()`

			`class WekaClassifier(ClassifierI):`
			`def __init__(self, formatter, model_filename):`
			`self._formatter = formatter`
			`self._model = model_filename`

			`def prob_classify_many(self, featuresets):`
			`return self._classify_many(featuresets, ['-p', '0', '-distribution'])`

			`def classify_many(self, featuresets):`
			`return self._classify_many(featuresets, ['-p', '0'])`

			`def _classify_many(self, featuresets, options):`
			`# Make sure we can find java & weka.`
			`config_weka()`

			`temp_dir = tempfile.mkdtemp()`
			`try:`
			`# Write the test data file.`
			`test_filename = os.path.join(temp_dir, 'test.arff')`
			`self._formatter.write(test_filename, featuresets)`

			`# Call weka to classify the data.`
			`cmd = ['weka.classifiers.bayes.NaiveBayes',`
			`'-l', self._model, '-T', test_filename] + options`
			`(stdout, stderr) = java(cmd, classpath=_weka_classpath,`
			`stdout=subprocess.PIPE,`
			`stderr=subprocess.PIPE)`

			`# Check if something went wrong:`
			`if stderr and not stdout:`
			`if 'Illegal options: -distribution' in stderr:`
			`raise ValueError('The installed version of weka does '`
			`'not support probability distribution '`
			`'output.')`
			`else:`
			`raise ValueError('Weka failed to generate output:\n%s'`
			`% stderr)`

			`# Parse weka's output.`
			`return self.parse_weka_output(stdout.decode(stdin.encoding).split('\n'))`

			`finally:`
			`for f in os.listdir(temp_dir):`
			`os.remove(os.path.join(temp_dir, f))`
			`os.rmdir(temp_dir)`

			`def parse_weka_distribution(self, s):`
			`probs = [float(v) for v in re.split('[*,]+', s) if v.strip()]`
			`probs = dict(zip(self._formatter.labels(), probs))`
			`return DictionaryProbDist(probs)`

			`def parse_weka_output(self, lines):`
			`# Strip unwanted text from stdout`
			`for i,line in enumerate(lines):`
			`if line.strip().startswith("inst#"):`
			`lines = lines[i:]`
			`break`

			`if lines[0].split() == ['inst#', 'actual', 'predicted',`
			`'error', 'prediction']:`
			`return [line.split()[2].split(':')[1]`
			`for line in lines[1:] if line.strip()]`
			`elif lines[0].split() == ['inst#', 'actual', 'predicted',`
			`'error', 'distribution']:`
			`return [self.parse_weka_distribution(line.split()[-1])`
			`for line in lines[1:] if line.strip()]`

			`# is this safe:?`
			`elif re.match(r'^0 \w+ [01]\.[0-9]* \?\s*$', lines[0]):`
			`return [line.split()[1] for line in lines if line.strip()]`

			`else:`
			`for line in lines[:10]:`
			`print(line)`
			`raise ValueError('Unhandled output format -- your version '`
			`'of weka may not be supported.\n'`
			`' Header: %s' % lines[0])`


			`# [xx] full list of classifiers (some may be abstract?):`
			`# ADTree, AODE, BayesNet, ComplementNaiveBayes, ConjunctiveRule,`
			`# DecisionStump, DecisionTable, HyperPipes, IB1, IBk, Id3, J48,`
			`# JRip, KStar, LBR, LeastMedSq, LinearRegression, LMT, Logistic,`
			`# LogisticBase, M5Base, MultilayerPerceptron,`
			`# MultipleClassifiersCombiner, NaiveBayes, NaiveBayesMultinomial,`
			`# NaiveBayesSimple, NBTree, NNge, OneR, PaceRegression, PART,`
			`# PreConstructedLinearModel, Prism, RandomForest,`
			`# RandomizableClassifier, RandomTree, RBFNetwork, REPTree, Ridor,`
			`# RuleNode, SimpleLinearRegression, SimpleLogistic,`
			`# SingleClassifierEnhancer, SMO, SMOreg, UserClassifier, VFI,`
			`# VotedPerceptron, Winnow, ZeroR`

			`_CLASSIFIER_CLASS = {`
			`'naivebayes': 'weka.classifiers.bayes.NaiveBayes',`
			`'C4.5': 'weka.classifiers.trees.J48',`
			`'log_regression': 'weka.classifiers.functions.Logistic',`
			`'svm': 'weka.classifiers.functions.SMO',`
			`'kstar': 'weka.classifiers.lazy.KStar',`
			`'ripper': 'weka.classifiers.rules.JRip',`
			`}`
			`@classmethod`
			`def train(cls, model_filename, featuresets,`
			`classifier='naivebayes', options=[], quiet=True):`
			`# Make sure we can find java & weka.`
			`config_weka()`

			`# Build an ARFF formatter.`
			`formatter = ARFF_Formatter.from_train(featuresets)`

			`temp_dir = tempfile.mkdtemp()`
			`try:`
			`# Write the training data file.`
			`train_filename = os.path.join(temp_dir, 'train.arff')`
			`formatter.write(train_filename, featuresets)`

			`if classifier in cls._CLASSIFIER_CLASS:`
			`javaclass = cls._CLASSIFIER_CLASS[classifier]`
			`elif classifier in cls._CLASSIFIER_CLASS.values():`
			`javaclass = classifier`
			`else:`
			`raise ValueError('Unknown classifier %s' % classifier)`

			`# Train the weka model.`
			`cmd = [javaclass, '-d', model_filename, '-t', train_filename]`
			`cmd += list(options)`
			`if quiet:`
			`stdout = subprocess.PIPE`
			`else: stdout = None`
			`java(cmd, classpath=_weka_classpath, stdout=stdout)`

			`# Return the new classifier.`
			`return WekaClassifier(formatter, model_filename)`

			`finally:`
			`for f in os.listdir(temp_dir):`
			`os.remove(os.path.join(temp_dir, f))`
			`os.rmdir(temp_dir)`


			`class ARFF_Formatter:`
			`"""`
			`Converts featuresets and labeled featuresets to ARFF-formatted`
			`strings, appropriate for input into Weka.`

			`Features and classes can be specified manually in the constructor, or may`
			be determined from data using ``from_train``.
			`"""`

			`def __init__(self, labels, features):`
			`"""`
			`:param labels: A list of all class labels that can be generated.`
			`:param features: A list of feature specifications, where`
			`each feature specification is a tuple (fname, ftype);`
			`and ftype is an ARFF type string such as NUMERIC or`
			`STRING.`
			`"""`
			`self._labels = labels`
			`self._features = features`

			`def format(self, tokens):`
			`"""Returns a string representation of ARFF output for the given data."""`
			`return self.header_section() + self.data_section(tokens)`

			`def labels(self):`
			`"""Returns the list of classes."""`
			`return list(self._labels)`

			`def write(self, outfile, tokens):`
			`"""Writes ARFF data to a file for the given data."""`
			`if not hasattr(outfile, 'write'):`
			`outfile = open(outfile, 'w')`
			`outfile.write(self.format(tokens))`
			`outfile.close()`

			`@staticmethod`
			`def from_train(tokens):`
			`"""`
			`Constructs an ARFF_Formatter instance with class labels and feature`
			`types determined from the given data. Handles boolean, numeric and`
			`string (note: not nominal) types.`
			`"""`
			`# Find the set of all attested labels.`
			`labels = set(label for (tok, label) in tokens)`

			`# Determine the types of all features.`
			`features = {}`
			`for tok, label in tokens:`
			`for (fname, fval) in tok.items():`
			`if issubclass(type(fval), bool):`
			`ftype = '{True, False}'`
			`elif issubclass(type(fval), (integer_types, float, bool)):`
			`ftype = 'NUMERIC'`
			`elif issubclass(type(fval), string_types):`
			`ftype = 'STRING'`
			`elif fval is None:`
			`continue # can't tell the type.`
			`else:`
			`raise ValueError('Unsupported value type %r' % ftype)`

			`if features.get(fname, ftype) != ftype:`
			`raise ValueError('Inconsistent type for %s' % fname)`
			`features[fname] = ftype`
			`features = sorted(features.items())`

			`return ARFF_Formatter(labels, features)`

			`def header_section(self):`
			`"""Returns an ARFF header as a string."""`
			`# Header comment.`
			`s = ('% Weka ARFF file\n' +`
			`'% Generated automatically by NLTK\n' +`
			`'%% %s\n\n' % time.ctime())`

			`# Relation name`
			`s += '@RELATION rel\n\n'`

			`# Input attribute specifications`
			`for fname, ftype in self._features:`
			`s += '@ATTRIBUTE %-30r %s\n' % (fname, ftype)`

			`# Label attribute specification`
			`s += '@ATTRIBUTE %-30r {%s}\n' % ('-label-', ','.join(self._labels))`

			`return s`

			`def data_section(self, tokens, labeled=None):`
			`"""`
			`Returns the ARFF data section for the given data.`

			`:param tokens: a list of featuresets (dicts) or labelled featuresets`
			`which are tuples (featureset, label).`
			`:param labeled: Indicates whether the given tokens are labeled`
			`or not. If None, then the tokens will be assumed to be`
			`labeled if the first token's value is a tuple or list.`
			`"""`
			`# Check if the tokens are labeled or unlabeled. If unlabeled,`
			`# then use 'None'`
			`if labeled is None:`
			`labeled = tokens and isinstance(tokens[0], (tuple, list))`
			`if not labeled:`
			`tokens = [(tok, None) for tok in tokens]`

			`# Data section`
			`s = '\n@DATA\n'`
			`for (tok, label) in tokens:`
			`for fname, ftype in self._features:`
			`s += '%s,' % self._fmt_arff_val(tok.get(fname))`
			`s += '%s\n' % self._fmt_arff_val(label)`

			`return s`

			`def _fmt_arff_val(self, fval):`
			`if fval is None:`
			`return '?'`
			`elif isinstance(fval, (bool, integer_types)):`
			`return '%s' % fval`
			`elif isinstance(fval, float):`
			`return '%r' % fval`
			`else:`
			`return '%r' % fval`


			`if __name__ == '__main__':`
			`from nltk.classify.util import names_demo, binary_names_demo_features`
			`def make_classifier(featuresets):`
			`return WekaClassifier.train('/tmp/name.model', featuresets,`
			`'C4.5')`
			`classifier = names_demo(make_classifier, binary_names_demo_features)`