# -*- coding: utf-8 -*-
# Natural Language Toolkit: Language ID module using TextCat algorithm
#
# Copyright (C) 2001-2018 NLTK Project
# Author: Avital Pekker <avital.pekker@utoronto.ca>
#
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
A module for language identification using the TextCat algorithm.
An implementation of the text categorization algorithm
presented in Cavnar, W. B. and J. M. Trenkle,
"N-Gram-Based Text Categorization".
The algorithm takes advantage of Zipf's law and uses
n-gram frequencies to profile languages and text-yet to
be identified-then compares using a distance measure.
Language n-grams are provided by the "An Crubadan"
project. A corpus reader was created seperately to read
those files.
For details regarding the algorithm, see:
http://www.let.rug.nl/~vannoord/TextCat/textcat.pdf
For details about An Crubadan, see:
http://borel.slu.edu/crubadan/index.html
"""
# Ensure that literal strings default to unicode rather than str.
from __future__ import print_function, unicode_literals

from nltk.compat import PY3
from nltk.util import trigrams

if PY3:
    from sys import maxsize
else:
    from sys import maxint
# Note: this is NOT the "re" module you're likely used to. The regex
# module is an alternative to the standard re module that supports
# Unicode codepoint properties with the \p{} syntax.
# You may have to "pip install regex".
try:
    import regex as re
except ImportError:
    re = None
######################################################################
## Language identification using TextCat
######################################################################
class TextCat(object):

    _corpus = None
    fingerprints = {}
    _START_CHAR = "<"
    _END_CHAR = ">"

    last_distances = {}

    def __init__(self):
        if not re:
            raise EnvironmentError("classify.textcat requires the regex module that "
                                   "supports unicode. Try '$ pip install regex' and "
                                   "see https://pypi.python.org/pypi/regex for "
                                   "further details.")

        from nltk.corpus import crubadan
        self._corpus = crubadan
        # Load all language ngrams into cache
        for lang in self._corpus.langs():
            self._corpus.lang_freq(lang)
    def remove_punctuation(self, text):
        ''' Get rid of punctuation except apostrophes '''
        return re.sub(r"[^\P{P}\']+", "", text)
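    # Illustration (not from the original source): the \p{P} Unicode
    # property class matches punctuation, and the negated set keeps
    # apostrophes so contractions survive, e.g.
    #   self.remove_punctuation("don't, stop!")  ->  "don't stop"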
    def profile(self, text):
        ''' Create FreqDist of trigrams within text '''
        from nltk import word_tokenize, FreqDist

        clean_text = self.remove_punctuation(text)
        tokens = word_tokenize(clean_text)

        fingerprint = FreqDist()
        for t in tokens:
            token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
            token_trigrams = [''.join(tri) for tri in token_trigram_tuples]

            for cur_trigram in token_trigrams:
                if cur_trigram in fingerprint:
                    fingerprint[cur_trigram] += 1
                else:
                    fingerprint[cur_trigram] = 1

        return fingerprint
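    # For instance, the token "cat" is padded to "<cat>" and contributes
    # the trigrams "<ca", "cat" and "at>"; the boundary markers let
    # word-initial and word-final character sequences count as distinct
    # n-grams.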
    def calc_dist(self, lang, trigram, text_profile):
        ''' Calculate the "out-of-place" measure between the
            text and language profile for a single trigram '''

        lang_fd = self._corpus.lang_freq(lang)
        dist = 0

        if trigram in lang_fd:
            idx_lang_profile = list(lang_fd.keys()).index(trigram)
            idx_text = list(text_profile.keys()).index(trigram)

            # print(idx_lang_profile, ", ", idx_text)
            dist = abs(idx_lang_profile - idx_text)
        else:
            # Arbitrary but should be larger than
            # any possible trigram file length
            # in terms of total lines
            if PY3:
                dist = maxsize
            else:
                dist = maxint

        return dist
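    # Worked example (illustrative numbers): if a trigram sits at
    # position 3 in the language profile and position 7 in the text
    # profile, its out-of-place distance is |3 - 7| = 4; trigrams the
    # language profile lacks get the maximum penalty above.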
    def lang_dists(self, text):
        ''' Calculate the "out-of-place" measure between
            the text and all languages '''

        distances = {}
        profile = self.profile(text)
        # For all the languages
        for lang in self._corpus._all_lang_freq.keys():
            # Calculate distance metric for every trigram in
            # input text to be identified
            lang_dist = 0
            for trigram in profile:
                lang_dist += self.calc_dist(lang, trigram, profile)

            distances[lang] = lang_dist

        return distances
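    # Note: a language's total distance is simply the sum of the
    # per-trigram out-of-place values, so the smallest total (picked
    # below) is the closest match.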
    def guess_language(self, text):
        ''' Find the language with the min distance
            to the text and return its ISO 639-3 code '''
        self.last_distances = self.lang_dists(text)

        return min(self.last_distances, key=self.last_distances.get)
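    # Typical call (a minimal sketch; the exact code returned depends on
    # the installed crubadan profiles):
    #   tc = TextCat()
    #   tc.guess_language('Dies ist ein deutscher Satz.')  # e.g. 'deu'
    #   tc.last_distances  # per-language totals from the last guess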
######################################################################
def demo():
    from nltk.corpus import udhr

    langs = ['Kurdish-UTF8', 'Abkhaz-UTF8', 'Farsi_Persian-UTF8',
             'Hindi-UTF8', 'Hawaiian-UTF8', 'Russian-UTF8',
             'Vietnamese-UTF8', 'Serbian_Srpski-UTF8', 'Esperanto-UTF8']

    friendly = {'kmr': 'Northern Kurdish',
                'abk': 'Abkhazian',
                'pes': 'Iranian Persian',
                'hin': 'Hindi',
                'haw': 'Hawaiian',
                'rus': 'Russian',
                'vie': 'Vietnamese',
                'srp': 'Serbian',
                'epo': 'Esperanto'}
    tc = TextCat()

    for cur_lang in langs:
        # Get raw data from UDHR corpus
        raw_sentences = udhr.sents(cur_lang)
        rows = len(raw_sentences) - 1
        cols = list(map(len, raw_sentences))

        sample = ''

        # Generate a sample text of the language
        for i in range(0, rows):
            cur_sent = ''
            for j in range(0, cols[i]):
                cur_sent += ' ' + raw_sentences[i][j]

            sample += cur_sent

        # Try to detect what it is
        print('Language snippet: ' + sample[0:140] + '...')
        guess = tc.guess_language(sample)
        print('Language detection: %s (%s)' % (guess, friendly[guess]))
        print('#' * 140)
if __name__ == '__main__':
    demo()