basabuuka_prototyp/Prototyp/SentGlue.py

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords  # Import the stop word list
from sklearn.linear_model import SGDClassifier
from sklearn import svm
import scipy
from sklearn import preprocessing
import joblib  # sklearn.externals.joblib was removed in scikit-learn 0.23; use the standalone joblib package


def comment_to_words(raw_comment):
    # Function to convert a raw comment to a string of words
    # The input is a single string (a raw comment), and
    # the output is a single string (a preprocessed comment)
    #
    # 1. Remove HTML
    comment_text = BeautifulSoup(raw_comment, "html.parser").get_text()
    #
    # 2. Remove non-letters
    #letters_only = re.sub("[^a-zA-Z]", " ", comment_text)
    #
    # 3. Convert to lower case, split into individual words
    words = comment_text.split()
    #words = letters_only.split()
    #
    # 4. In Python, searching a set is much faster than searching
    #    a list, so convert the stop words to a set
    stops = set(stopwords.words("german"))
    #
    # 5. Remove stop words
    #meaningful_words = [w for w in words if not w in stops]
    meaningful_words = [w for w in words]
    #
    # 6. Join the words back into one string separated by space,
    #    and return the result.
    return " ".join(meaningful_words)
class SentGlueMach(object):
    def __init__(self, SGDModel, bagofwords):
        self.sgdmodel = SGDModel
        self.bow = bagofwords

    def initialize(self):
        from sklearn.feature_extraction.text import CountVectorizer
        #print('loading vectorizer..')
        self.vectorizer = joblib.load(self.bow)
        #print('done')
        #print('loading the SGD model..')
        self.clf = joblib.load(self.sgdmodel)
        #print('done')
        #print('loading spacy..')
        import spacy
        self.nlp = spacy.load('de_core_news_sm')
        #print('done')
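
    # Note on the expected artifacts (an assumption read off the calls above): self.bow
    # should point to a pickled, fitted CountVectorizer and self.sgdmodel to a pickled
    # classifier. Because predict_proba() is used further down, an SGDClassifier must
    # have been trained with a probabilistic loss such as loss='log' or
    # loss='modified_huber'.
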
    def predictandevalOnCsv(self, csvdata):
        self.comment_in = pd.read_csv(csvdata, header=0, quotechar='"',
                                      delimiter=',')
        #print('here is the comment, in format', self.comment_in)
        self.num_comments_in = self.comment_in["Kommentar"].size
        #print("Cleaning and parsing the evaluation set comments...\n")
        clean_comments_in = []
        for i in range(0, self.num_comments_in):
            # If the index is evenly divisible by 1000, print a message
            #if (i + 1) % 1000 == 0:
            #    print("comment %d of %d\n" % (i + 1, self.num_comments_in))
            #print(self.comment_in["Kommentar"][i])
            clean_comments_in.append(comment_to_words(self.comment_in["Kommentar"][i]))
        #print(clean_comments_in)
        comments_in_vector = self.vectorizer.transform(clean_comments_in)
        #print('da comments', comments_in_vector)
        # Numpy arrays are easy to work with, so convert the result to an
        # array
        self.comments_in_vector = comments_in_vector.toarray()
        #print(comments_in_vector)
        #print('here are the comments in vector, input to the predictmach', self.comments_in_vector)
        X_val = self.clf.predict(self.comments_in_vector)
        #print(X_val)
        Y_val = []
        for n in range(self.num_comments_in):
            Y_val.append(self.comment_in["correct or not"][n])
        for r in range(self.num_comments_in):
            d = Y_val[r]
            # The gold label in the CSV carries a leading space (' correct')
            if d == ' correct':
                Y_val[r] = 1
            else:
                Y_val[r] = 0
        # Row 0 holds the predictions, row 1 the gold labels
        XY_val = np.zeros((2, self.num_comments_in))
        for n in range(self.num_comments_in):
            XY_val[0][n] = X_val[n]
            XY_val[1][n] = Y_val[n]
        #print(XY_val)
        count = 0
        for n in range(self.num_comments_in):
            if XY_val[0][n] == XY_val[1][n]:
                count = count + 1
        #print('The number of correct predictions is:', count)
        Proz = (count * 100) / self.num_comments_in
        #print('Percentage predicted correctly:', Proz, '%')
        return XY_val, Proz
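
    # Minimal usage sketch for predictandevalOnCsv (the file name 'eval.csv' and the row
    # contents are placeholders; the column names "Kommentar" and "correct or not" are the
    # ones read above, and the gold label must be the string ' correct', with a leading
    # space, to count as positive; anything else counts as incorrect):
    #
    #   Kommentar,correct or not
    #   "ein Beispielsatz ...", correct
    #   "Beispielsatz ein ...", not correct
    #
    #   XY_val, Proz = machine.predictandevalOnCsv('eval.csv')  # Proz = accuracy in percent
    # where 'machine' is an initialized SentGlueMach instance (see the sketch at the end
    # of this file).
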
    # input: ['this is sentence a', 'and then comes the next sentence', 'and so on'], ['the ori sentence is']
    def predictprobsOnSentenceList(self, beforeprocessingSentenceList, orisentence):
        oridocs = 0
        if type(orisentence) == str:
            oridoc = self.nlp(orisentence)
        else:
            oridocs = 1
        SentenceList = []
        count = 0
        for sentence in beforeprocessingSentenceList:
            count += 1
            doc = self.nlp(sentence)
            if oridocs == 1:
                oridoc = self.nlp(orisentence[count - 1])
            #if count % 100 == 0:
            #    print(count)
            depssentence = []
            tagssentence = []
            for word in doc:
                for word2 in oridoc:
                    if word.text == word2.text:
                        depssentence.append(word2.dep_)
                        break
                tagssentence.append(word.tag_)
            deps = ' '
            tags = ' '
            for x in depssentence:
                #print(x)
                deps += str(x) + ' '
            # Build the POS tag string from the tags collected above
            for x in tagssentence:
                tags += str(x) + ' '
            #print('a', type(sentence))
            #print('nb', type(deps))
            processedsentence = '"' + sentence + deps + tags + '"'
            SentenceList.append(processedsentence)
        #print(SentenceList)
        #print("Cleaning and parsing the evaluation set comments...\n")
        clean_comments_in = []
        numSentenceList = len(SentenceList)
        for i in range(0, numSentenceList):
            # If the index is evenly divisible by 1000, print a message
            #if (i + 1) % 1000 == 0:
            #    print("comment %d of %d\n" % (i + 1, numSentenceList))
            #print(SentenceList[i])
            clean_comments_in.append(comment_to_words(SentenceList[i]))
        #print(clean_comments_in)
        comments_in_vector = self.vectorizer.transform(clean_comments_in)
        #print('da comments', comments_in_vector)
        # Numpy arrays are easy to work with, so convert the result to an
        # array
        self.comments_in_vector = comments_in_vector.toarray()
        Prob_perclass = self.clf.predict_proba(self.comments_in_vector)
        return Prob_perclass
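
    # Usage sketch (the example sentences are made up): predict_proba returns one row of
    # class probabilities per candidate sentence, so for a binary model the result is a
    # (len(beforeprocessingSentenceList), 2) array; with the label encoding used in
    # predictandevalOnCsv, column 1 is the probability of the 'correct' class.
    #
    #   probs = machine.predictprobsOnSentenceList(
    #       ['das ist ein Haus', 'Haus ein ist das'],
    #       'das ist ein Haus')
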
    def GeneratePermutationsOfSentence(self, sentence):
        import itertools
        permutations = list(itertools.permutations(sentence))
        output = []
        for perm in permutations:
            output.append(list(perm))
        return output
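
    # Example (illustrative): a token list of length 3 yields 3! = 6 orderings,
    #   machine.GeneratePermutationsOfSentence(['das', 'ist', 'gut'])
    #   -> [['das', 'ist', 'gut'], ['das', 'gut', 'ist'], ['ist', 'das', 'gut'], ...]
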
    def GetBestSentenceFromSentencesAccordingToGrammar(self, sentences, orisentence):
        probsMatrix = self.predictprobsOnSentenceList(sentences, orisentence)
        #print(probsMatrix)
        # Overwrite column 0 with the row index so it survives the sorting step
        for i in range(len(probsMatrix)):
            probsMatrix[i][0] = i
        #print(probsMatrix)
        # Sort by the probability of the positive class (column 1), highest first
        sortedprobsMatrix = sorted(probsMatrix[::-1], key=lambda tup: tup[1], reverse=True)
        #print(sortedprobsMatrix)
        bestindex = sortedprobsMatrix[0][0]
        #print(bestindex)
        #print('probablemainsentences', filteredprobsentences)
        bestsentence = sentences[int(bestindex)]
        return bestsentence
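

# Minimal end-to-end sketch (assumption: 'trainedSGD.pkl' and 'bagofwords.pkl' are
# placeholder names for a pickled classifier and a pickled fitted CountVectorizer;
# they are not shipped with this module).
if __name__ == '__main__':
    machine = SentGlueMach('trainedSGD.pkl', 'bagofwords.pkl')
    machine.initialize()
    # Generate every word ordering of a token list and pick the most grammatical one,
    # using the original sentence as the dependency reference.
    candidates = machine.GeneratePermutationsOfSentence(['heute', 'ist', 'es', 'gut'])
    candidate_sentences = [' '.join(tokens) for tokens in candidates]
    best = machine.GetBestSentenceFromSentencesAccordingToGrammar(
        candidate_sentences, 'heute ist es gut')
    print(best)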