import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords  # import the stop word list
from sklearn.linear_model import SGDClassifier
from sklearn import svm
import scipy
from sklearn import preprocessing
import joblib  # sklearn.externals.joblib is deprecated and was removed in scikit-learn 0.23

def comment_to_words(raw_comment):
    # Convert a raw comment to a string of words.
    # The input is a single string (a raw comment), and
    # the output is a single string (a preprocessed comment).
    #
    # 1. Remove HTML
    comment_text = BeautifulSoup(raw_comment, "html.parser").get_text()
    #
    # 2. Remove non-letters (currently disabled)
    # letters_only = re.sub("[^a-zA-Z]", " ", comment_text)
    #
    # 3. Split into individual words (note: no lower-casing is applied)
    words = comment_text.split()
    # words = letters_only.split()
    #
    # 4. In Python, searching a set is much faster than searching
    #    a list, so convert the stop words to a set
    stops = set(stopwords.words("german"))
    #
    # 5. Remove stop words (currently disabled, so all words are kept
    #    and the set above goes unused)
    # meaningful_words = [w for w in words if w not in stops]
    meaningful_words = [w for w in words]
    #
    # 6. Join the words back into one string separated by spaces
    #    and return the result.
    return " ".join(meaningful_words)

class SentGlueMach(object):
    def __init__(self, SGDModel, bagofwords):
        # Paths to the pickled SGD classifier and the pickled
        # bag-of-words vectorizer, both produced with joblib.dump.
        self.sgdmodel = SGDModel
        self.bow = bagofwords

    def initialize(self):
        # Load the vectorizer, the classifier and the German spaCy model.
        from sklearn.feature_extraction.text import CountVectorizer
        self.vectorizer = joblib.load(self.bow)
        self.clf = joblib.load(self.sgdmodel)
        import spacy
        self.nlp = spacy.load('de_core_news_sm')

    def predictandevalOnCsv(self, csvdata):
        # Predict on a labelled CSV file and evaluate the accuracy.
        # The CSV is expected to have a "Kommentar" column with the
        # comments and a "correct or not" column with the gold labels.
        self.comment_in = pd.read_csv(csvdata, header=0, quotechar='"',
                                      delimiter=',')
        self.num_comments_in = self.comment_in["Kommentar"].size
        # Clean and parse the evaluation set comments
        clean_comments_in = []
        for i in range(0, self.num_comments_in):
            clean_comments_in.append(comment_to_words(self.comment_in["Kommentar"][i]))
        comments_in_vector = self.vectorizer.transform(clean_comments_in)
        # NumPy arrays are easy to work with, so convert the result to an array
        self.comments_in_vector = comments_in_vector.toarray()
        # X_val holds the predictions, Y_val the gold labels
        X_val = self.clf.predict(self.comments_in_vector)
        Y_val = []
        for n in range(self.num_comments_in):
            Y_val.append(self.comment_in["correct or not"][n])
        for r in range(self.num_comments_in):
            d = Y_val[r]
            if d == ' correct':  # the leading space comes from the CSV format
                Y_val[r] = 1
            else:
                Y_val[r] = 0
        # Stack predictions (row 0) and gold labels (row 1)
        XY_val = np.zeros((2, self.num_comments_in))
        for n in range(self.num_comments_in):
            XY_val[0][n] = X_val[n]
            XY_val[1][n] = Y_val[n]
        # Count the correct predictions and compute the accuracy in percent
        count = 0
        for n in range(self.num_comments_in):
            if XY_val[0][n] == XY_val[1][n]:
                count = count + 1
        Proz = (count * 100) / self.num_comments_in
        return XY_val, Proz

    def predictprobsOnSentenceList(self, beforeprocessingSentenceList, orisentence):
        # Input: a list of candidate sentences, e.g.
        #   ['this is sentence a', 'and then comes the next sentence', ...],
        # and either a single original sentence (str) or one original
        # sentence per candidate (list of str).
        oridocs = 0
        if type(orisentence) == str:
            oridoc = self.nlp(orisentence)
        else:
            oridocs = 1
        SentenceList = []
        count = 0
        for sentence in beforeprocessingSentenceList:
            count += 1
            doc = self.nlp(sentence)
            if oridocs == 1:
                oridoc = self.nlp(orisentence[count - 1])
            # Collect the dependency labels from the original sentence for
            # words that also occur in the candidate, plus the POS tags of
            # the candidate itself.
            depssentence = []
            tagssentence = []
            for word in doc:
                for word2 in oridoc:
                    if word.text == word2.text:
                        depssentence.append(word2.dep_)
                        break
                tagssentence.append(word.tag_)
            deps = ' '
            tags = ' '
            for x in depssentence:
                deps += str(x) + ' '
            for x in tagssentence:
                tags += str(x) + ' '
            # Append the dependency and tag strings to the sentence, matching
            # the format the classifier was trained on.
            processedsentence = '"' + sentence + deps + tags + '"'
            SentenceList.append(processedsentence)
        # Clean and parse the candidate sentences
        clean_comments_in = []
        numSentenceList = len(SentenceList)
        for i in range(0, numSentenceList):
            clean_comments_in.append(comment_to_words(SentenceList[i]))
        comments_in_vector = self.vectorizer.transform(clean_comments_in)
        # NumPy arrays are easy to work with, so convert the result to an array
        self.comments_in_vector = comments_in_vector.toarray()
        # Requires a classifier that supports predict_proba, e.g. an
        # SGDClassifier trained with loss='log' or loss='modified_huber'
        Prob_perclass = self.clf.predict_proba(self.comments_in_vector)
        return Prob_perclass

    def GeneratePermutationsOfSentence(self, sentence):
        # Return every permutation of the given token list, each as a list.
        import itertools
        permutations = list(itertools.permutations(sentence))
        output = []
        for perm in permutations:
            output.append(list(perm))
        return output

    def GetBestSentenceFromSentencesAccordingToGrammar(self, sentences, orisentence):
        # Score every candidate and return the one the classifier considers
        # most probably grammatical.
        probsMatrix = self.predictprobsOnSentenceList(sentences, orisentence)
        # Overwrite the class-0 column with the row index so the index
        # survives the sort by class-1 probability.
        for i in range(len(probsMatrix)):
            probsMatrix[i][0] = i
        sortedprobsMatrix = sorted(probsMatrix[::-1], key=lambda row: row[1], reverse=True)
        bestindex = sortedprobsMatrix[0][0]
        bestsentence = sentences[int(bestindex)]
        return bestsentence
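
# A minimal usage sketch, not part of the original module. The pickle file
# names below are placeholders for whatever joblib.dump produced for the
# trained classifier and vectorizer; predict_proba additionally assumes the
# SGDClassifier was trained with loss='log' or loss='modified_huber'.
if __name__ == "__main__":
    machine = SentGlueMach("sgd_model.pkl", "bag_of_words.pkl")  # placeholder paths
    machine.initialize()
    # Reorder a scrambled German sentence: generate every permutation of the
    # tokens and keep the candidate the classifier scores as most grammatical.
    tokens = ["gruen", "das", "Haus", "ist"]
    candidates = [" ".join(p) for p in machine.GeneratePermutationsOfSentence(tokens)]
    best = machine.GetBestSentenceFromSentencesAccordingToGrammar(
        candidates, "das Haus ist gruen")
    print(best)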