import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords  # Import the stop word list
from sklearn.linear_model import SGDClassifier
from sklearn import svm
import scipy
from sklearn import preprocessing
# sklearn.externals.joblib was removed in scikit-learn 0.23; use the
# standalone joblib package instead.
import joblib

# The German stop word list requires the NLTK corpus to be available:
# nltk.download('stopwords')

def comment_to_words(raw_comment):
    # Function to convert a raw comment to a string of words.
    # The input is a single string (a raw comment), and
    # the output is a single string (a preprocessed comment).
    #
    # 1. Remove HTML
    comment_text = BeautifulSoup(raw_comment, "html.parser").get_text()
    #
    # 2. Remove non-letters (currently disabled)
    #letters_only = re.sub("[^a-zA-Z]", " ", comment_text)
    #
    # 3. Split into individual words
    words = comment_text.split()
    #words = letters_only.split()
    #
    # 4. In Python, searching a set is much faster than searching
    # a list, so convert the stop words to a set
    stops = set(stopwords.words("german"))
    #
    # 5. Remove stop words (currently disabled: all words are kept)
    #meaningful_words = [w for w in words if not w in stops]
    meaningful_words = [w for w in words]
    #
    # 6. Join the words back into one string separated by a space,
    # and return the result.
    return " ".join(meaningful_words)

class SentGlueMach(object):
    def __init__(self, SGDModel, bagofwords):
        self.sgdmodel = SGDModel
        self.bow = bagofwords

    def initialize(self):
        from sklearn.feature_extraction.text import CountVectorizer
        # Load the fitted bag-of-words CountVectorizer and the SGD model,
        # both persisted with joblib.
        self.vectorizer = joblib.load(self.bow)
        self.clf = joblib.load(self.sgdmodel)
        # Load the small German spaCy model; it has to be installed once via:
        # python -m spacy download de_core_news_sm
        import spacy
        self.nlp = spacy.load('de_core_news_sm')

    def predictandevalOnCsv(self, csvdata):
        # Expects a CSV with at least the columns "Kommentar" (the comment
        # text) and "correct or not" (the gold label).
        self.comment_in = pd.read_csv(csvdata, header=0, quotechar='"',
                                      delimiter=',')
        self.num_comments_in = self.comment_in["Kommentar"].size
        # Clean and parse the evaluation set comments.
        clean_comments_in = []
        for i in range(0, self.num_comments_in):
            clean_comments_in.append(comment_to_words(self.comment_in["Kommentar"][i]))
        comments_in_vector = self.vectorizer.transform(clean_comments_in)
        # NumPy arrays are easy to work with, so convert the result to an
        # array.
        self.comments_in_vector = comments_in_vector.toarray()
        # Predicted labels, one per comment.
        X_val = self.clf.predict(self.comments_in_vector)
        # Gold labels: the string ' correct' (note the leading space) maps
        # to 1, everything else to 0.
        Y_val = []
        for n in range(self.num_comments_in):
            Y_val.append(self.comment_in["correct or not"][n])
        for r in range(self.num_comments_in):
            d = Y_val[r]
            if d == ' correct':
                Y_val[r] = 1
            else:
                Y_val[r] = 0
        # Row 0 holds the predictions, row 1 the gold labels.
        XY_val = np.zeros((2, self.num_comments_in))
        for n in range(self.num_comments_in):
            XY_val[0][n] = X_val[n]
            XY_val[1][n] = Y_val[n]
        # Count how many predictions match the gold labels and compute the
        # percentage of correct guesses.
        count = 0
        for n in range(self.num_comments_in):
            if XY_val[0][n] == XY_val[1][n]:
                count = count + 1
        Proz = (count * 100) / self.num_comments_in
        return XY_val, Proz
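
    # A sketch of the CSV layout predictandevalOnCsv expects; the rows are
    # hypothetical examples. Note the leading space in " correct", which the
    # label mapping above relies on:
    #
    #   Kommentar,correct or not
    #   "Das ist ein Satz", correct
    #   "Satz ein ist Das", incorrect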

    # input: ['this is sentence a', 'and then comes the next sentence', 'and so on'],
    # plus the original sentence(s): either one string used for every
    # candidate, or a list with one original sentence per candidate.
    def predictprobsOnSentenceList(self, beforeprocessingSentenceList, orisentence):
        oridocs = 0
        if type(orisentence) == str:
            oridoc = self.nlp(orisentence)
        else:
            oridocs = 1
        SentenceList = []
        count = 0
        for sentence in beforeprocessingSentenceList:
            count += 1
            doc = self.nlp(sentence)
            if oridocs == 1:
                oridoc = self.nlp(orisentence[count - 1])
            # Collect the dependency labels from the original sentence and
            # the POS tags from the candidate sentence, word by word.
            depssentence = []
            tagssentence = []
            for word in doc:
                for word2 in oridoc:
                    if word.text == word2.text:
                        depssentence.append(word2.dep_)
                        break
                tagssentence.append(word.tag_)
            deps = ' '
            tags = ' '
            for x in depssentence:
                deps += str(x) + ' '
            for x in tagssentence:
                tags += str(x) + ' '
            # Append the dependency and tag features to the raw sentence.
            processedsentence = '"' + sentence + deps + tags + '"'
            SentenceList.append(processedsentence)
        # Clean and vectorize the feature-augmented sentences, then predict
        # the class probabilities.
        clean_comments_in = []
        numSentenceList = len(SentenceList)
        for i in range(0, numSentenceList):
            clean_comments_in.append(comment_to_words(SentenceList[i]))
        comments_in_vector = self.vectorizer.transform(clean_comments_in)
        self.comments_in_vector = comments_in_vector.toarray()
        Prob_perclass = self.clf.predict_proba(self.comments_in_vector)
        return Prob_perclass
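
    # A minimal usage sketch (hypothetical sentences), assuming initialize()
    # has been called on an instance named machine:
    #
    #   probs = machine.predictprobsOnSentenceList(
    #       ['das ist ein Satz', 'Satz ein ist das'],
    #       'das ist ein Satz')
    #   # probs has shape (n_sentences, n_classes): one probability row per
    #   # candidate sentence.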

    def GeneratePermutationsOfSentence(self, sentence):
        # "sentence" is expected to be a list of tokens; passing a plain
        # string would permute it character by character.
        import itertools
        permutations = list(itertools.permutations(sentence))
        output = []
        for perm in permutations:
            output.append(list(perm))
        return output
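
    # Example (hypothetical token list):
    #
    #   perms = machine.GeneratePermutationsOfSentence(['das', 'ist', 'gut'])
    #   # -> all 6 orderings, e.g. ['das', 'ist', 'gut'], ['gut', 'ist', 'das'], ...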

    def GetBestSentenceFromSentencesAccordingToGrammar(self, sentences, orisentence):
        probsMatrix = self.predictprobsOnSentenceList(sentences, orisentence)
        # Column 0 is repurposed to carry the sentence index, so that it
        # survives the sort by the class-1 probability in column 1.
        for i in range(len(probsMatrix)):
            probsMatrix[i][0] = i
        sortedprobsMatrix = sorted(probsMatrix[::-1], key=lambda tup: tup[1], reverse=True)
        bestindex = sortedprobsMatrix[0][0]
        bestsentence = sentences[int(bestindex)]
        return bestsentence
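
# A minimal end-to-end sketch, assuming the classifier and the fitted
# CountVectorizer were saved with joblib; the file names below are
# hypothetical placeholders, not paths from this repository.
if __name__ == '__main__':
    machine = SentGlueMach('SGDmodel.pkl', 'bagofwords.pkl')
    machine.initialize()
    # Reorder the tokens of one (hypothetical) sentence and let the model
    # pick the ordering it scores as most grammatical.
    tokens = ['das', 'ist', 'ein', 'Beispiel']
    candidates = [' '.join(p) for p in machine.GeneratePermutationsOfSentence(tokens)]
    best = machine.GetBestSentenceFromSentencesAccordingToGrammar(
        candidates, ' '.join(tokens))
    print(best)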