import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords  # import the stop word list
import joblib  # note: sklearn.externals.joblib was removed in scikit-learn 0.23


def comment_to_words(raw_comment):
    """Convert a raw comment to a single preprocessed string of words.

    The input is a single string (a raw comment) and the output is a
    single string (a preprocessed comment).
    """
    # 1. Remove HTML
    comment_text = BeautifulSoup(raw_comment, "html.parser").get_text()

    # 2. Remove non-letters (currently disabled)
    # letters_only = re.sub("[^a-zA-Z]", " ", comment_text)

    # 3. Split into individual words
    words = comment_text.split()
    # words = letters_only.split()

    # 4. In Python, searching a set is much faster than searching a list,
    #    so convert the stop words to a set
    stops = set(stopwords.words("german"))

    # 5. Remove stop words (currently disabled: every word is kept)
    # meaningful_words = [w for w in words if w not in stops]
    meaningful_words = [w for w in words]

    # 6. Join the words back into one string separated by spaces and
    #    return the result
    return " ".join(meaningful_words)
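
# A minimal doctest-style sketch of what comment_to_words does as configured
# above (an assumption of this example: the NLTK German stop-word corpus has
# been downloaded, e.g. via nltk.download('stopwords')). With steps 2 and 5
# disabled, the function only strips HTML and normalizes whitespace:
#
#     >>> comment_to_words("<p>Das   ist ein Test</p>")
#     'Das ist ein Test'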

class SentGlueMach(object):

    def __init__(self, SGDModel, bagofwords):
        self.sgdmodel = SGDModel
        self.bow = bagofwords

    def initialize(self):
        # Load the serialized bag-of-words vectorizer and the SGD model,
        # then load the German spaCy model for tagging and parsing
        self.vectorizer = joblib.load(self.bow)
        self.clf = joblib.load(self.sgdmodel)
        import spacy
        self.nlp = spacy.load('de_core_news_sm')

    def predictandevalOnCsv(self, csvdata):
        """Predict labels for a CSV evaluation set and score the predictions.

        Returns a 2 x n matrix (row 0: predictions, row 1: gold labels) and
        the accuracy in percent.
        """
        self.comment_in = pd.read_csv(csvdata, header=0, quotechar='"',
                                      delimiter=',')
        self.num_comments_in = self.comment_in["Kommentar"].size

        # Clean and parse the evaluation set comments
        clean_comments_in = []
        for i in range(0, self.num_comments_in):
            clean_comments_in.append(
                comment_to_words(self.comment_in["Kommentar"][i]))

        comments_in_vector = self.vectorizer.transform(clean_comments_in)
        # NumPy arrays are easy to work with, so convert the result to an
        # array
        self.comments_in_vector = comments_in_vector.toarray()

        X_val = self.clf.predict(self.comments_in_vector)

        # Map the gold labels from strings to 1 (' correct') or 0 (anything
        # else)
        Y_val = []
        for n in range(self.num_comments_in):
            Y_val.append(self.comment_in["correct or not"][n])
        for r in range(self.num_comments_in):
            if Y_val[r] == ' correct':
                Y_val[r] = 1
            else:
                Y_val[r] = 0

        XY_val = np.zeros((2, self.num_comments_in))
        for n in range(self.num_comments_in):
            XY_val[0][n] = X_val[n]
            XY_val[1][n] = Y_val[n]

        # Count the matches between predictions and gold labels and compute
        # the percentage guessed correctly
        count = 0
        for n in range(self.num_comments_in):
            if XY_val[0][n] == XY_val[1][n]:
                count = count + 1
        Proz = (count * 100) / self.num_comments_in
        return XY_val, Proz

    def predictprobsOnSentenceList(self, beforeprocessingSentenceList,
                                   orisentence):
        """Return the per-class probabilities for candidate sentences.

        Input: a list such as ['this is sentence a', 'and then comes the
        next sentence', ...] plus either one original sentence as a string
        or a list with one original sentence per candidate.
        """
        oridocs = 0
        if type(orisentence) == str:
            oridoc = self.nlp(orisentence)
        else:
            oridocs = 1
        SentenceList = []
        count = 0
        for sentence in beforeprocessingSentenceList:
            count += 1
            doc = self.nlp(sentence)
            if oridocs == 1:
                oridoc = self.nlp(orisentence[count - 1])
            # For every word of the candidate, collect its dependency label
            # in the original sentence and its POS tag in the candidate
            depssentence = []
            tagssentence = []
            for word in doc:
                for word2 in oridoc:
                    if word.text == word2.text:
                        depssentence.append(word2.dep_)
                        break
                tagssentence.append(word.tag_)
            deps = ' '
            tags = ' '
            for x in depssentence:
                deps += str(x) + ' '
            for x in tagssentence:
                tags += str(x) + ' '
            # Append the dependency labels and POS tags to the sentence as
            # extra features
            processedsentence = '"' + sentence + deps + tags + '"'
            SentenceList.append(processedsentence)

        # Clean and vectorize the feature-augmented sentences
        clean_comments_in = []
        numSentenceList = len(SentenceList)
        for i in range(0, numSentenceList):
            clean_comments_in.append(comment_to_words(SentenceList[i]))
        comments_in_vector = self.vectorizer.transform(clean_comments_in)
        self.comments_in_vector = comments_in_vector.toarray()

        # One row per sentence, one column per class
        Prob_perclass = self.clf.predict_proba(self.comments_in_vector)
        return Prob_perclass

    def GeneratePermutationsOfSentence(self, sentence):
        # Expects a sentence as a list of tokens; returns every ordering of
        # the tokens, each as a list
        import itertools
        permutations = list(itertools.permutations(sentence))
        output = []
        for perm in permutations:
            output.append(list(perm))
        return output

    def GetBestSentenceFromSentencesAccordingToGrammar(self, sentences,
                                                       orisentence):
        probsMatrix = self.predictprobsOnSentenceList(sentences, orisentence)
        # Overwrite the class-0 column with the row index so the winning
        # sentence can be recovered after sorting by the class-1 probability
        for i in range(len(probsMatrix)):
            probsMatrix[i][0] = i
        sortedprobsMatrix = sorted(probsMatrix, key=lambda row: row[1],
                                   reverse=True)
        bestindex = sortedprobsMatrix[0][0]
        bestsentence = sentences[int(bestindex)]
        return bestsentence
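
# A minimal end-to-end usage sketch. The pickle file names below are
# hypothetical placeholders, not files shipped with this module; a real call
# site would point them at the serialized SGD classifier and CountVectorizer
# that SentGlueMach expects.
if __name__ == "__main__":
    # hypothetical paths to the pickled model and bag-of-words vectorizer
    machine = SentGlueMach("sgd_model.pkl", "bag_of_words.pkl")
    machine.initialize()

    # generate every ordering of the tokens and join them into sentences
    tokens = ["heute", "scheint", "die", "Sonne"]
    candidates = [" ".join(perm)
                  for perm in machine.GeneratePermutationsOfSentence(tokens)]

    # pick the permutation the classifier scores as most grammatical, using
    # the original sentence for the dependency features
    best = machine.GetBestSentenceFromSentencesAccordingToGrammar(
        candidates, "die Sonne scheint heute")
    print(best)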