282 lines
7.9 KiB
Python
282 lines
7.9 KiB
Python
|
|
||
|
|
||
|
import pandas as pd
|
||
|
import numpy as np
|
||
|
|
||
|
|
||
|
from bs4 import BeautifulSoup
|
||
|
import re
|
||
|
|
||
|
import nltk
|
||
|
|
||
|
from nltk.stem.snowball import SnowballStemmer
|
||
|
from nltk.corpus import stopwords # Import the stop word list
|
||
|
|
||
|
from sklearn.linear_model import SGDClassifier
|
||
|
from sklearn import svm
|
||
|
import scipy
|
||
|
from sklearn import preprocessing
|
||
|
|
||
|
from sklearn.externals import joblib
|
||
|
|
||
|
def comment_to_words( raw_comment ):
    """Strip markup from a raw comment and normalise its whitespace.

    The comment is parsed with BeautifulSoup's "html.parser" backend, its
    text content is extracted, split on runs of whitespace and joined back
    together with single spaces.

    Args:
        raw_comment: a single string (one raw comment).

    Returns:
        A single string: the extracted text with whitespace collapsed.

    NOTE(review): non-letter removal, lower-casing and German stop-word
    filtering were all disabled in the code this replaces, so none of them
    is applied here either. Presumably any already-fitted vectorizer or
    model expects text in exactly this shape -- confirm before re-enabling
    any of those steps. The German stop-word list that used to be loaded on
    every call was never consulted and is therefore no longer loaded.
    """
    # 1. Remove HTML
    comment_text = BeautifulSoup(raw_comment, "html.parser").get_text()
    # 2. Split on whitespace and join the words back into one string
    #    separated by single spaces.
    return( " ".join( comment_text.split() ))
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
class SentGlueMach(object):
    """Score sentences with a persisted vectorizer and classifier.

    The constructor only records where the two persisted artefacts live;
    ``initialize`` loads them together with the spaCy pipeline
    ``de_core_news_sm``. The remaining methods turn sentences into the
    vectorizer's feature space and query the classifier.
    """

    def __init__(self, SGDModel, bagofwords):
        """Remember the artefact locations; nothing is loaded here.

        Args:
            SGDModel: whatever ``joblib.load`` should be given to restore
                the classifier (presumably a file path -- confirm).
            bagofwords: whatever ``joblib.load`` should be given to restore
                the fitted vectorizer (presumably a file path -- confirm).
        """
        self.sgdmodel = SGDModel
        self.bow = bagofwords

    def initialize(self):
        """Load the vectorizer, the classifier and the spaCy pipeline.

        Sets ``self.vectorizer``, ``self.clf`` and ``self.nlp``. Must be
        called before any of the prediction methods.
        """
        # SECURITY: joblib.load unpickles its input and can therefore run
        # arbitrary code -- only load artefacts from a trusted source.
        # NOTE(review): `joblib` is the module-level name this file imports
        # from sklearn.externals; recent scikit-learn releases no longer
        # ship that module -- confirm the pinned scikit-learn version.
        self.vectorizer = joblib.load(self.bow)
        self.clf = joblib.load(self.sgdmodel)

        # Imported lazily so that merely importing this module does not
        # require spaCy.
        import spacy
        self.nlp = spacy.load('de_core_news_sm')

    def _vectorize(self, raw_texts):
        """Clean each text, vectorize the batch and cache the dense matrix.

        The dense array is stored on ``self.comments_in_vector`` (as the
        public methods always did) and also returned.
        """
        cleaned = [comment_to_words(text) for text in raw_texts]
        self.comments_in_vector = self.vectorizer.transform(cleaned).toarray()
        return self.comments_in_vector

    @staticmethod
    def _first_dep_by_text(doc):
        """Map each token text in *doc* to the ``dep_`` of its first occurrence."""
        lookup = {}
        for token in doc:
            lookup.setdefault(token.text, token.dep_)
        return lookup

    def predictandevalOnCsv(self, csvdata):
        """Predict every comment of a CSV file and measure the hit rate.

        The CSV is read with a header row, ``"`` as quote character and
        ``,`` as delimiter. The column ``Kommentar`` holds the texts and
        the column ``correct or not`` holds the expected outcome.

        Side effects: sets ``self.comment_in`` (the DataFrame),
        ``self.num_comments_in`` and ``self.comments_in_vector``.

        Args:
            csvdata: anything ``pandas.read_csv`` accepts.

        Returns:
            A tuple ``(XY_val, Proz)``. ``XY_val`` is a float array of
            shape ``(2, number_of_comments)``: row 0 holds the classifier's
            predictions, row 1 the expected labels encoded as 1/0. ``Proz``
            is the percentage of positions where both rows agree.

        Raises:
            ValueError: if the file contains no comments (the percentage
                would otherwise be a division by zero).
        """
        self.comment_in = pd.read_csv(csvdata, header=0, quotechar='"',
                                      delimiter=',')
        comments = self.comment_in["Kommentar"]
        self.num_comments_in = comments.size
        total = self.num_comments_in
        if total == 0:
            raise ValueError("csvdata contains no comments to evaluate")

        predicted = self.clf.predict(self._vectorize(comments.tolist()))

        # NOTE(review): the expected label is compared against ' correct'
        # WITH a leading space; every other value (including 'correct'
        # without the space) counts as 0. Presumably this mirrors how the
        # CSV was written -- confirm against the data.
        expected = [1 if label == ' correct' else 0
                    for label in self.comment_in["correct or not"].tolist()]

        XY_val = np.zeros((2, total))
        XY_val[0] = predicted
        XY_val[1] = expected

        hits = int(np.count_nonzero(XY_val[0] == XY_val[1]))
        Proz = (hits * 100) / total

        return XY_val , Proz

    def predictprobsOnSentenceList(self, beforeprocessingSentenceList, orisentence):
        """Return the classifier's class probabilities for each sentence.

        Example input:
            ['this is sentence a', 'and then comes the next sentence'],
            'the ori sentence is'

        Each candidate sentence is parsed with spaCy. For every token whose
        text also occurs in the reference sentence, the dependency label of
        the first matching reference token is collected. The text handed to
        the vectorizer is the sentence followed by those labels (twice, see
        the note below), wrapped in double quotes.

        Side effect: sets ``self.comments_in_vector``.

        Args:
            beforeprocessingSentenceList: iterable of candidate sentences.
            orisentence: either one reference string shared by all
                candidates, or a sequence holding one reference string per
                candidate (indexed by candidate position).

        Returns:
            Whatever ``self.clf.predict_proba`` returns for the batch
            (one row per candidate).
        """
        shared_reference = isinstance(orisentence, str)
        if shared_reference:
            # One reference for all candidates: parse and index it once.
            dep_by_text = self._first_dep_by_text(self.nlp(orisentence))

        SentenceList = []
        for position, sentence in enumerate(beforeprocessingSentenceList):
            doc = self.nlp(sentence)
            if not shared_reference:
                dep_by_text = self._first_dep_by_text(
                    self.nlp(orisentence[position]))

            matched_deps = [dep_by_text[word.text]
                            for word in doc if word.text in dep_by_text]

            deps = ' ' + ''.join(str(label) + ' ' for label in matched_deps)
            # NOTE(review): the code this replaces also collected every
            # token's `tag_` but never used it; the "tags" suffix was built
            # from the dependency labels, i.e. it duplicates `deps`. That
            # looks like a copy-paste slip, but the persisted vectorizer and
            # classifier were presumably fitted on text of exactly this
            # shape, so the behaviour is kept -- confirm against the
            # training pipeline before switching to real POS tags.
            tags = deps

            SentenceList.append('"' + sentence + deps + tags + '"')

        Prob_perclass = self.clf.predict_proba(self._vectorize(SentenceList))

        return Prob_perclass

    def GeneratePermutationsOfSentence(self, sentence):
        """Return every ordering of the items of *sentence* as a list of lists.

        The orderings come out in ``itertools.permutations`` order. The
        result has ``len(sentence)!`` entries, so keep the input short.
        """
        import itertools
        return [list(ordering) for ordering in itertools.permutations(sentence)]

    def GetBestSentenceFromSentencesAccordingToGrammar(self, sentences, orisentence):
        """Pick the candidate with the highest probability in column 1.

        Column 1 of the probability matrix is presumably the "correct"
        class -- confirm against ``self.clf.classes_``. When several
        candidates share the top probability the one latest in *sentences*
        wins. The probability matrix itself is left untouched.

        Args:
            sentences: indexable collection of candidate sentences.
            orisentence: reference sentence(s), passed straight through to
                ``predictprobsOnSentenceList``.

        Returns:
            The winning element of *sentences*.

        Raises:
            IndexError: if there is no candidate to choose from.
        """
        probsMatrix = self.predictprobsOnSentenceList(sentences, orisentence)
        if len(probsMatrix) == 0:
            raise IndexError("no candidate sentences to choose from")

        # The (probability, index) key makes the highest index win on ties.
        bestindex = max(range(len(probsMatrix)),
                        key=lambda row: (probsMatrix[row][1], row))

        return sentences[bestindex]
|
||
|
|