import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords  # import the stop word list
from sklearn.linear_model import SGDClassifier
from sklearn import svm
import scipy
from sklearn import preprocessing
import joblib  # sklearn.externals.joblib is deprecated and was removed in scikit-learn 0.23

def comment_to_words(raw_comment):
    # Convert a raw comment to a string of words.
    # The input is a single string (a raw comment), and
    # the output is a single string (a preprocessed comment).
    #
    # 1. Remove HTML
    comment_text = BeautifulSoup(raw_comment, "html.parser").get_text()
    #
    # 2. Remove non-letters (currently disabled)
    # letters_only = re.sub("[^a-zA-Z]", " ", comment_text)
    #
    # 3. Split into individual words (note: no lower-casing is applied)
    words = comment_text.split()
    # words = letters_only.split()
    #
    # 4. In Python, searching a set is much faster than searching
    #    a list, so convert the stop words to a set
    stops = set(stopwords.words("german"))
    #
    # 5. Remove stop words (currently disabled, so all words are kept
    #    and the set above goes unused)
    # meaningful_words = [w for w in words if w not in stops]
    meaningful_words = [w for w in words]
    #
    # 6. Join the words back into one string separated by spaces
    #    and return the result.
    return " ".join(meaningful_words)

class SentGlueMach(object):
    def __init__(self, SGDModel, bagofwords):
        # Paths to the pickled SGD classifier and the pickled
        # bag-of-words vectorizer, both produced with joblib.dump.
        self.sgdmodel = SGDModel
        self.bow = bagofwords

    def initialize(self):
        # Load the vectorizer, the classifier and the German spaCy model.
        from sklearn.feature_extraction.text import CountVectorizer
        self.vectorizer = joblib.load(self.bow)
        self.clf = joblib.load(self.sgdmodel)
        import spacy
        self.nlp = spacy.load('de_core_news_sm')

    def predictandevalOnCsv(self, csvdata):
        # Predict on a labelled CSV file and evaluate the accuracy.
        # The CSV is expected to have a "Kommentar" column with the
        # comments and a "correct or not" column with the gold labels.
        self.comment_in = pd.read_csv(csvdata, header=0, quotechar='"',
                                      delimiter=',')
        self.num_comments_in = self.comment_in["Kommentar"].size
        # Clean and parse the evaluation set comments
        clean_comments_in = []
        for i in range(0, self.num_comments_in):
            clean_comments_in.append(comment_to_words(self.comment_in["Kommentar"][i]))
        comments_in_vector = self.vectorizer.transform(clean_comments_in)
        # NumPy arrays are easy to work with, so convert the result to an array
        self.comments_in_vector = comments_in_vector.toarray()
        # X_val holds the predictions, Y_val the gold labels
        X_val = self.clf.predict(self.comments_in_vector)
        Y_val = []
        for n in range(self.num_comments_in):
            Y_val.append(self.comment_in["correct or not"][n])
        for r in range(self.num_comments_in):
            d = Y_val[r]
            if d == ' correct':  # the leading space comes from the CSV format
                Y_val[r] = 1
            else:
                Y_val[r] = 0
        # Stack predictions (row 0) and gold labels (row 1)
        XY_val = np.zeros((2, self.num_comments_in))
        for n in range(self.num_comments_in):
            XY_val[0][n] = X_val[n]
            XY_val[1][n] = Y_val[n]
        # Count the correct predictions and compute the accuracy in percent
        count = 0
        for n in range(self.num_comments_in):
            if XY_val[0][n] == XY_val[1][n]:
                count = count + 1
        Proz = (count * 100) / self.num_comments_in
        return XY_val, Proz

    def predictprobsOnSentenceList(self, beforeprocessingSentenceList, orisentence):
        # Input: a list of candidate sentences, e.g.
        #   ['this is sentence a', 'and then comes the next sentence', ...],
        # and either a single original sentence (str) or one original
        # sentence per candidate (list of str).
        oridocs = 0
        if type(orisentence) == str:
            oridoc = self.nlp(orisentence)
        else:
            oridocs = 1
        SentenceList = []
        count = 0
        for sentence in beforeprocessingSentenceList:
            count += 1
            doc = self.nlp(sentence)
            if oridocs == 1:
                oridoc = self.nlp(orisentence[count - 1])
            # Collect the dependency labels from the original sentence for
            # words that also occur in the candidate, plus the POS tags of
            # the candidate itself.
            depssentence = []
            tagssentence = []
            for word in doc:
                for word2 in oridoc:
                    if word.text == word2.text:
                        depssentence.append(word2.dep_)
                        break
                tagssentence.append(word.tag_)
            deps = ' '
            tags = ' '
            for x in depssentence:
                deps += str(x) + ' '
            for x in tagssentence:
                tags += str(x) + ' '
            # Append the dependency and tag strings to the sentence, matching
            # the format the classifier was trained on.
            processedsentence = '"' + sentence + deps + tags + '"'
            SentenceList.append(processedsentence)
        # Clean and parse the candidate sentences
        clean_comments_in = []
        numSentenceList = len(SentenceList)
        for i in range(0, numSentenceList):
            clean_comments_in.append(comment_to_words(SentenceList[i]))
        comments_in_vector = self.vectorizer.transform(clean_comments_in)
        # NumPy arrays are easy to work with, so convert the result to an array
        self.comments_in_vector = comments_in_vector.toarray()
        # Requires a classifier that supports predict_proba, e.g. an
        # SGDClassifier trained with loss='log' or loss='modified_huber'
        Prob_perclass = self.clf.predict_proba(self.comments_in_vector)
        return Prob_perclass

    def GeneratePermutationsOfSentence(self, sentence):
        # Return every permutation of the given token list, each as a list.
        import itertools
        permutations = list(itertools.permutations(sentence))
        output = []
        for perm in permutations:
            output.append(list(perm))
        return output

    def GetBestSentenceFromSentencesAccordingToGrammar(self, sentences, orisentence):
        # Score every candidate and return the one the classifier considers
        # most probably grammatical.
        probsMatrix = self.predictprobsOnSentenceList(sentences, orisentence)
        # Overwrite the class-0 column with the row index so the index
        # survives the sort by class-1 probability.
        for i in range(len(probsMatrix)):
            probsMatrix[i][0] = i
        sortedprobsMatrix = sorted(probsMatrix[::-1], key=lambda row: row[1], reverse=True)
        bestindex = sortedprobsMatrix[0][0]
        bestsentence = sentences[int(bestindex)]
        return bestsentence
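
# A minimal usage sketch, not part of the original module. The pickle file
# names below are placeholders for whatever joblib.dump produced for the
# trained classifier and vectorizer; predict_proba additionally assumes the
# SGDClassifier was trained with loss='log' or loss='modified_huber'.
if __name__ == "__main__":
    machine = SentGlueMach("sgd_model.pkl", "bag_of_words.pkl")  # placeholder paths
    machine.initialize()
    # Reorder a scrambled German sentence: generate every permutation of the
    # tokens and keep the candidate the classifier scores as most grammatical.
    tokens = ["gruen", "das", "Haus", "ist"]
    candidates = [" ".join(p) for p in machine.GeneratePermutationsOfSentence(tokens)]
    best = machine.GetBestSentenceFromSentencesAccordingToGrammar(
        candidates, "das Haus ist gruen")
    print(best)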