basabuuka_prototyp/Prototyp/SentGlue.py

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords  # Import the stop word list
from sklearn.linear_model import SGDClassifier
from sklearn import svm
import scipy
from sklearn import preprocessing
import joblib  # sklearn.externals.joblib was removed in scikit-learn 0.23; use the standalone joblib package


def comment_to_words(raw_comment):
    # Function to convert a raw comment to a string of words
    # The input is a single string (a raw comment), and
    # the output is a single string (a preprocessed comment)
    #
    # 1. Remove HTML
    comment_text = BeautifulSoup(raw_comment, "html.parser").get_text()
    #
    # 2. Remove non-letters
    #letters_only = re.sub("[^a-zA-Z]", " ", comment_text)
    #
    # 3. Convert to lower case, split into individual words
    words = comment_text.split()
    #words = letters_only.split()
    #
    # 4. In Python, searching a set is much faster than searching
    #    a list, so convert the stop words to a set
    stops = set(stopwords.words("german"))
    #
    # 5. Remove stop words
    #meaningful_words = [w for w in words if not w in stops]
    meaningful_words = [w for w in words]
    #
    # 6. Join the words back into one string separated by space,
    #    and return the result.
    return " ".join(meaningful_words)
class SentGlueMach(object):
    def __init__(self, SGDModel, bagofwords):
        self.sgdmodel = SGDModel
        self.bow = bagofwords

    def initialize(self):
        from sklearn.feature_extraction.text import CountVectorizer
        #print('loading vectorizer..')
        self.vectorizer = joblib.load(self.bow)
        #print('done')
        #print('loading the SGD model..')
        self.clf = joblib.load(self.sgdmodel)
        #print('done')
        #print('loading spacy..')
        import spacy
        self.nlp = spacy.load('de_core_news_sm')
        #print('done')
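
    # Note on the expected artifacts (an assumption read off the calls above): self.bow
    # should point to a pickled, fitted CountVectorizer and self.sgdmodel to a pickled
    # classifier. Because predict_proba() is used further down, an SGDClassifier must
    # have been trained with a probabilistic loss such as loss='log' or
    # loss='modified_huber'.
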
    def predictandevalOnCsv(self, csvdata):
        self.comment_in = pd.read_csv(csvdata, header=0, quotechar='"',
                                      delimiter=',')
        #print('here is the comment, in format', self.comment_in)
        self.num_comments_in = self.comment_in["Kommentar"].size
        #print("Cleaning and parsing the evaluation set comments...\n")
        clean_comments_in = []
        for i in range(0, self.num_comments_in):
            # If the index is evenly divisible by 1000, print a message
            #if (i + 1) % 1000 == 0:
            #    print("comment %d of %d\n" % (i + 1, self.num_comments_in))
            #print(self.comment_in["Kommentar"][i])
            clean_comments_in.append(comment_to_words(self.comment_in["Kommentar"][i]))
        #print(clean_comments_in)
        comments_in_vector = self.vectorizer.transform(clean_comments_in)
        #print('da comments', comments_in_vector)
        # Numpy arrays are easy to work with, so convert the result to an
        # array
        self.comments_in_vector = comments_in_vector.toarray()
        #print(comments_in_vector)
        #print('here are the comments in vector, input to the predictmach', self.comments_in_vector)
        X_val = self.clf.predict(self.comments_in_vector)
        #print(X_val)
        Y_val = []
        for n in range(self.num_comments_in):
            Y_val.append(self.comment_in["correct or not"][n])
        for r in range(self.num_comments_in):
            d = Y_val[r]
            # The gold label in the CSV carries a leading space (' correct')
            if d == ' correct':
                Y_val[r] = 1
            else:
                Y_val[r] = 0
        # Row 0 holds the predictions, row 1 the gold labels
        XY_val = np.zeros((2, self.num_comments_in))
        for n in range(self.num_comments_in):
            XY_val[0][n] = X_val[n]
            XY_val[1][n] = Y_val[n]
        #print(XY_val)
        count = 0
        for n in range(self.num_comments_in):
            if XY_val[0][n] == XY_val[1][n]:
                count = count + 1
        #print('The number of correct predictions is:', count)
        Proz = (count * 100) / self.num_comments_in
        #print('Percentage predicted correctly:', Proz, '%')
        return XY_val, Proz
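
    # Minimal usage sketch for predictandevalOnCsv (the file name 'eval.csv' and the row
    # contents are placeholders; the column names "Kommentar" and "correct or not" are the
    # ones read above, and the gold label must be the string ' correct', with a leading
    # space, to count as positive; anything else counts as incorrect):
    #
    #   Kommentar,correct or not
    #   "ein Beispielsatz ...", correct
    #   "Beispielsatz ein ...", not correct
    #
    #   XY_val, Proz = machine.predictandevalOnCsv('eval.csv')  # Proz = accuracy in percent
    # where 'machine' is an initialized SentGlueMach instance (see the sketch at the end
    # of this file).
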
    # input: ['this is sentence a', 'and then comes the next sentence', 'and so on'], ['the ori sentence is']
    def predictprobsOnSentenceList(self, beforeprocessingSentenceList, orisentence):
        oridocs = 0
        if type(orisentence) == str:
            oridoc = self.nlp(orisentence)
        else:
            oridocs = 1
        SentenceList = []
        count = 0
        for sentence in beforeprocessingSentenceList:
            count += 1
            doc = self.nlp(sentence)
            if oridocs == 1:
                oridoc = self.nlp(orisentence[count - 1])
            #if count % 100 == 0:
            #    print(count)
            depssentence = []
            tagssentence = []
            for word in doc:
                for word2 in oridoc:
                    if word.text == word2.text:
                        depssentence.append(word2.dep_)
                        break
                tagssentence.append(word.tag_)
            deps = ' '
            tags = ' '
            for x in depssentence:
                #print(x)
                deps += str(x) + ' '
            # Build the POS tag string from the tags collected above
            for x in tagssentence:
                tags += str(x) + ' '
            #print('a', type(sentence))
            #print('nb', type(deps))
            processedsentence = '"' + sentence + deps + tags + '"'
            SentenceList.append(processedsentence)
        #print(SentenceList)
        #print("Cleaning and parsing the evaluation set comments...\n")
        clean_comments_in = []
        numSentenceList = len(SentenceList)
        for i in range(0, numSentenceList):
            # If the index is evenly divisible by 1000, print a message
            #if (i + 1) % 1000 == 0:
            #    print("comment %d of %d\n" % (i + 1, numSentenceList))
            #print(SentenceList[i])
            clean_comments_in.append(comment_to_words(SentenceList[i]))
        #print(clean_comments_in)
        comments_in_vector = self.vectorizer.transform(clean_comments_in)
        #print('da comments', comments_in_vector)
        # Numpy arrays are easy to work with, so convert the result to an
        # array
        self.comments_in_vector = comments_in_vector.toarray()
        Prob_perclass = self.clf.predict_proba(self.comments_in_vector)
        return Prob_perclass
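
    # Usage sketch (the example sentences are made up): predict_proba returns one row of
    # class probabilities per candidate sentence, so for a binary model the result is a
    # (len(beforeprocessingSentenceList), 2) array; with the label encoding used in
    # predictandevalOnCsv, column 1 is the probability of the 'correct' class.
    #
    #   probs = machine.predictprobsOnSentenceList(
    #       ['das ist ein Haus', 'Haus ein ist das'],
    #       'das ist ein Haus')
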
    def GeneratePermutationsOfSentence(self, sentence):
        import itertools
        permutations = list(itertools.permutations(sentence))
        output = []
        for perm in permutations:
            output.append(list(perm))
        return output
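
    # Example (illustrative): a token list of length 3 yields 3! = 6 orderings,
    #   machine.GeneratePermutationsOfSentence(['das', 'ist', 'gut'])
    #   -> [['das', 'ist', 'gut'], ['das', 'gut', 'ist'], ['ist', 'das', 'gut'], ...]
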
    def GetBestSentenceFromSentencesAccordingToGrammar(self, sentences, orisentence):
        probsMatrix = self.predictprobsOnSentenceList(sentences, orisentence)
        #print(probsMatrix)
        # Overwrite column 0 with the row index so it survives the sorting step
        for i in range(len(probsMatrix)):
            probsMatrix[i][0] = i
        #print(probsMatrix)
        # Sort by the probability of the positive class (column 1), highest first
        sortedprobsMatrix = sorted(probsMatrix[::-1], key=lambda tup: tup[1], reverse=True)
        #print(sortedprobsMatrix)
        bestindex = sortedprobsMatrix[0][0]
        #print(bestindex)
        #print('probablemainsentences', filteredprobsentences)
        bestsentence = sentences[int(bestindex)]
        return bestsentence
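

# Minimal end-to-end sketch (assumption: 'trainedSGD.pkl' and 'bagofwords.pkl' are
# placeholder names for a pickled classifier and a pickled fitted CountVectorizer;
# they are not shipped with this module).
if __name__ == '__main__':
    machine = SentGlueMach('trainedSGD.pkl', 'bagofwords.pkl')
    machine.initialize()
    # Generate every word ordering of a token list and pick the most grammatical one,
    # using the original sentence as the dependency reference.
    candidates = machine.GeneratePermutationsOfSentence(['heute', 'ist', 'es', 'gut'])
    candidate_sentences = [' '.join(tokens) for tokens in candidates]
    best = machine.GetBestSentenceFromSentencesAccordingToGrammar(
        candidate_sentences, 'heute ist es gut')
    print(best)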