import itertools

import pandas as pd
import numpy as np

from bs4 import BeautifulSoup

# sklearn.externals.joblib was removed in scikit-learn 0.23; the
# standalone joblib package provides the same load/dump API.
import joblib

def comment_to_words(raw_comment):
    # Convert a raw comment to a cleaned string of words.
    # Input: a single string (a raw comment); output: a single string
    # (the preprocessed comment).
    #
    # 1. Remove HTML markup.
    comment_text = BeautifulSoup(raw_comment, "html.parser").get_text()
    #
    # 2. Split into individual words; this also normalizes whitespace.
    #    (Stripping non-letters with re.sub("[^a-zA-Z]", " ", ...) and
    #    removing German stop words via set(stopwords.words("german"))
    #    are deliberately disabled here -- every word is kept.)
    words = comment_text.split()
    #
    # 3. Join the words back into one space-separated string and return it.
    return " ".join(words)
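# For example, comment_to_words('<p>Das ist   ein <b>Test</b></p>')
# returns 'Das ist ein Test': the HTML markup is stripped and runs of
# whitespace collapse to single spaces.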

class SentGlueMach(object):

    def __init__(self, SGDModel, bagofwords):
        # File paths of the pickled SGD classifier and the fitted
        # bag-of-words vectorizer (both saved with joblib).
        self.sgdmodel = SGDModel
        self.bow = bagofwords

    def initialize(self):
        # Load the fitted bag-of-words vectorizer. Its class (sklearn's
        # CountVectorizer) is imported automatically during unpickling,
        # so no explicit import is needed here.
        self.vectorizer = joblib.load(self.bow)

        # Load the trained SGD model.
        self.clf = joblib.load(self.sgdmodel)

        # Load the German spaCy pipeline; imported lazily because loading
        # spaCy is comparatively slow.
        import spacy
        self.nlp = spacy.load('de_core_news_sm')
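    # A minimal construction sketch (the pickle file names are made-up
    # examples, not fixed paths shipped with this module):
    #   machine = SentGlueMach('sgd_model.pkl', 'bag_of_words.pkl')
    #   machine.initialize()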

    def predictandevalOnCsv(self, csvdata):
        # Read the evaluation CSV; it must provide the columns
        # "Kommentar" (the comment text) and "correct or not" (gold label).
        self.comment_in = pd.read_csv(csvdata, header=0, quotechar='"',
                                      delimiter=',')

        self.num_comments_in = self.comment_in["Kommentar"].size

        # Clean and parse the evaluation-set comments.
        clean_comments_in = []
        for i in range(self.num_comments_in):
            clean_comments_in.append(comment_to_words(self.comment_in["Kommentar"][i]))

        # Vectorize with the fitted bag-of-words model; convert the sparse
        # result to a dense numpy array, which is easier to work with.
        comments_in_vector = self.vectorizer.transform(clean_comments_in)
        self.comments_in_vector = comments_in_vector.toarray()

        # Predicted label for every comment.
        X_val = self.clf.predict(self.comments_in_vector)

        # Gold labels: the CSV stores the positive class as the string
        # ' correct' (with a leading space), so strip before comparing.
        Y_val = []
        for n in range(self.num_comments_in):
            label = self.comment_in["correct or not"][n]
            Y_val.append(1 if label.strip() == 'correct' else 0)

        # Row 0: predictions, row 1: gold labels.
        XY_val = np.zeros((2, self.num_comments_in))
        XY_val[0] = X_val
        XY_val[1] = Y_val

        # Number of correct predictions and the accuracy in percent.
        count = int((XY_val[0] == XY_val[1]).sum())
        Proz = (count * 100) / self.num_comments_in

        return XY_val, Proz
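    # Usage sketch (the CSV path is a made-up example; the file must have
    # the "Kommentar" and "correct or not" columns described above):
    #   XY_val, Proz = machine.predictandevalOnCsv('eval_comments.csv')
    #   # XY_val[0] holds the predictions, XY_val[1] the gold labels,
    #   # and Proz is the accuracy in percent.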

    # Input: a list of candidate sentences plus the original sentence,
    # either as one string shared by all candidates or as a list with one
    # original per candidate, e.g.
    # ['this is sentence a', 'and then comes the next sentence'], 'the ori sentence'

    def predictprobsOnSentenceList(self, beforeprocessingSentenceList, orisentence):
        # If orisentence is a single string, parse it once and reuse it for
        # every candidate; otherwise parse the matching original per candidate.
        per_sentence_originals = not isinstance(orisentence, str)
        if not per_sentence_originals:
            oridoc = self.nlp(orisentence)

        SentenceList = []
        count = 0
        for sentence in beforeprocessingSentenceList:
            count += 1
            doc = self.nlp(sentence)

            if per_sentence_originals:
                oridoc = self.nlp(orisentence[count - 1])

            # For every word of the candidate, look up the dependency label
            # it carries in the original sentence, and record the word's
            # own POS tag.
            depssentence = []
            tagssentence = []
            for word in doc:
                for word2 in oridoc:
                    if word.text == word2.text:
                        depssentence.append(word2.dep_)
                        break
                tagssentence.append(word.tag_)

            deps = ' ' + ' '.join(depssentence) + ' '
            tags = ' ' + ' '.join(tagssentence) + ' '

            # Feature string: the sentence followed by its dependency
            # labels and POS tags, quoted like the training data.
            processedsentence = '"' + sentence + deps + tags + '"'
            SentenceList.append(processedsentence)

        # Clean the feature strings and vectorize them with the fitted
        # bag-of-words model.
        clean_comments_in = []
        for i in range(len(SentenceList)):
            clean_comments_in.append(comment_to_words(SentenceList[i]))

        comments_in_vector = self.vectorizer.transform(clean_comments_in)
        self.comments_in_vector = comments_in_vector.toarray()

        # One row per sentence, one column per class.
        Prob_perclass = self.clf.predict_proba(self.comments_in_vector)

        return Prob_perclass
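    # Usage sketch (the sentences are made-up examples):
    #   probs = machine.predictprobsOnSentenceList(
    #       ['die Sonne scheint heute', 'heute die scheint Sonne'],
    #       'heute scheint die Sonne')
    #   # probs[i][1] is the probability that candidate i is grammatical.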

    def GeneratePermutationsOfSentence(self, sentence):
        # `sentence` is expected to be a list of tokens; return every
        # ordering of those tokens as a list of lists. The number of
        # permutations grows factorially with the sentence length.
        return [list(perm) for perm in itertools.permutations(sentence)]
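    # For example, with the (made-up) token list ['das', 'ist', 'gut'],
    # GeneratePermutationsOfSentence returns all 6 orderings, beginning
    # with ['das', 'ist', 'gut'] and ['das', 'gut', 'ist'].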

    def GetBestSentenceFromSentencesAccordingToGrammar(self, sentences, orisentence):
        # Score every candidate sentence and return the one with the
        # highest probability of the "grammatical" class (column 1).
        probsMatrix = self.predictprobsOnSentenceList(sentences, orisentence)

        bestindex = int(np.argmax(probsMatrix[:, 1]))
        bestsentence = sentences[bestindex]

        return bestsentence
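
# A minimal end-to-end sketch. The pickle file names below are assumptions
# for illustration; substitute the paths of your own trained artifacts.
if __name__ == '__main__':
    machine = SentGlueMach('sgd_model.pkl', 'bag_of_words.pkl')
    machine.initialize()

    # Re-order a scrambled German sentence by scoring all permutations.
    tokens = ['heute', 'scheint', 'die', 'Sonne']
    candidates = [' '.join(p)
                  for p in machine.GeneratePermutationsOfSentence(tokens)]
    best = machine.GetBestSentenceFromSentencesAccordingToGrammar(
        candidates, ' '.join(tokens))
    print(best)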