import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords  # Import the stop word list
from sklearn.linear_model import SGDClassifier
from sklearn import svm
import scipy
from sklearn import preprocessing
# sklearn.externals.joblib was removed in scikit-learn 0.23; use the
# standalone joblib package instead.
import joblib

# The German stop word list requires the NLTK corpus to be available:
# nltk.download('stopwords')

def comment_to_words(raw_comment):
    # Function to convert a raw comment to a string of words.
    # The input is a single string (a raw comment), and
    # the output is a single string (a preprocessed comment).
    #
    # 1. Remove HTML
    comment_text = BeautifulSoup(raw_comment, "html.parser").get_text()
    #
    # 2. Remove non-letters (currently disabled)
    #letters_only = re.sub("[^a-zA-Z]", " ", comment_text)
    #
    # 3. Split into individual words
    words = comment_text.split()
    #words = letters_only.split()
    #
    # 4. In Python, searching a set is much faster than searching
    # a list, so convert the stop words to a set
    stops = set(stopwords.words("german"))
    #
    # 5. Remove stop words (currently disabled: all words are kept)
    #meaningful_words = [w for w in words if not w in stops]
    meaningful_words = [w for w in words]
    #
    # 6. Join the words back into one string separated by a space,
    # and return the result.
    return " ".join(meaningful_words)

class SentGlueMach(object):
    def __init__(self, SGDModel, bagofwords):
        self.sgdmodel = SGDModel
        self.bow = bagofwords

    def initialize(self):
        from sklearn.feature_extraction.text import CountVectorizer
        # Load the fitted bag-of-words CountVectorizer and the SGD model,
        # both persisted with joblib.
        self.vectorizer = joblib.load(self.bow)
        self.clf = joblib.load(self.sgdmodel)
        # Load the small German spaCy model; it has to be installed once via:
        # python -m spacy download de_core_news_sm
        import spacy
        self.nlp = spacy.load('de_core_news_sm')

    def predictandevalOnCsv(self, csvdata):
        # Expects a CSV with at least the columns "Kommentar" (the comment
        # text) and "correct or not" (the gold label).
        self.comment_in = pd.read_csv(csvdata, header=0, quotechar='"',
                                      delimiter=',')
        self.num_comments_in = self.comment_in["Kommentar"].size
        # Clean and parse the evaluation set comments.
        clean_comments_in = []
        for i in range(0, self.num_comments_in):
            clean_comments_in.append(comment_to_words(self.comment_in["Kommentar"][i]))
        comments_in_vector = self.vectorizer.transform(clean_comments_in)
        # NumPy arrays are easy to work with, so convert the result to an
        # array.
        self.comments_in_vector = comments_in_vector.toarray()
        # Predicted labels, one per comment.
        X_val = self.clf.predict(self.comments_in_vector)
        # Gold labels: the string ' correct' (note the leading space) maps
        # to 1, everything else to 0.
        Y_val = []
        for n in range(self.num_comments_in):
            Y_val.append(self.comment_in["correct or not"][n])
        for r in range(self.num_comments_in):
            d = Y_val[r]
            if d == ' correct':
                Y_val[r] = 1
            else:
                Y_val[r] = 0
        # Row 0 holds the predictions, row 1 the gold labels.
        XY_val = np.zeros((2, self.num_comments_in))
        for n in range(self.num_comments_in):
            XY_val[0][n] = X_val[n]
            XY_val[1][n] = Y_val[n]
        # Count how many predictions match the gold labels and compute the
        # percentage of correct guesses.
        count = 0
        for n in range(self.num_comments_in):
            if XY_val[0][n] == XY_val[1][n]:
                count = count + 1
        Proz = (count * 100) / self.num_comments_in
        return XY_val, Proz
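
    # A sketch of the CSV layout predictandevalOnCsv expects; the rows are
    # hypothetical examples. Note the leading space in " correct", which the
    # label mapping above relies on:
    #
    #   Kommentar,correct or not
    #   "Das ist ein Satz", correct
    #   "Satz ein ist Das", incorrect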

    # input: ['this is sentence a', 'and then comes the next sentence', 'and so on'],
    # plus the original sentence(s): either one string used for every
    # candidate, or a list with one original sentence per candidate.
    def predictprobsOnSentenceList(self, beforeprocessingSentenceList, orisentence):
        oridocs = 0
        if type(orisentence) == str:
            oridoc = self.nlp(orisentence)
        else:
            oridocs = 1
        SentenceList = []
        count = 0
        for sentence in beforeprocessingSentenceList:
            count += 1
            doc = self.nlp(sentence)
            if oridocs == 1:
                oridoc = self.nlp(orisentence[count - 1])
            # Collect the dependency labels from the original sentence and
            # the POS tags from the candidate sentence, word by word.
            depssentence = []
            tagssentence = []
            for word in doc:
                for word2 in oridoc:
                    if word.text == word2.text:
                        depssentence.append(word2.dep_)
                        break
                tagssentence.append(word.tag_)
            deps = ' '
            tags = ' '
            for x in depssentence:
                deps += str(x) + ' '
            for x in tagssentence:
                tags += str(x) + ' '
            # Append the dependency and tag features to the raw sentence.
            processedsentence = '"' + sentence + deps + tags + '"'
            SentenceList.append(processedsentence)
        # Clean and vectorize the feature-augmented sentences, then predict
        # the class probabilities.
        clean_comments_in = []
        numSentenceList = len(SentenceList)
        for i in range(0, numSentenceList):
            clean_comments_in.append(comment_to_words(SentenceList[i]))
        comments_in_vector = self.vectorizer.transform(clean_comments_in)
        self.comments_in_vector = comments_in_vector.toarray()
        Prob_perclass = self.clf.predict_proba(self.comments_in_vector)
        return Prob_perclass
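
    # A minimal usage sketch (hypothetical sentences), assuming initialize()
    # has been called on an instance named machine:
    #
    #   probs = machine.predictprobsOnSentenceList(
    #       ['das ist ein Satz', 'Satz ein ist das'],
    #       'das ist ein Satz')
    #   # probs has shape (n_sentences, n_classes): one probability row per
    #   # candidate sentence.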

    def GeneratePermutationsOfSentence(self, sentence):
        # "sentence" is expected to be a list of tokens; passing a plain
        # string would permute it character by character.
        import itertools
        permutations = list(itertools.permutations(sentence))
        output = []
        for perm in permutations:
            output.append(list(perm))
        return output
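
    # Example (hypothetical token list):
    #
    #   perms = machine.GeneratePermutationsOfSentence(['das', 'ist', 'gut'])
    #   # -> all 6 orderings, e.g. ['das', 'ist', 'gut'], ['gut', 'ist', 'das'], ...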

    def GetBestSentenceFromSentencesAccordingToGrammar(self, sentences, orisentence):
        probsMatrix = self.predictprobsOnSentenceList(sentences, orisentence)
        # Column 0 is repurposed to carry the sentence index, so that it
        # survives the sort by the class-1 probability in column 1.
        for i in range(len(probsMatrix)):
            probsMatrix[i][0] = i
        sortedprobsMatrix = sorted(probsMatrix[::-1], key=lambda tup: tup[1], reverse=True)
        bestindex = sortedprobsMatrix[0][0]
        bestsentence = sentences[int(bestindex)]
        return bestsentence
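
# A minimal end-to-end sketch, assuming the classifier and the fitted
# CountVectorizer were saved with joblib; the file names below are
# hypothetical placeholders, not paths from this repository.
if __name__ == '__main__':
    machine = SentGlueMach('SGDmodel.pkl', 'bagofwords.pkl')
    machine.initialize()
    # Reorder the tokens of one (hypothetical) sentence and let the model
    # pick the ordering it scores as most grammatical.
    tokens = ['das', 'ist', 'ein', 'Beispiel']
    candidates = [' '.join(p) for p in machine.GeneratePermutationsOfSentence(tokens)]
    best = machine.GetBestSentenceFromSentencesAccordingToGrammar(
        candidates, ' '.join(tokens))
    print(best)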