# Import the pandas package, then use the "read_csv" function to read
# the labeled training data
import numpy as np
import pandas as pd
#from bs4 import BeautifulSoup
import re
#import nltk
from nltk.stem.snowball import SnowballStemmer
#from nltk.corpus import stopwords # Import the stop word list
import hickle as hkl
from sklearn.linear_model import SGDClassifier
from sklearn import svm
import scipy
from sklearn import preprocessing
import sys
import os
import traceback
from gensim.models import word2vec
import logging
import gensim
import _pickle as cPickle
# Get the data directories
#directoryIn = sys.argv[1]
#directoryTrans = sys.argv[2]
#directoryOut = sys.argv[3]
#cwd = os.getcwd()
#rechtsprechIn = os.listdir(cwd + '/' + directoryIn)
#print('writing every document as one line in a textfile ')
#for rechtsprech in rechtsprechIn:
#    with open(cwd + '/' + directoryIn + rechtsprech) as Indok:
#        with open(cwd + '/' + directoryTrans + 'Trans.txt', 'a') as Transdok:
#            print(Indok)
#            lines = []
#            for line in Indok:
#                lines += [str(line)[:-1]]
#            #print(lines)
#            Transdok.write(' '.join(lines))
#            ##print([lin])
#            ##print([str(line)[:-1]])
#            ##print(lines)
#            Transdok.write('\n')
def dok_to_words( raw_comment ):
    # Function to convert a raw comment to a string of words
    # The input is a single string (a raw comment), and
    # the output is a single string (a preprocessed comment)
    #
    # 1. Remove HTML
    #comment_text = BeautifulSoup(raw_comment, "html.parser").get_text()
    #
    # 2. Remove non-letters (keeping ü, ä, ö and ß)
    letters_only = re.sub("[^a-zA-Züäöß]", " ", raw_comment)
    #
    # 3. Split into individual words (no lower-casing is done here)
    words = letters_only.split()
    #print('words', words)
    #
    # 4. In Python, searching a set is much faster than searching
    # a list, so convert the stop words to a set
    #stops = set(stopwords.words("german"))
    #
    # 5. Remove stop words (currently disabled, so all words are kept)
    #meaningful_words = [w for w in words if not w in stops]
    meaningful_words = [w for w in words]
    #
    # 6. Join the words back into one string separated by space,
    # and return the result.
    return( " ".join( meaningful_words ))
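# Illustrative example (not part of the original script): with the regex above,
# digits and punctuation are replaced by spaces, so e.g.
#   dok_to_words("Das Gericht, entschied 2021!")  ->  "Das Gericht entschied"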
##########################################################################
# Initialize stemmer:
stemmer = SnowballStemmer("german")
print('loading model...')
model = gensim.models.KeyedVectors.load_word2vec_format('german.model.big', binary=True) # C binary format
print('done')
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
directoryIn = sys.argv[1]
cwd = os.getcwd()
rechtsprechIn = os.listdir(cwd + '/' + directoryIn)
try:
    hkl.dump(rechtsprechIn, 'rechtsprechIn_gzip.hkl', mode='w', compression='gzip')
except MemoryError:
    print('There was a MemoryError regarding the hdf5 dump, saving the directory as a table')
    pass
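# Note (illustrative, not in the original script): the file list saved above
# can later be restored with hickle, e.g.
#   rechtsprechIn = hkl.load('rechtsprechIn_gzip.hkl')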
counter = 0
train = []
print('writing every document as one line in a textfile ')
# Every line of a document is treated as one token: the cleaned line, its
# stem and the stems of its nearest word2vec neighbours are collected.
for rechtsprech in rechtsprechIn:
    train_spacy = []
    with open(cwd + '/' + directoryIn + rechtsprech) as Indok:
        lines = []
        for line in Indok:
            if len(str(line)[:-1]) >= 3:
                #print(n)
                word = dok_to_words(str(line)[:-1])
                train_spacy.append(word)
                train_spacy.append(stemmer.stem(word))
                synonyms = []
                try:
                    momo = model.similar_by_word(word, topn=9)
                    for element in momo:
                        synonyms.append(element[0])
                    #print(synonyms)
                    #print(type(synonyms))
                    train_spacy += [stemmer.stem(synonym) for synonym in synonyms]
                except KeyError:
                    print('the word ' + word + ' was not in the vocab')
                    pass
                #print(synonyms , '\n')
                #print(word , '\n')
                #.append(word)
                #lines += [model.similar_by_word(str(line)[:-1])]
    counter += 1
    print(counter)
    setofwords = set(train_spacy)
    new_train_spacy = list(setofwords)
    train.append(' '.join(new_train_spacy))
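# At this point (descriptive note, not in the original script), train holds one
# string per document: the de-duplicated, space-joined set of cleaned tokens,
# their stems and the stems of their word2vec neighbours.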
#with open('30Out/data', 'a') as doc:
#    doc.write(str(train))
#print('oi', train ,'oi ')
#print(len(train))
#print(len(train[1]))
#momo = model.similar_by_word("Computer", topn=20)
#twenty = []
#for element in momo:
#    twenty.append(element[0])
#print(twenty)
#Initialize training data:
#train = pd.read_csv(cwd + '/' + directoryTrans + 'Trans.txt', delimiter='\n', header=None, engine='python')
#df = pd.DataFrame(train)
#print(df)
#print(df.shape)
#num_doks = train.size
#print(num_doks)
# Print the raw comment and then the output of get_text(), for
# comparison
#print('erste zeile',df[0].iloc[1])
#print("Cleaning and parsing the training set comments...\n")
#clean_train_doks = []
#for i in range( 0, num_doks ):
#    ## If the index is evenly divisible by 1000, print a message
#    #if( (i+1)%1000 == 0 ):
#    #    print("comment %d of %d\n" % ( i+1, num_doks ))
#    clean_train_doks.append( dok_to_words( str(train[0].iloc[i] )))
#print(clean_train_doks)
print("Creating the bag of words...\n")
from sklearn.feature_extraction.text import CountVectorizer
# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.
vectorizer = CountVectorizer(analyzer = "word",
                             tokenizer = None,
                             preprocessor = None,
                             stop_words = None,
                             max_features = 20000)
# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of
# strings.
train_data_features = vectorizer.fit_transform(train)
# Persist the fitted vectorizer with joblib (imported directly, since
# sklearn.externals.joblib is no longer available in current scikit-learn).
import joblib
joblib.dump(vectorizer, 'bagofwords.pkl')
# Numpy arrays are easy to work with, so convert the result to an
# array
train_data_features = train_data_features.toarray()
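# Quick sanity check (illustrative, not in the original script): the shape of
# the document-term matrix and the learned vocabulary can be inspected with
#   print(train_data_features.shape)
#   print(vectorizer.get_feature_names_out()[:10])   # get_feature_names() on older scikit-learn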
try:
    hkl.dump(train_data_features, 'OnesZerosDB_gzip.hkl', mode='w', compression='gzip')
except MemoryError:
    print('There was a MemoryError regarding the hdf5 dump')
    pass
try:
    with open('OnesZerosDB.npy', 'wb') as file_numpy:
        np.save(file_numpy, train_data_features)
except Exception as e:
    print(traceback.format_exception(*sys.exc_info()))
    raise # reraises the exception
try:
    with open('OnesZerosDB.bin', 'wb') as file_pi:
        cPickle.dump(train_data_features, file_pi)
except MemoryError:
    print('There was a MemoryError regarding the cPickle dump')
    pass
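# Loading the saved feature matrix back (illustrative, not in the original
# script); any of the dumps above can be used, e.g.
#   train_data_features = np.load('OnesZerosDB.npy')
#   # or: train_data_features = hkl.load('OnesZerosDB_gzip.hkl')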
#print(len(train_data_features))
#for m in train_data_features[1]:
#    print(m)