# Import the pandas package, then use the "read_csv" function to read
# the labeled training data
import numpy as np
import pandas as pd
#from bs4 import BeautifulSoup
import re
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords  # Import the stop word list
from sklearn.linear_model import SGDClassifier
from sklearn import svm
import scipy
from sklearn import preprocessing
import sys
import os
from gensim.models import word2vec
import logging
import gensim
import _pickle as cPickle

# Get the data directories
#directoryIn = sys.argv[1]
#directoryTrans = sys.argv[2]
#directoryOut = sys.argv[3]

#cwd = os.getcwd()
#rechtsprechIn = os.listdir(cwd + '/' + directoryIn)

#print('writing every document as one line in a textfile ')
#for rechtsprech in rechtsprechIn:
    #with open(cwd + '/' + directoryIn + rechtsprech) as Indok:
        #with open(cwd + '/' + directoryTrans + 'Trans.txt', 'a') as Transdok:
            #print(Indok)
            #lines = []
            #for line in Indok:
                #lines += [str(line)[:-1]]
            #print(lines)
            #Transdok.write(' '.join(lines))
            ##print([lin])
            ##print([str(line)[:-1]])
            ##print(lines)
            #Transdok.write('\n')


def dok_to_words(raw_comment):
    # Function to convert a raw comment to a string of words
    # The input is a single string (a raw comment), and
    # the output is a single string (a preprocessed comment)
    #
    # 1. Remove HTML
    #comment_text = BeautifulSoup(raw_comment, "html.parser").get_text()
    #
    # 2. Remove non-letters
    letters_only = re.sub("[^a-zA-Züäöß]", " ", raw_comment)
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()
    print('words', words)
    #
    # 4. In Python, searching a set is much faster than searching
    #    a list, so convert the stop words to a set
    #stops = set(stopwords.words("german"))
    #
    # 5. Remove stop words
    #meaningful_words = [w for w in words if not w in stops]
    meaningful_words = [w for w in words]
    #
    # 6. Join the words back into one string separated by space,
    #    and return the result.
    return " ".join(meaningful_words)
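# A small illustration (not part of the original pipeline): what dok_to_words
# returns for a made-up German sentence. Digits and punctuation are replaced by
# spaces, lower-case umlauts and ß survive the character class above, and the
# result is lower-cased. Note that upper-case umlauts (Ä, Ö, Ü) are not in the
# class and are stripped as well.
#   dok_to_words("Das Gericht hat am 1. März entschieden!")
#   -> 'das gericht hat am märz entschieden'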
##########################################################################

# Initialize stemmer:
stemmer = SnowballStemmer("german")

# Load the pre-trained word2vec vectors (C binary format)
print('loading model...')
model = gensim.models.KeyedVectors.load_word2vec_format('wiki.model.bin', binary=True)
print('done')

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

directoryIn = sys.argv[1]
cwd = os.getcwd()
rechtsprechIn = os.listdir(cwd + '/' + directoryIn)

# Read every document, clean it line by line, enrich each cleaned line with its
# two nearest word2vec neighbours ("synonyms"), and join everything into one
# string per document.
train = []
print('writing every document as one line in a textfile ')
for rechtsprech in rechtsprechIn:
    train_spacy = []
    with open(cwd + '/' + directoryIn + rechtsprech) as Indok:
        lines = []
        n = 0
        for line in Indok:
            if len(str(line)[:-1]) >= 3:
                n += 1
                print(n)
                word = dok_to_words(str(line)[:-1])
                train_spacy.append(word)
                synonyms = []
                try:
                    momo = model.similar_by_word(word, topn=2)
                    for element in momo:
                        synonyms.append(element[0])
                    train_spacy += synonyms
                except KeyError:
                    print('the word ' + word + ' was not in the vocab')
                    pass
                print(synonyms, '\n')
                print(word, '\n')
                #.append(word)
                #print(train_spacy)
                #lines += [model.similar_by_word(str(line)[:-1])]
    train.append(' '.join(train_spacy))
    print(train)

print(train)
print(len(train))
print(len(train[1]))

#momo = model.similar_by_word("Computer", topn=20)
#twenty = []
#for element in momo:
    #twenty.append(element[0])
#print(twenty)

# Initialize training data:
#train = pd.read_csv(cwd + '/' + directoryTrans + 'Trans.txt', delimiter='\n', header=None, engine='python')
#df = pd.DataFrame(train)
#print(df)
#print(df.shape)
#num_doks = train.size
#print(num_doks)

# Print the raw comment and then the output of get_text(), for
# comparison
#print('first line', df[0].iloc[1])

#print("Cleaning and parsing the training set comments...\n")
#clean_train_doks = []
#for i in range(0, num_doks):
    ## If the index is evenly divisible by 1000, print a message
    #if (i+1) % 1000 == 0:
        #print("comment %d of %d\n" % (i+1, num_doks))
    #clean_train_doks.append(dok_to_words(str(train[0].iloc[i])))
#print(clean_train_doks)

print("Creating the bag of words...\n")
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.
vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words=None,
                             max_features=9000)

# fit_transform() does two things: first, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of
# strings.
train_data_features = vectorizer.fit_transform(train)

# Persist the fitted vectorizer so the same vocabulary can be reused later.
# Note: sklearn.externals.joblib was removed in scikit-learn 0.23; on newer
# versions use "import joblib" instead.
from sklearn.externals import joblib
joblib.dump(vectorizer, 'bagofwords.pkl')

# Numpy arrays are easy to work with, so convert the result to an
# array
train_data_features = train_data_features.toarray()

# Dump the feature matrix in three formats: compressed hickle (HDF5),
# plain numpy, and cPickle.
import hickle as hkl
try:
    hkl.dump(train_data_features, 'OnesZerosDB_gzip.hkl', mode='w', compression='gzip')
except MemoryError:
    print('There was a MemoryError during the hdf5 dump')
    pass

import traceback
try:
    with open('OnesZerosDB.npy', 'wb') as file_numpy:
        np.save(file_numpy, train_data_features)
except Exception as e:
    print(traceback.format_exception(*sys.exc_info()))
    raise  # re-raise the exception

try:
    with open('OnesZerosDB.bin', 'wb') as file_pi:
        cPickle.dump(train_data_features, file_pi)
except MemoryError:
    print('There was a MemoryError during the cPickle dump')
    pass

#print(len(train_data_features))
#for m in train_data_features[1]:
    #print(m)
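# Optional round-trip check (a sketch, not part of the original workflow):
# reload the vectorizer and the numpy dump written above to confirm they can be
# read back by a later training script. File names match the dumps above.
vectorizer_reloaded = joblib.load('bagofwords.pkl')
features_reloaded = np.load('OnesZerosDB.npy')
print('reloaded feature matrix with shape', features_reloaded.shape)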