# Read every document in the input directory, expand each token with its
# stem and the stems of its word2vec neighbours, and build a bag-of-words
# feature matrix from the result.
import numpy as np
import pandas as pd
#from bs4 import BeautifulSoup
import re
#import nltk
from nltk.stem.snowball import SnowballStemmer
#from nltk.corpus import stopwords  # Import the stop word list
import hickle as hkl
from sklearn.linear_model import SGDClassifier
from sklearn import svm
import scipy
from sklearn import preprocessing
import sys
import os
import traceback
from gensim.models import word2vec
import logging
import gensim
import _pickle as cPickle


def dok_to_words(raw_comment):
    # Function to convert a raw comment to a string of words.
    # The input is a single string (a raw comment), and
    # the output is a single string (a preprocessed comment).
    #
    # 1. Remove HTML (disabled)
    #comment_text = BeautifulSoup(raw_comment, "html.parser").get_text()
    #
    # 2. Remove non-letters (German umlauts and ß are kept)
    letters_only = re.sub("[^a-zA-Züäöß]", " ", raw_comment)
    #
    # 3. Split into individual words (no lower-casing, so the word2vec
    # lookup below sees the original case)
    words = letters_only.split()
    #
    # 4. In Python, searching a set is much faster than searching
    # a list, so convert the stop words to a set
    #stops = set(stopwords.words("german"))
    #
    # 5. Remove stop words (disabled, so all words are kept)
    #meaningful_words = [w for w in words if not w in stops]
    meaningful_words = [w for w in words]
    #
    # 6. Join the words back into one string separated by space,
    # and return the result.
return( " ".join( meaningful_words )) ########################################################################## #Initialize stemme r: stemmer = SnowballStemmer("german") print('loading model...') model = gensim.models.KeyedVectors.load_word2vec_format('german.model.big', binary=True) # C binary format print('done') logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) directoryIn = sys.argv[1] cwd = os.getcwd() rechtsprechIn = os.listdir(cwd + '/' + directoryIn) try: hkl.dump(rechtsprechIn, 'rechtsprechIn_gzip.hkl', mode='w', compression='gzip') except MemoryError: print('There was a memoryerror regarding the hdf5 dump, saving the directory as a table') pass counter = 0 train = [] print('writing every document as one line in a textfile ') for rechtsprech in rechtsprechIn: train_spacy = [] with open(cwd + '/' + directoryIn + rechtsprech) as Indok: lines = [] for line in Indok: if len(str(line)[:-1]) >= 3: #print(n) word = dok_to_words(str(line)[:-1]) train_spacy.append(word) train_spacy.append(stemmer.stem(word)) synonyms = [] try: momo = model.similar_by_word(word, topn=9) for element in momo: synonyms.append(element[0]) #print(synonyms) #print(type(synonyms)) train_spacy += [stemmer.stem(synonym) for synonym in synonyms] except KeyError: print('the word ' + word +' was not in the vocab') pass #print(synonyms , '\n') #print(word , '\n') #.append(word) #lines += [model.similar_by_word(str(line)[:-1])] counter += 1 print(counter) setofwords = set(train_spacy) new_train_spacy = list(setofwords) train.append(' '.join(new_train_spacy)) #with open('30Out/data', 'a') as doc: # doc.write(str(train)) #print('oi', train ,'oi ') #print(len(train)) #print(len(train[1])) #momo = model.similar_by_word("Computer", topn=20) #twenty = [] #for element in momo: #twenty.append(element[0]) #print(twenty) #Initialize training data: #train = pd.read_csv(cwd + '/' + directoryTrans + 'Trans.txt', delimiter='\n', header=None, engine='python') #df = pd.DataFrame(train) #print(df) #print(df.shape) #num_doks = train.size #print(num_doks) # Print the raw comment and then the output of get_text(), for # comparison #print('erste zeile',df[0].iloc[1]) #print("Cleaning and parsing the training set comments...\n") #clean_train_doks = [] #for i in range( 0, num_doks ): ## If the index is evenly divisible by 1000, print a message #if( (i+1)%1000 == 0 ): #print("comment %d of %d\n" % ( i+1, num_doks )) #clean_train_doks.append( dok_to_words( str(train[0].iloc[i] ))) #print(clean_train_doks) print("Creating the bag of words...\n") from sklearn.feature_extraction.text import CountVectorizer # Initialize the "CountVectorizer" object, which is scikit-learn's # bag of words tool. vectorizer = CountVectorizer(analyzer = "word", \ tokenizer = None, \ preprocessor = None, \ stop_words = None, \ max_features = 20000) # fit_transform() does two functions: First, it fits the model # and learns the vocabulary; second, it transforms our training data # into feature vectors. The input to fit_transform should be a list of # strings. 
train_data_features = vectorizer.fit_transform(train)

from sklearn.externals import joblib
joblib.dump(vectorizer, 'bagofwords.pkl')

# Numpy arrays are easy to work with, so convert the result to an array
train_data_features = train_data_features.toarray()

try:
    hkl.dump(train_data_features, 'OnesZerosDB_gzip.hkl', mode='w', compression='gzip')
except MemoryError:
    print('There was a MemoryError during the hdf5 dump')

try:
    with open('OnesZerosDB.npy', 'wb') as file_numpy:
        np.save(file_numpy, train_data_features)
except Exception:
    print(traceback.format_exception(*sys.exc_info()))
    raise  # re-raise the exception

try:
    with open('OnesZerosDB.bin', 'wb') as file_pi:
        cPickle.dump(train_data_features, file_pi)
except MemoryError:
    print('There was a MemoryError during the cPickle dump')

#print(len(train_data_features))
#for m in train_data_features[1]:
#    print(m)
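# Usage sketch (an assumption about downstream use, not part of the pipeline
# above): the persisted vectorizer and feature matrix could be reloaded later
# to vectorize a new document against the same vocabulary.
#
#   from sklearn.externals import joblib
#   import numpy as np
#   vectorizer = joblib.load('bagofwords.pkl')
#   train_data_features = np.load('OnesZerosDB.npy')
#   new_doc = dok_to_words('Das Gericht hat entschieden')  # hypothetical input
#   new_features = vectorizer.transform([new_doc]).toarray()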