# Import the pandas package, then use the "read_csv" function to read
# the labeled training data
import numpy as np
import pandas as pd
#from bs4 import BeautifulSoup
import re
#import nltk
from nltk.stem.snowball import SnowballStemmer
#from nltk.corpus import stopwords  # Import the stop word list

import hickle as hkl

from sklearn.linear_model import SGDClassifier
from sklearn import svm
import scipy
from sklearn import preprocessing

import sys
import os
import traceback

from gensim.models import word2vec
import logging
import gensim

import _pickle as cPickle

# Get the data directories

#directoryIn = sys.argv[1]
#directoryTrans = sys.argv[2]
#directoryOut = sys.argv[3]

#cwd = os.getcwd()

#rechtsprechIn = os.listdir(cwd + '/' + directoryIn)

#print('writing every document as one line in a textfile ')
#for rechtsprech in rechtsprechIn:
#    with open(cwd + '/' + directoryIn + rechtsprech) as Indok:
#        with open(cwd + '/' + directoryTrans + 'Trans.txt', 'a') as Transdok:
#            print(Indok)
#            lines = []
#            for line in Indok:
#                lines += [str(line)[:-1]]
#            print(lines)
#            Transdok.write(' '.join(lines))
#            ##print([lin])
#            ##print([str(line)[:-1]])
#            ##print(lines)
#            Transdok.write('\n')

def dok_to_words(raw_comment):
    # Function to convert a raw comment to a string of words.
    # The input is a single string (a raw comment), and
    # the output is a single string (a preprocessed comment).
    #
    # 1. Remove HTML
    #comment_text = BeautifulSoup(raw_comment, "html.parser").get_text()
    #
    # 2. Remove non-letters (digits and punctuation), keeping German umlauts and ß
    letters_only = re.sub("[^a-zA-Züäöß]", " ", raw_comment)
    #
    # 3. Split into individual words (note: case is not lowered here)
    words = letters_only.split()
    #print('words', words)
    #
    # 4. In Python, searching a set is much faster than searching
    #    a list, so convert the stop words to a set
    #stops = set(stopwords.words("german"))
    #
    # 5. Remove stop words (currently disabled: all words are kept)
    #meaningful_words = [w for w in words if not w in stops]
    meaningful_words = [w for w in words]
    #
    # 6. Join the words back into one string separated by space,
    #    and return the result.
    return " ".join(meaningful_words)

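# Example (illustrative): dok_to_words strips digits and punctuation but keeps
# umlauts and casing, e.g.
#   dok_to_words("Das Urteil wurde 2019 gefällt!")  ->  "Das Urteil wurde gefällt"
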
##########################################################################


# Initialize stemmer:
stemmer = SnowballStemmer("german")

print('loading model...')
model = gensim.models.KeyedVectors.load_word2vec_format('german.model.big', binary=True)  # C binary format
print('done')
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

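# Quick sanity check (illustrative; 'Gericht' is an assumed vocabulary entry,
# the actual vocabulary depends on the 'german.model.big' file):
#   model.similar_by_word('Gericht', topn=3)
#   -> list of (word, cosine similarity) tuples, e.g. [('Amtsgericht', 0.8...), ...]
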
directoryIn = sys.argv[1]

cwd = os.getcwd()

rechtsprechIn = os.listdir(os.path.join(cwd, directoryIn))

try:
    hkl.dump(rechtsprechIn, 'rechtsprechIn_gzip.hkl', mode='w', compression='gzip')
except MemoryError:
    print('There was a MemoryError regarding the hdf5 dump, saving the directory listing as a table')

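# The directory listing can be reloaded later with hickle (illustrative):
#   rechtsprechIn = hkl.load('rechtsprechIn_gzip.hkl')
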
counter = 0
train = []
print('writing every document as one line in a textfile ')
for rechtsprech in rechtsprechIn:
    train_spacy = []
    with open(os.path.join(cwd, directoryIn, rechtsprech)) as Indok:
        lines = []

        for line in Indok:
            if len(str(line)[:-1]) >= 3:
                #print(n)
                word = dok_to_words(str(line)[:-1])
                train_spacy.append(word)
                train_spacy.append(stemmer.stem(word))
                synonyms = []
                try:
                    momo = model.similar_by_word(word, topn=9)
                    for element in momo:
                        synonyms.append(element[0])
                    #print(synonyms)
                    #print(type(synonyms))
                    train_spacy += [stemmer.stem(synonym) for synonym in synonyms]
                except KeyError:
                    print('the word ' + word + ' was not in the vocab')

                #print(synonyms, '\n')
                #print(word, '\n')
                #.append(word)
                #lines += [model.similar_by_word(str(line)[:-1])]

    counter += 1
    print(counter)
    setofwords = set(train_spacy)
    new_train_spacy = list(setofwords)

    train.append(' '.join(new_train_spacy))

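# At this point each entry of 'train' is one document collapsed into a single
# space-joined string of unique tokens: the cleaned words, their stems, and the
# stems of their nearest word2vec neighbours (illustrative, order is arbitrary):
#   train[0] -> 'Urteil urteil landgericht amtsgericht ...'
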
#with open('30Out/data', 'a') as doc:
#    doc.write(str(train))

#print('oi', train, 'oi ')
#print(len(train))
#print(len(train[1]))

#momo = model.similar_by_word("Computer", topn=20)
#twenty = []
#for element in momo:
#    twenty.append(element[0])
#print(twenty)

# Initialize training data:
#train = pd.read_csv(cwd + '/' + directoryTrans + 'Trans.txt', delimiter='\n', header=None, engine='python')

#df = pd.DataFrame(train)
#print(df)
#print(df.shape)

#num_doks = train.size
#print(num_doks)

# Print the raw comment and then the output of get_text(), for
# comparison

#print('erste zeile', df[0].iloc[1])

#print("Cleaning and parsing the training set comments...\n")
#clean_train_doks = []
#for i in range(0, num_doks):
#    # If the index is evenly divisible by 1000, print a message
#    if (i+1) % 1000 == 0:
#        print("comment %d of %d\n" % (i+1, num_doks))
#    clean_train_doks.append(dok_to_words(str(train[0].iloc[i])))

#print(clean_train_doks)

print("Creating the bag of words...\n")
|
|
from sklearn.feature_extraction.text import CountVectorizer
|
|
|
|
# Initialize the "CountVectorizer" object, which is scikit-learn's
|
|
# bag of words tool.
|
|
vectorizer = CountVectorizer(analyzer = "word", \
|
|
tokenizer = None, \
|
|
preprocessor = None, \
|
|
stop_words = None, \
|
|
max_features = 20000)
|
|
|
|
|
|
# fit_transform() does two functions: First, it fits the model
|
|
# and learns the vocabulary; second, it transforms our training data
|
|
# into feature vectors. The input to fit_transform should be a list of
|
|
# strings.
|
|
train_data_features = vectorizer.fit_transform(train)
|
|
|
|
from sklearn.externals import joblib
|
|
|
|
joblib.dump(vectorizer, 'bagofwords.pkl')
|
|
|
|
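# The fitted vocabulary can be reloaded and applied to new documents later
# (illustrative sketch, assuming the file name used above):
#   vectorizer = joblib.load('bagofwords.pkl')
#   new_features = vectorizer.transform(['ein neues dokument'])  # sparse matrix with the same columns
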
# Numpy arrays are easy to work with, so convert the result to an
# array
train_data_features = train_data_features.toarray()

try:
    hkl.dump(train_data_features, 'OnesZerosDB_gzip.hkl', mode='w', compression='gzip')
except MemoryError:
    print('There was a MemoryError regarding the hdf5 dump')

try:
    with open('OnesZerosDB.npy', 'wb') as file_numpy:
        np.save(file_numpy, train_data_features)
except Exception:
    print(traceback.format_exception(*sys.exc_info()))
    raise  # re-raise the exception

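# Reloading the feature matrix later (illustrative):
#   train_data_features = np.load('OnesZerosDB.npy')
#   # or, from the compressed hickle dump: hkl.load('OnesZerosDB_gzip.hkl')
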
try:
    with open('OnesZerosDB.bin', 'wb') as file_pi:
        cPickle.dump(train_data_features, file_pi)
except MemoryError:
    print('There was a MemoryError regarding the cPickle dump')

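# The pickle dump can be read back the same way (illustrative):
#   with open('OnesZerosDB.bin', 'rb') as file_pi:
#       train_data_features = cPickle.load(file_pi)
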
#print(len(train_data_features))
#for m in train_data_features[1]:
#    print(m)