# Import the pandas package, then use the "read_csv" function to read
# the labeled training data
import numpy as np
import pandas as pd
#from bs4 import BeautifulSoup
import re
#import nltk
from nltk.stem.snowball import SnowballStemmer
#from nltk.corpus import stopwords # Import the stop word list
import hickle as hkl
from sklearn.linear_model import SGDClassifier
from sklearn import svm
import scipy
from sklearn import preprocessing
import sys
import os
import traceback
from gensim.models import word2vec
import logging
import gensim
import _pickle as cPickle
# Get the data directories
#directoryIn = sys.argv[1]
#directoryTrans = sys.argv[2]
#directoryOut = sys.argv[3]
#cwd = os.getcwd()
#rechtsprechIn = os.listdir(cwd + '/' + directoryIn)
#print('writing every document as one line in a textfile ')
#for rechtsprech in rechtsprechIn:
#    with open(cwd + '/' + directoryIn + rechtsprech) as Indok:
#        with open(cwd + '/' + directoryTrans + 'Trans.txt', 'a') as Transdok:
#            print(Indok)
#            lines = []
#            for line in Indok:
#                lines += [str(line)[:-1]]
#            #print(lines)
#            Transdok.write(' '.join(lines))
#            ##print([lin])
#            ##print([str(line)[:-1]])
#            ##print(lines)
#            Transdok.write('\n')
def dok_to_words( raw_comment ):
    # Function to convert a raw comment to a string of words
    # The input is a single string (a raw comment), and
    # the output is a single string (a preprocessed comment)
    #
    # 1. Remove HTML
    #comment_text = BeautifulSoup(raw_comment, "html.parser").get_text()
    #
    # 2. Remove non-letters (keeping ü, ä, ö and ß)
    letters_only = re.sub("[^a-zA-Züäöß]", " ", raw_comment)
    #
    # 3. Split into individual words (no lower-casing is done here)
    words = letters_only.split()
    #print('words', words)
    #
    # 4. In Python, searching a set is much faster than searching
    # a list, so convert the stop words to a set
    #stops = set(stopwords.words("german"))
    #
    # 5. Remove stop words (currently disabled, so all words are kept)
    #meaningful_words = [w for w in words if not w in stops]
    meaningful_words = [w for w in words]
    #
    # 6. Join the words back into one string separated by space,
    # and return the result.
    return( " ".join( meaningful_words ))
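# Illustrative example (not part of the original script): with the regex above,
# digits and punctuation are replaced by spaces, so e.g.
#   dok_to_words("Das Gericht, entschied 2021!")  ->  "Das Gericht entschied"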
##########################################################################
# Initialize stemmer:
stemmer = SnowballStemmer("german")
print('loading model...')
model = gensim.models.KeyedVectors.load_word2vec_format('german.model.big', binary=True) # C binary format
print('done')
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
directoryIn = sys.argv[1]
cwd = os.getcwd()
rechtsprechIn = os.listdir(cwd + '/' + directoryIn)
try:
    hkl.dump(rechtsprechIn, 'rechtsprechIn_gzip.hkl', mode='w', compression='gzip')
except MemoryError:
    print('There was a MemoryError regarding the hdf5 dump, saving the directory as a table')
    pass
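# Note (illustrative, not in the original script): the file list saved above
# can later be restored with hickle, e.g.
#   rechtsprechIn = hkl.load('rechtsprechIn_gzip.hkl')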
counter = 0
train = []
print('writing every document as one line in a textfile ')
# Every line of a document is treated as one token: the cleaned line, its
# stem and the stems of its nearest word2vec neighbours are collected.
for rechtsprech in rechtsprechIn:
    train_spacy = []
    with open(cwd + '/' + directoryIn + rechtsprech) as Indok:
        lines = []
        for line in Indok:
            if len(str(line)[:-1]) >= 3:
                #print(n)
                word = dok_to_words(str(line)[:-1])
                train_spacy.append(word)
                train_spacy.append(stemmer.stem(word))
                synonyms = []
                try:
                    momo = model.similar_by_word(word, topn=9)
                    for element in momo:
                        synonyms.append(element[0])
                    #print(synonyms)
                    #print(type(synonyms))
                    train_spacy += [stemmer.stem(synonym) for synonym in synonyms]
                except KeyError:
                    print('the word ' + word + ' was not in the vocab')
                    pass
                #print(synonyms , '\n')
                #print(word , '\n')
                #.append(word)
                #lines += [model.similar_by_word(str(line)[:-1])]
    counter += 1
    print(counter)
    setofwords = set(train_spacy)
    new_train_spacy = list(setofwords)
    train.append(' '.join(new_train_spacy))
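# At this point (descriptive note, not in the original script), train holds one
# string per document: the de-duplicated, space-joined set of cleaned tokens,
# their stems and the stems of their word2vec neighbours.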
#with open('30Out/data', 'a') as doc:
#    doc.write(str(train))
#print('oi', train ,'oi ')
#print(len(train))
#print(len(train[1]))
#momo = model.similar_by_word("Computer", topn=20)
#twenty = []
#for element in momo:
#    twenty.append(element[0])
#print(twenty)
#Initialize training data:
#train = pd.read_csv(cwd + '/' + directoryTrans + 'Trans.txt', delimiter='\n', header=None, engine='python')
#df = pd.DataFrame(train)
#print(df)
#print(df.shape)
#num_doks = train.size
#print(num_doks)
# Print the raw comment and then the output of get_text(), for
# comparison
#print('erste zeile',df[0].iloc[1])
#print("Cleaning and parsing the training set comments...\n")
#clean_train_doks = []
#for i in range( 0, num_doks ):
#    ## If the index is evenly divisible by 1000, print a message
#    #if( (i+1)%1000 == 0 ):
#    #    print("comment %d of %d\n" % ( i+1, num_doks ))
#    clean_train_doks.append( dok_to_words( str(train[0].iloc[i] )))
#print(clean_train_doks)
print("Creating the bag of words...\n")
from sklearn.feature_extraction.text import CountVectorizer
# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.
vectorizer = CountVectorizer(analyzer = "word",
                             tokenizer = None,
                             preprocessor = None,
                             stop_words = None,
                             max_features = 20000)
# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of
# strings.
train_data_features = vectorizer.fit_transform(train)
# Persist the fitted vectorizer with joblib (imported directly, since
# sklearn.externals.joblib is no longer available in current scikit-learn).
import joblib
joblib.dump(vectorizer, 'bagofwords.pkl')
# Numpy arrays are easy to work with, so convert the result to an
# array
train_data_features = train_data_features.toarray()
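# Quick sanity check (illustrative, not in the original script): the shape of
# the document-term matrix and the learned vocabulary can be inspected with
#   print(train_data_features.shape)
#   print(vectorizer.get_feature_names_out()[:10])   # get_feature_names() on older scikit-learn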
try:
    hkl.dump(train_data_features, 'OnesZerosDB_gzip.hkl', mode='w', compression='gzip')
except MemoryError:
    print('There was a MemoryError regarding the hdf5 dump')
    pass
try:
    with open('OnesZerosDB.npy', 'wb') as file_numpy:
        np.save(file_numpy, train_data_features)
except Exception as e:
    print(traceback.format_exception(*sys.exc_info()))
    raise # reraises the exception
try:
    with open('OnesZerosDB.bin', 'wb') as file_pi:
        cPickle.dump(train_data_features, file_pi)
except MemoryError:
    print('There was a MemoryError regarding the cPickle dump')
    pass
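# Loading the saved feature matrix back (illustrative, not in the original
# script); any of the dumps above can be used, e.g.
#   train_data_features = np.load('OnesZerosDB.npy')
#   # or: train_data_features = hkl.load('OnesZerosDB_gzip.hkl')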
#print(len(train_data_features))
#for m in train_data_features[1]:
#    print(m)