# Import the pandas package, then use the "read_csv" function to read
# the labeled training data

import numpy as np  
import pandas as pd 
#from bs4 import BeautifulSoup  
import re
#import nltk

from nltk.stem.snowball import SnowballStemmer
#from nltk.corpus import stopwords # Import the stop word list

import hickle as hkl

from sklearn.linear_model import SGDClassifier
from sklearn import svm
import scipy
from sklearn import preprocessing

import sys
import os

from gensim.models import word2vec
import logging
import gensim

import _pickle as cPickle

# Get the data directories

#directoryIn = sys.argv[1]
#directoryTrans = sys.argv[2]
#directoryOut = sys.argv[3]

#cwd = os.getcwd()

#rechtsprechIn = os.listdir(cwd + '/' + directoryIn)

#print('writing every document as one line in a textfile ')
#for rechtsprech in rechtsprechIn:
    
    #with open(cwd + '/' + directoryIn + rechtsprech) as Indok:
        #with open(cwd + '/' + directoryTrans + 'Trans.txt', 'a') as Transdok:
            #print(Indok)
            #lines = []
            #for line in Indok:
                #lines += [str(line)[:-1]]
            #print(lines)
            #Transdok.write(' '.join(lines))
                ##print([lin])
                ##print([str(line)[:-1]])
            ##print(lines)
            #Transdok.write('\n')


def dok_to_words( raw_comment ):
    # Function to convert a raw comment to a string of words
    # The input is a single string (a raw comment), and 
    # the output is a single string (a preprocessed comment)
    #
    # 1. Remove HTML
    #comment_text = BeautifulSoup(raw_comment, "html.parser").get_text() 
    #
    # 2. Remove non-letters        
    letters_only = re.sub("[^a-zA-Züäöß]", " ", raw_comment) 
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.split()                             
    
    #print('words', words)
    #
    # 4. In Python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    #stops = set(stopwords.words("german"))                  
    # 
    # 5. Remove stop words
    #meaningful_words = [w for w in words if not w in stops]   
    meaningful_words = [w for w in words]   
    #
    # 6. Join the words back into one string separated by space, 
    # and return the result.
    return( " ".join( meaningful_words ))   


##########################################################################


#Initialize stemme r:
stemmer = SnowballStemmer("german")

print('loading model...')
model = gensim.models.KeyedVectors.load_word2vec_format('german.model.big', binary=True)  # C binary format
print('done')
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


directoryIn = sys.argv[1]


cwd = os.getcwd()


rechtsprechIn = os.listdir(cwd + '/' + directoryIn)
try:
    hkl.dump(rechtsprechIn, 'rechtsprechIn_gzip.hkl', mode='w', compression='gzip')
except MemoryError:
    print('There was a memoryerror regarding the hdf5 dump, saving the directory as a table')
    pass


counter = 0
train = []
print('writing every document as one line in a textfile ')
for rechtsprech in rechtsprechIn:
    train_spacy = []
    with open(cwd + '/' + directoryIn + rechtsprech) as Indok:
        lines = []
        
        for line in Indok:
            if len(str(line)[:-1]) >= 3:
                
                #print(n)
                
                word = dok_to_words(str(line)[:-1])
                train_spacy.append(word)
                train_spacy.append(stemmer.stem(word))
                synonyms = []
                try:
                    momo = model.similar_by_word(word, topn=9)
                    
                    for element in momo:
                        synonyms.append(element[0])
                    
                    #print(synonyms)
                    #print(type(synonyms))
                    train_spacy += [stemmer.stem(synonym) for synonym in synonyms]
                except KeyError:
                    print('the word ' + word +' was not in the vocab')
                    pass
                
                
                #print(synonyms , '\n')
                #print(word , '\n')
                
                #.append(word)
                
            #lines += [model.similar_by_word(str(line)[:-1])]
    counter += 1
    print(counter)
    setofwords = set(train_spacy)
    new_train_spacy = list(setofwords)
    
    train.append(' '.join(new_train_spacy))


#with open('30Out/data', 'a') as doc:
#    doc.write(str(train))

        
#print('oi', train ,'oi ')
#print(len(train))
#print(len(train[1]))

#momo = model.similar_by_word("Computer", topn=20)
#twenty = []
#for element in momo:
    #twenty.append(element[0])
#print(twenty)

#Initialize training data:
#train = pd.read_csv(cwd + '/' + directoryTrans + 'Trans.txt', delimiter='\n', header=None, engine='python')

#df = pd.DataFrame(train)

#print(df)
#print(df.shape)


#num_doks = train.size

#print(num_doks)


# Print the raw comment and then the output of get_text(), for 
# comparison


#print('erste zeile',df[0].iloc[1])


#print("Cleaning and parsing the training set comments...\n")
#clean_train_doks = []
#for i in range( 0, num_doks ):
    ## If the index is evenly divisible by 1000, print a message
    #if( (i+1)%1000 == 0 ):
        #print("comment %d of %d\n" % ( i+1, num_doks ))                                                                    
    #clean_train_doks.append( dok_to_words( str(train[0].iloc[i] )))
    
    
#print(clean_train_doks)

print("Creating the bag of words...\n")
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.  
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 20000)


# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of 
# strings.
train_data_features = vectorizer.fit_transform(train)

from sklearn.externals import joblib

joblib.dump(vectorizer, 'bagofwords.pkl') 


# Numpy arrays are easy to work with, so convert the result to an 
# array
train_data_features = train_data_features.toarray()

try:
    hkl.dump(train_data_features, 'OnesZerosDB_gzip.hkl', mode='w', compression='gzip')
except MemoryError:
    print('There was a memoryerror regarding the hdf5 dump')
    pass

try:
    file_numpy = open('OnesZerosDB.npy', 'wb')
    np.save(file_numpy, train_data_features)
except Exception as e:
    print(traceback.format_exception(*sys.exc_info()))
    raise # reraises the exception


try:
    file_pi = open('OnesZerosDB.bin', 'wb')
    
    cPickle.dump(train_data_features, file_pi)
except MemoryError:
    print('There was a memorerror regarding the cpickle dump')
    pass


#print(len(train_data_features))
#for m in train_data_features[1]:
    #print(m)