# laywerrobot/bagofwords/w2vBagOfWords.py

# Build a bag-of-words representation of a directory of documents:
# each line of every document is cleaned, expanded with word2vec
# synonyms, vectorized with CountVectorizer, and the resulting feature
# matrix is written to disk in several formats.
import numpy as np
import pandas as pd
#from bs4 import BeautifulSoup
import re
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords # Import the stop word list
from sklearn.linear_model import SGDClassifier
from sklearn import svm
import scipy
from sklearn import preprocessing
import sys
import os
from gensim.models import word2vec
import logging
import gensim
import _pickle as cPickle
import traceback
# Get the data directories
#directoryIn = sys.argv[1]
#directoryTrans = sys.argv[2]
#directoryOut = sys.argv[3]
#cwd = os.getcwd()
#rechtsprechIn = os.listdir(cwd + '/' + directoryIn)
#print('writing every document as one line in a textfile ')
#for rechtsprech in rechtsprechIn:
    #with open(cwd + '/' + directoryIn + rechtsprech) as Indok:
        #with open(cwd + '/' + directoryTrans + 'Trans.txt', 'a') as Transdok:
            #print(Indok)
            #lines = []
            #for line in Indok:
                #lines += [str(line)[:-1]]
            #print(lines)
            #Transdok.write(' '.join(lines))
            ##print([lin])
            ##print([str(line)[:-1]])
            ##print(lines)
            #Transdok.write('\n')
def dok_to_words( raw_comment ):
    # Function to convert a raw comment to a string of words
    # The input is a single string (a raw comment), and
    # the output is a single string (a preprocessed comment)
    #
    # 1. Remove HTML
    #comment_text = BeautifulSoup(raw_comment, "html.parser").get_text()
    #
    # 2. Remove non-letters
    letters_only = re.sub("[^a-zA-Züäöß]", " ", raw_comment)
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()
    print('words', words)
    #
    # 4. In Python, searching a set is much faster than searching
    #    a list, so convert the stop words to a set
    #stops = set(stopwords.words("german"))
    #
    # 5. Remove stop words
    #meaningful_words = [w for w in words if not w in stops]
    meaningful_words = [w for w in words]
    #
    # 6. Join the words back into one string separated by space,
    #    and return the result.
    return( " ".join( meaningful_words ))
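# A small illustration (not part of the original pipeline) of what
# dok_to_words returns; it follows directly from the regex and the
# lower-casing above:
#   dok_to_words("Das Urteil vom 12.03.2018!")  ->  "das urteil vom"
# Note that the character class keeps only lower-case umlauts, so
# upper-case "Ä", "Ö", "Ü" are replaced by spaces before lower-casing
# (e.g. "Änderung" becomes "nderung").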
##########################################################################
#Initialize stemmer:
stemmer = SnowballStemmer("german")
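# The word2vec vectors are expected in 'wiki.model.bin' (C binary format)
# in the current working directory; the model is only used further below
# for nearest-neighbour lookups via similar_by_word().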
print('loading model...')
model = gensim.models.KeyedVectors.load_word2vec_format('wiki.model.bin', binary=True) # C binary format
print('done')
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
directoryIn = sys.argv[1]
cwd = os.getcwd()
rechtsprechIn = os.listdir(cwd + '/' + directoryIn)
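# For every document in the input directory: keep each raw line that is at
# least three characters long (without its trailing character), clean it with
# dok_to_words, try to expand it with its two nearest word2vec neighbours, and
# join the expanded tokens of the document into one string collected in `train`.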
train = []
print('writing every document as one line in a textfile ')
for rechtsprech in rechtsprechIn:
    train_spacy = []
    with open(cwd + '/' + directoryIn + rechtsprech) as Indok:
        lines = []
        n = 0
        for line in Indok:
            if len(str(line)[:-1]) >= 3:
                n += 1
                print(n)
                word = dok_to_words(str(line)[:-1])
                train_spacy.append(word)
                synonyms = []
                try:
                    momo = model.similar_by_word(word, topn=2)
                    for element in momo:
                        synonyms.append(element[0])
                    train_spacy += synonyms
                except KeyError:
                    print('the word ' + word + ' was not in the vocab')
                    pass
                print(synonyms, '\n')
                print(word, '\n')
                #.append(word)
                #print(train_spacy)
        #lines += [model.similar_by_word(str(line)[:-1])]
    train.append(' '.join(train_spacy))
print(train)
print(len(train))
print(len(train[1]))
#momo = model.similar_by_word("Computer", topn=20)
#twenty = []
#for element in momo:
    #twenty.append(element[0])
#print(twenty)
#Initialize training data:
#train = pd.read_csv(cwd + '/' + directoryTrans + 'Trans.txt', delimiter='\n', header=None, engine='python')
#df = pd.DataFrame(train)
#print(df)
#print(df.shape)
#num_doks = train.size
#print(num_doks)
# Print the raw comment and then the output of get_text(), for
# comparison
#print('erste zeile',df[0].iloc[1])
#print("Cleaning and parsing the training set comments...\n")
#clean_train_doks = []
#for i in range( 0, num_doks ):
    ## If the index is evenly divisible by 1000, print a message
    #if( (i+1)%1000 == 0 ):
        #print("comment %d of %d\n" % ( i+1, num_doks ))
    #clean_train_doks.append( dok_to_words( str(train[0].iloc[i] )))
#print(clean_train_doks)
print("Creating the bag of words...\n")
from sklearn.feature_extraction.text import CountVectorizer
# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.
vectorizer = CountVectorizer(analyzer = "word",
                             tokenizer = None,
                             preprocessor = None,
                             stop_words = None,
                             max_features = 9000)
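# max_features=9000 caps the vocabulary at the 9000 most frequent tokens
# across the corpus, so every document becomes a count vector with at most
# 9000 dimensions.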
# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of
# strings.
train_data_features = vectorizer.fit_transform(train)
import joblib
joblib.dump(vectorizer, 'bagofwords.pkl')
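# Persisting the fitted vectorizer lets a later script transform unseen
# documents with exactly the same vocabulary and column order.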
# Numpy arrays are easy to work with, so convert the result to an
# array
train_data_features = train_data_features.toarray()
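# train_data_features now has shape (number of documents, vocabulary size);
# entry [i, j] counts how often vocabulary word j occurs in document i.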
import hickle as hkl
try:
    hkl.dump(train_data_features, 'OnesZerosDB_gzip.hkl', mode='w', compression='gzip')
except MemoryError:
    print('There was a memory error regarding the hdf5 dump')
    pass
try:
    file_numpy = open('OnesZerosDB.npy', 'wb')
    np.save(file_numpy, train_data_features)
except Exception as e:
    print(traceback.format_exception(*sys.exc_info()))
    raise  # reraises the exception
try:
    file_pi = open('OnesZerosDB.bin', 'wb')
    cPickle.dump(train_data_features, file_pi)
except MemoryError:
    print('There was a memory error regarding the cpickle dump')
    pass
#print(len(train_data_features))
#for m in train_data_features[1]:
    #print(m)
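# A minimal sketch (an assumption, not part of this script) of how the saved
# artifacts could be reloaded by a downstream script:
#
#   import joblib
#   import numpy as np
#   vectorizer = joblib.load('bagofwords.pkl')
#   train_data_features = np.load('OnesZerosDB.npy')
#   # vectorize a new, unseen document with the same vocabulary
#   new_features = vectorizer.transform(['ein neues dokument']).toarray()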