# laywerrobot/bagofwords/w2vBagOfWords.py

# Build a bag-of-words representation of a directory of documents:
# each line of every document is cleaned, expanded with word2vec
# synonyms, vectorized with CountVectorizer, and the resulting feature
# matrix is written to disk in several formats.
import numpy as np
import pandas as pd
#from bs4 import BeautifulSoup
import re
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords # Import the stop word list
from sklearn.linear_model import SGDClassifier
from sklearn import svm
import scipy
from sklearn import preprocessing
import sys
import os
from gensim.models import word2vec
import logging
import gensim
import _pickle as cPickle
import traceback
# Get the data directories
#directoryIn = sys.argv[1]
#directoryTrans = sys.argv[2]
#directoryOut = sys.argv[3]
#cwd = os.getcwd()
#rechtsprechIn = os.listdir(cwd + '/' + directoryIn)
#print('writing every document as one line in a textfile ')
#for rechtsprech in rechtsprechIn:
    #with open(cwd + '/' + directoryIn + rechtsprech) as Indok:
        #with open(cwd + '/' + directoryTrans + 'Trans.txt', 'a') as Transdok:
            #print(Indok)
            #lines = []
            #for line in Indok:
                #lines += [str(line)[:-1]]
            #print(lines)
            #Transdok.write(' '.join(lines))
            ##print([lin])
            ##print([str(line)[:-1]])
            ##print(lines)
            #Transdok.write('\n')
def dok_to_words( raw_comment ):
    # Function to convert a raw comment to a string of words
    # The input is a single string (a raw comment), and
    # the output is a single string (a preprocessed comment)
    #
    # 1. Remove HTML
    #comment_text = BeautifulSoup(raw_comment, "html.parser").get_text()
    #
    # 2. Remove non-letters
    letters_only = re.sub("[^a-zA-Züäöß]", " ", raw_comment)
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()
    print('words', words)
    #
    # 4. In Python, searching a set is much faster than searching
    #    a list, so convert the stop words to a set
    #stops = set(stopwords.words("german"))
    #
    # 5. Remove stop words
    #meaningful_words = [w for w in words if not w in stops]
    meaningful_words = [w for w in words]
    #
    # 6. Join the words back into one string separated by space,
    #    and return the result.
    return( " ".join( meaningful_words ))
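# A small illustration (not part of the original pipeline) of what
# dok_to_words returns; it follows directly from the regex and the
# lower-casing above:
#   dok_to_words("Das Urteil vom 12.03.2018!")  ->  "das urteil vom"
# Note that the character class keeps only lower-case umlauts, so
# upper-case "Ä", "Ö", "Ü" are replaced by spaces before lower-casing
# (e.g. "Änderung" becomes "nderung").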
##########################################################################
#Initialize stemmer:
stemmer = SnowballStemmer("german")
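# The word2vec vectors are expected in 'wiki.model.bin' (C binary format)
# in the current working directory; the model is only used further below
# for nearest-neighbour lookups via similar_by_word().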
print('loading model...')
model = gensim.models.KeyedVectors.load_word2vec_format('wiki.model.bin', binary=True) # C binary format
print('done')
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
directoryIn = sys.argv[1]
cwd = os.getcwd()
rechtsprechIn = os.listdir(cwd + '/' + directoryIn)
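# For every document in the input directory: keep each raw line that is at
# least three characters long (without its trailing character), clean it with
# dok_to_words, try to expand it with its two nearest word2vec neighbours, and
# join the expanded tokens of the document into one string collected in `train`.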
train = []
print('writing every document as one line in a textfile ')
for rechtsprech in rechtsprechIn:
    train_spacy = []
    with open(cwd + '/' + directoryIn + rechtsprech) as Indok:
        lines = []
        n = 0
        for line in Indok:
            if len(str(line)[:-1]) >= 3:
                n += 1
                print(n)
                word = dok_to_words(str(line)[:-1])
                train_spacy.append(word)
                synonyms = []
                try:
                    momo = model.similar_by_word(word, topn=2)
                    for element in momo:
                        synonyms.append(element[0])
                    train_spacy += synonyms
                except KeyError:
                    print('the word ' + word + ' was not in the vocab')
                    pass
                print(synonyms, '\n')
                print(word, '\n')
                #.append(word)
                #print(train_spacy)
        #lines += [model.similar_by_word(str(line)[:-1])]
    train.append(' '.join(train_spacy))
print(train)
print(len(train))
print(len(train[1]))
#momo = model.similar_by_word("Computer", topn=20)
#twenty = []
#for element in momo:
    #twenty.append(element[0])
#print(twenty)
#Initialize training data:
#train = pd.read_csv(cwd + '/' + directoryTrans + 'Trans.txt', delimiter='\n', header=None, engine='python')
#df = pd.DataFrame(train)
#print(df)
#print(df.shape)
#num_doks = train.size
#print(num_doks)
# Print the raw comment and then the output of get_text(), for
# comparison
#print('erste zeile',df[0].iloc[1])
#print("Cleaning and parsing the training set comments...\n")
#clean_train_doks = []
#for i in range( 0, num_doks ):
    ## If the index is evenly divisible by 1000, print a message
    #if( (i+1)%1000 == 0 ):
        #print("comment %d of %d\n" % ( i+1, num_doks ))
    #clean_train_doks.append( dok_to_words( str(train[0].iloc[i] )))
#print(clean_train_doks)
print("Creating the bag of words...\n")
from sklearn.feature_extraction.text import CountVectorizer
# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.
vectorizer = CountVectorizer(analyzer = "word",
                             tokenizer = None,
                             preprocessor = None,
                             stop_words = None,
                             max_features = 9000)
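# max_features=9000 caps the vocabulary at the 9000 most frequent tokens
# across the corpus, so every document becomes a count vector with at most
# 9000 dimensions.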
# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of
# strings.
train_data_features = vectorizer.fit_transform(train)
import joblib
joblib.dump(vectorizer, 'bagofwords.pkl')
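# Persisting the fitted vectorizer lets a later script transform unseen
# documents with exactly the same vocabulary and column order.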
# Numpy arrays are easy to work with, so convert the result to an
# array
train_data_features = train_data_features.toarray()
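# train_data_features now has shape (number of documents, vocabulary size);
# entry [i, j] counts how often vocabulary word j occurs in document i.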
import hickle as hkl
try:
    hkl.dump(train_data_features, 'OnesZerosDB_gzip.hkl', mode='w', compression='gzip')
except MemoryError:
    print('There was a memory error regarding the hdf5 dump')
    pass
try:
    file_numpy = open('OnesZerosDB.npy', 'wb')
    np.save(file_numpy, train_data_features)
except Exception as e:
    print(traceback.format_exception(*sys.exc_info()))
    raise  # reraises the exception
try:
    file_pi = open('OnesZerosDB.bin', 'wb')
    cPickle.dump(train_data_features, file_pi)
except MemoryError:
    print('There was a memory error regarding the cpickle dump')
    pass
#print(len(train_data_features))
#for m in train_data_features[1]:
    #print(m)
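# A minimal sketch (an assumption, not part of this script) of how the saved
# artifacts could be reloaded by a downstream script:
#
#   import joblib
#   import numpy as np
#   vectorizer = joblib.load('bagofwords.pkl')
#   train_data_features = np.load('OnesZerosDB.npy')
#   # vectorize a new, unseen document with the same vocabulary
#   new_features = vectorizer.transform(['ein neues dokument']).toarray()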