# Import the pandas package, then use the "read_csv" function to read
# the labeled training data
import numpy as np
import pandas as pd
#from bs4 import BeautifulSoup
import re
#import nltk
from nltk.stem.snowball import SnowballStemmer
#from nltk.corpus import stopwords  # Import the stop word list

import hickle as hkl

from sklearn.linear_model import SGDClassifier
from sklearn import svm
import scipy
from sklearn import preprocessing

import sys
import os
import traceback

from gensim.models import word2vec
import logging
import gensim

import _pickle as cPickle

# Get the data directories

#directoryIn = sys.argv[1]
#directoryTrans = sys.argv[2]
#directoryOut = sys.argv[3]

#cwd = os.getcwd()

#rechtsprechIn = os.listdir(cwd + '/' + directoryIn)

#print('writing every document as one line in a textfile ')
#for rechtsprech in rechtsprechIn:
#    with open(cwd + '/' + directoryIn + rechtsprech) as Indok:
#        with open(cwd + '/' + directoryTrans + 'Trans.txt', 'a') as Transdok:
#            print(Indok)
#            lines = []
#            for line in Indok:
#                lines += [str(line)[:-1]]
#            print(lines)
#            Transdok.write(' '.join(lines))
#            ##print([lin])
#            ##print([str(line)[:-1]])
#            ##print(lines)
#            Transdok.write('\n')

def dok_to_words(raw_comment):
    # Function to convert a raw comment to a string of words.
    # The input is a single string (a raw comment), and
    # the output is a single string (a preprocessed comment).
    #
    # 1. Remove HTML
    #comment_text = BeautifulSoup(raw_comment, "html.parser").get_text()
    #
    # 2. Remove non-letters (digits and punctuation), keeping German umlauts and ß
    letters_only = re.sub("[^a-zA-Züäöß]", " ", raw_comment)
    #
    # 3. Split into individual words (note: case is not lowered here)
    words = letters_only.split()
    #print('words', words)
    #
    # 4. In Python, searching a set is much faster than searching
    #    a list, so convert the stop words to a set
    #stops = set(stopwords.words("german"))
    #
    # 5. Remove stop words (currently disabled: all words are kept)
    #meaningful_words = [w for w in words if not w in stops]
    meaningful_words = [w for w in words]
    #
    # 6. Join the words back into one string separated by space,
    #    and return the result.
    return " ".join(meaningful_words)

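# Example (illustrative): dok_to_words strips digits and punctuation but keeps
# umlauts and casing, e.g.
#   dok_to_words("Das Urteil wurde 2019 gefällt!")  ->  "Das Urteil wurde gefällt"
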
##########################################################################


# Initialize stemmer:
stemmer = SnowballStemmer("german")

print('loading model...')
model = gensim.models.KeyedVectors.load_word2vec_format('german.model.big', binary=True)  # C binary format
print('done')
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

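# Quick sanity check (illustrative; 'Gericht' is an assumed vocabulary entry,
# the actual vocabulary depends on the 'german.model.big' file):
#   model.similar_by_word('Gericht', topn=3)
#   -> list of (word, cosine similarity) tuples, e.g. [('Amtsgericht', 0.8...), ...]
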
directoryIn = sys.argv[1]

cwd = os.getcwd()

rechtsprechIn = os.listdir(os.path.join(cwd, directoryIn))

try:
    hkl.dump(rechtsprechIn, 'rechtsprechIn_gzip.hkl', mode='w', compression='gzip')
except MemoryError:
    print('There was a MemoryError regarding the hdf5 dump, saving the directory listing as a table')

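# The directory listing can be reloaded later with hickle (illustrative):
#   rechtsprechIn = hkl.load('rechtsprechIn_gzip.hkl')
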
counter = 0
train = []
print('writing every document as one line in a textfile ')
for rechtsprech in rechtsprechIn:
    train_spacy = []
    with open(os.path.join(cwd, directoryIn, rechtsprech)) as Indok:
        lines = []

        for line in Indok:
            if len(str(line)[:-1]) >= 3:
                #print(n)
                word = dok_to_words(str(line)[:-1])
                train_spacy.append(word)
                train_spacy.append(stemmer.stem(word))
                synonyms = []
                try:
                    momo = model.similar_by_word(word, topn=9)
                    for element in momo:
                        synonyms.append(element[0])
                    #print(synonyms)
                    #print(type(synonyms))
                    train_spacy += [stemmer.stem(synonym) for synonym in synonyms]
                except KeyError:
                    print('the word ' + word + ' was not in the vocab')

                #print(synonyms, '\n')
                #print(word, '\n')
                #.append(word)
                #lines += [model.similar_by_word(str(line)[:-1])]

    counter += 1
    print(counter)
    setofwords = set(train_spacy)
    new_train_spacy = list(setofwords)

    train.append(' '.join(new_train_spacy))

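# At this point each entry of 'train' is one document collapsed into a single
# space-joined string of unique tokens: the cleaned words, their stems, and the
# stems of their nearest word2vec neighbours (illustrative, order is arbitrary):
#   train[0] -> 'Urteil urteil landgericht amtsgericht ...'
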
#with open('30Out/data', 'a') as doc:
#    doc.write(str(train))

#print('oi', train, 'oi ')
#print(len(train))
#print(len(train[1]))

#momo = model.similar_by_word("Computer", topn=20)
#twenty = []
#for element in momo:
#    twenty.append(element[0])
#print(twenty)

# Initialize training data:
#train = pd.read_csv(cwd + '/' + directoryTrans + 'Trans.txt', delimiter='\n', header=None, engine='python')

#df = pd.DataFrame(train)
#print(df)
#print(df.shape)

#num_doks = train.size
#print(num_doks)

# Print the raw comment and then the output of get_text(), for
# comparison

#print('erste zeile', df[0].iloc[1])

#print("Cleaning and parsing the training set comments...\n")
#clean_train_doks = []
#for i in range(0, num_doks):
#    # If the index is evenly divisible by 1000, print a message
#    if (i+1) % 1000 == 0:
#        print("comment %d of %d\n" % (i+1, num_doks))
#    clean_train_doks.append(dok_to_words(str(train[0].iloc[i])))

#print(clean_train_doks)

print("Creating the bag of words...\n")
|
|
from sklearn.feature_extraction.text import CountVectorizer
|
|
|
|
# Initialize the "CountVectorizer" object, which is scikit-learn's
|
|
# bag of words tool.
|
|
vectorizer = CountVectorizer(analyzer = "word", \
|
|
tokenizer = None, \
|
|
preprocessor = None, \
|
|
stop_words = None, \
|
|
max_features = 20000)
|
|
|
|
|
|
# fit_transform() does two functions: First, it fits the model
|
|
# and learns the vocabulary; second, it transforms our training data
|
|
# into feature vectors. The input to fit_transform should be a list of
|
|
# strings.
|
|
train_data_features = vectorizer.fit_transform(train)
|
|
|
|
from sklearn.externals import joblib
|
|
|
|
joblib.dump(vectorizer, 'bagofwords.pkl')
|
|
|
|
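# The fitted vocabulary can be reloaded and applied to new documents later
# (illustrative sketch, assuming the file name used above):
#   vectorizer = joblib.load('bagofwords.pkl')
#   new_features = vectorizer.transform(['ein neues dokument'])  # sparse matrix with the same columns
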
# Numpy arrays are easy to work with, so convert the result to an
# array
train_data_features = train_data_features.toarray()

try:
    hkl.dump(train_data_features, 'OnesZerosDB_gzip.hkl', mode='w', compression='gzip')
except MemoryError:
    print('There was a MemoryError regarding the hdf5 dump')

try:
    with open('OnesZerosDB.npy', 'wb') as file_numpy:
        np.save(file_numpy, train_data_features)
except Exception:
    print(traceback.format_exception(*sys.exc_info()))
    raise  # re-raise the exception

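# Reloading the feature matrix later (illustrative):
#   train_data_features = np.load('OnesZerosDB.npy')
#   # or, from the compressed hickle dump: hkl.load('OnesZerosDB_gzip.hkl')
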
try:
    with open('OnesZerosDB.bin', 'wb') as file_pi:
        cPickle.dump(train_data_features, file_pi)
except MemoryError:
    print('There was a MemoryError regarding the cPickle dump')

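# The pickle dump can be read back the same way (illustrative):
#   with open('OnesZerosDB.bin', 'rb') as file_pi:
#       train_data_features = cPickle.load(file_pi)
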
#print(len(train_data_features))
#for m in train_data_features[1]:
#    print(m)