|
|
- # Read every document from the input directory, expand each token with its
- # Snowball stem and the stems of its nearest word2vec neighbours, build a
- # bag-of-words matrix with CountVectorizer, and save the vectorizer and the
- # resulting feature matrix to disk.
-
- import numpy as np
- import pandas as pd
- #from bs4 import BeautifulSoup
- import re
- #import nltk
-
- from nltk.stem.snowball import SnowballStemmer
- #from nltk.corpus import stopwords # Import the stop word list
-
- import hickle as hkl
-
- from sklearn.linear_model import SGDClassifier
- from sklearn import svm
- import scipy
- from sklearn import preprocessing
-
- import sys
- import os
- import traceback
-
- from gensim.models import word2vec
- import logging
- import gensim
-
- import _pickle as cPickle
-
- # Get the data directories
-
- #directoryIn = sys.argv[1]
- #directoryTrans = sys.argv[2]
- #directoryOut = sys.argv[3]
-
- #cwd = os.getcwd()
-
- #rechtsprechIn = os.listdir(cwd + '/' + directoryIn)
-
- #print('writing every document as one line in a textfile ')
- #for rechtsprech in rechtsprechIn:
-
- #with open(cwd + '/' + directoryIn + rechtsprech) as Indok:
- #with open(cwd + '/' + directoryTrans + 'Trans.txt', 'a') as Transdok:
- #print(Indok)
- #lines = []
- #for line in Indok:
- #lines += [str(line)[:-1]]
- #print(lines)
- #Transdok.write(' '.join(lines))
- ##print([lin])
- ##print([str(line)[:-1]])
- ##print(lines)
- #Transdok.write('\n')
-
-
-
- def dok_to_words( raw_comment ):
-     # Function to convert a raw comment to a string of words.
-     # The input is a single string (a raw comment), and
-     # the output is a single string (a preprocessed comment).
-     #
-     # 1. Remove HTML (disabled)
-     #comment_text = BeautifulSoup(raw_comment, "html.parser").get_text()
-     #
-     # 2. Remove non-letters (keep German umlauts and ß)
-     letters_only = re.sub("[^a-zA-ZäöüÄÖÜß]", " ", raw_comment)
-     #
-     # 3. Split into individual words (case is preserved)
-     words = letters_only.split()
-
-     #print('words', words)
-     #
-     # 4. In Python, searching a set is much faster than searching
-     # a list, so convert the stop words to a set
-     #stops = set(stopwords.words("german"))
-     #
-     # 5. Remove stop words (disabled: keep all words)
-     #meaningful_words = [w for w in words if not w in stops]
-     meaningful_words = [w for w in words]
-     #
-     # 6. Join the words back into one string separated by space,
-     # and return the result.
-     return( " ".join( meaningful_words ))
-
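- # A quick illustration of dok_to_words (the input string is made up, not taken
- # from the corpus):
- #   dok_to_words("Das Urteil vom 3. Mai 2016!")  ->  "Das Urteil vom Mai"
- # Digits and punctuation are replaced by spaces; case is preserved.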
-
-
- ##########################################################################
-
-
- # Initialize stemmer:
- stemmer = SnowballStemmer("german")
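- # The German Snowball stemmer lower-cases its input and strips inflection,
- # e.g. stemmer.stem('Urteile') and stemmer.stem('Urteilen') both map to 'urteil'.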
-
- # Configure logging before loading so gensim's progress messages are visible
- logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
-
- print('loading model...')
- model = gensim.models.KeyedVectors.load_word2vec_format('german.model.big', binary=True)  # C binary format
- print('done')
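- # similar_by_word returns a list of (word, cosine similarity) tuples, most
- # similar first, e.g. (illustrative values; the actual neighbours depend on
- # 'german.model.big'):
- #   model.similar_by_word('Computer', topn=3)
- #   -> [('Rechner', 0.79), ('Laptop', 0.76), ('PC', 0.74)]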
-
-
- directoryIn = sys.argv[1]
-
-
- cwd = os.getcwd()
-
-
- rechtsprechIn = os.listdir(os.path.join(cwd, directoryIn))
- try:
-     hkl.dump(rechtsprechIn, 'rechtsprechIn_gzip.hkl', mode='w', compression='gzip')
- except MemoryError:
-     print('There was a MemoryError during the HDF5 dump of the directory listing')
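- # The listing can be restored later with hickle, e.g.:
- #   rechtsprechIn = hkl.load('rechtsprechIn_gzip.hkl')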
-
-
-
- counter = 0
- train = []
- print('expanding every document with stems and word2vec neighbours')
- for rechtsprech in rechtsprechIn:
-     train_spacy = []
-     with open(os.path.join(cwd, directoryIn, rechtsprech)) as Indok:
-         lines = []
-
-         for line in Indok:
-             if len(str(line)[:-1]) >= 3:
-
-                 #print(n)
-
-                 # 'word' is the whole cleaned line; similar_by_word below only
-                 # succeeds when the line reduces to a single in-vocabulary token.
-                 word = dok_to_words(str(line)[:-1])
-                 train_spacy.append(word)
-                 train_spacy.append(stemmer.stem(word))
-                 synonyms = []
-                 try:
-                     momo = model.similar_by_word(word, topn=9)
-
-                     for element in momo:
-                         synonyms.append(element[0])
-
-                     #print(synonyms)
-                     #print(type(synonyms))
-                     train_spacy += [stemmer.stem(synonym) for synonym in synonyms]
-                 except KeyError:
-                     print('the word ' + word + ' was not in the vocabulary')
-
-                 #print(synonyms , '\n')
-                 #print(word , '\n')
-
-                 #.append(word)
-
-                 #lines += [model.similar_by_word(str(line)[:-1])]
-
-     counter += 1
-     print(counter)
-     # Deduplicate the expanded tokens; order does not matter for the bag of words.
-     setofwords = set(train_spacy)
-     new_train_spacy = list(setofwords)
-
-     train.append(' '.join(new_train_spacy))
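-
- # 'train' now holds one space-separated string per document: the cleaned
- # tokens plus their stems and the stems of their top-9 word2vec neighbours,
- # deduplicated. A list of strings is exactly the input CountVectorizer expects.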
-
-
-
- #with open('30Out/data', 'a') as doc:
- # doc.write(str(train))
-
-
- #print('oi', train ,'oi ')
- #print(len(train))
- #print(len(train[1]))
-
- #momo = model.similar_by_word("Computer", topn=20)
- #twenty = []
- #for element in momo:
- #twenty.append(element[0])
- #print(twenty)
-
- #Initialize training data:
- #train = pd.read_csv(cwd + '/' + directoryTrans + 'Trans.txt', delimiter='\n', header=None, engine='python')
-
- #df = pd.DataFrame(train)
-
- #print(df)
- #print(df.shape)
-
-
-
-
-
- #num_doks = train.size
-
- #print(num_doks)
-
-
-
- # Print the raw comment and then the output of get_text(), for
- # comparison
-
-
-
-
-
- #print('first line', df[0].iloc[1])
-
-
-
- #print("Cleaning and parsing the training set comments...\n")
- #clean_train_doks = []
- #for i in range( 0, num_doks ):
- ## If the index is evenly divisible by 1000, print a message
- #if( (i+1)%1000 == 0 ):
- #print("comment %d of %d\n" % ( i+1, num_doks ))
- #clean_train_doks.append( dok_to_words( str(train[0].iloc[i] )))
-
-
- #print(clean_train_doks)
-
- print("Creating the bag of words...\n")
- from sklearn.feature_extraction.text import CountVectorizer
-
- # Initialize the "CountVectorizer" object, which is scikit-learn's
- # bag of words tool.
- vectorizer = CountVectorizer(analyzer="word",
-                              tokenizer=None,
-                              preprocessor=None,
-                              stop_words=None,
-                              max_features=20000)
-
-
- # fit_transform() does two things: first, it fits the model
- # and learns the vocabulary; second, it transforms our training data
- # into feature vectors. The input to fit_transform should be a list of
- # strings.
- train_data_features = vectorizer.fit_transform(train)
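-
- # A toy example of what fit_transform produces (illustrative, not run on the
- # real corpus): fitting on ['urteil gericht', 'gericht kosten kosten'] learns
- # the vocabulary {'gericht': 0, 'kosten': 1, 'urteil': 2} and returns the
- # sparse count matrix [[1, 0, 1], [1, 2, 0]].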
-
- import joblib  # sklearn.externals.joblib was removed in newer scikit-learn
-
- joblib.dump(vectorizer, 'bagofwords.pkl')
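-
- # Sketch of how the saved vectorizer could be reused later in a separate
- # inference script (assumes 'new_docs' is a list of preprocessed strings):
- #   vectorizer = joblib.load('bagofwords.pkl')
- #   new_features = vectorizer.transform(new_docs)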
-
-
- # Numpy arrays are easy to work with, so convert the result to an
- # array
- train_data_features = train_data_features.toarray()
-
- try:
-     hkl.dump(train_data_features, 'OnesZerosDB_gzip.hkl', mode='w', compression='gzip')
- except MemoryError:
-     print('There was a MemoryError during the HDF5 dump')
-
- try:
-     with open('OnesZerosDB.npy', 'wb') as file_numpy:
-         np.save(file_numpy, train_data_features)
- except Exception:
-     traceback.print_exc()
-     raise  # re-raise the exception
-
-
- try:
-     with open('OnesZerosDB.bin', 'wb') as file_pi:
-         cPickle.dump(train_data_features, file_pi)
- except MemoryError:
-     print('There was a MemoryError during the cPickle dump')
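-
- # Any of the three dumps can be used to get the matrix back later, e.g.:
- #   train_data_features = np.load('OnesZerosDB.npy')
- #   train_data_features = hkl.load('OnesZerosDB_gzip.hkl')
- #   train_data_features = cPickle.load(open('OnesZerosDB.bin', 'rb'))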
-
-
- #print(len(train_data_features))
- #for m in train_data_features[1]:
- #print(m)
|