# Import the pandas package, then use the "read_csv" function to read
# the labeled training data
import numpy as np
import pandas as pd
#from bs4 import BeautifulSoup
import re
import nltk

from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords  # Import the stop word list

from sklearn.linear_model import SGDClassifier
from sklearn import svm
import scipy
from sklearn import preprocessing

import sys
import os

# Get the data directories
directoryIn = sys.argv[1]
directoryTrans = sys.argv[2]
directoryOut = sys.argv[3]

cwd = os.getcwd()
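
# Assumed invocation (not specified in the script): the directory
# arguments are concatenated onto the working directory without
# os.path.join, so they need trailing slashes, e.g.
#
#   python preprocess.py data/in/ data/trans/ data/out/
#
# "preprocess.py" and the directory names here are placeholders.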

rechtsprechIn = os.listdir(cwd + '/' + directoryIn)

print('writing every document as one line in a text file')
for rechtsprech in rechtsprechIn:
    with open(cwd + '/' + directoryIn + rechtsprech) as Indok:
        with open(cwd + '/' + directoryTrans + 'Trans.txt', 'a') as Transdok:
            print(Indok)
            lines = []
            for line in Indok:
                # rstrip('\n') is safer than slicing off the last
                # character, which would also eat the final character
                # of a line with no trailing newline
                lines += [line.rstrip('\n')]
            print(lines)
            Transdok.write(' '.join(lines))
            #print([lin])
            #print([str(line)[:-1]])
            #print(lines)
            Transdok.write('\n')
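
# After this loop, Trans.txt holds one whitespace-joined document per
# line, so the training corpus can be loaded with one row per document.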

##########################################################################

# Initialize stemmer:
stemmer = SnowballStemmer("german")
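# Quick sanity check (example word is mine, not from the script):
# the German Snowball stemmer maps "laufen" to "lauf".
#print(stemmer.stem("laufen"))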

# Initialize training data:
train = pd.read_csv(cwd + '/' + directoryTrans + 'Trans.txt', delimiter='\n', header=None, engine='python')
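# Note: some pandas versions reject '\n' as a delimiter. A
# version-independent way to get one document per row (assuming the
# file is plain UTF-8 text) would be:
#
#   with open(cwd + '/' + directoryTrans + 'Trans.txt') as f:
#       train = pd.DataFrame(f.read().splitlines())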

print(train)
print(train.shape)

num_doks = train.size
print(num_doks)

# Helper that converts one raw document into a cleaned,
# space-separated string of words
def dok_to_words(raw_comment):
    # Function to convert a raw comment to a string of words
    # The input is a single string (a raw comment), and
    # the output is a single string (a preprocessed comment)
    #
    # 1. Remove HTML
    #comment_text = BeautifulSoup(raw_comment, "html.parser").get_text()
    #
    # 2. Remove non-letters (keep German umlauts and ß, both cases)
    letters_only = re.sub("[^a-zA-ZäöüÄÖÜß]", " ", raw_comment)
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()

    print('words', words)
    #
    # 4. In Python, searching a set is much faster than searching
    #    a list, so convert the stop words to a set
    #stops = set(stopwords.words("german"))
    #
    # 5. Remove stop words (currently disabled; all words are kept)
    #meaningful_words = [w for w in words if not w in stops]
    meaningful_words = [w for w in words]
    #
    # 6. Join the words back into one string separated by space,
    #    and return the result.
    return " ".join(meaningful_words)

print('first line', train[0].iloc[1])

print("Cleaning and parsing the training set comments...\n")
|
|
clean_train_doks = []
|
|
for i in range( 0, num_doks ):
|
|
# If the index is evenly divisible by 1000, print a message
|
|
if( (i+1)%1000 == 0 ):
|
|
print("comment %d of %d\n" % ( i+1, num_doks ))
|
|
clean_train_doks.append( dok_to_words( str(train[0].iloc[i] )))
|
|
|
|
|
|
print(clean_train_doks)

print("Creating the bag of words...\n")
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag-of-words tool.
vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words=None,
                             max_features=9000)
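
# max_features=9000 keeps only the 9000 most frequent terms across the
# corpus; rarer terms are dropped from the vocabulary.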

# fit_transform() does two things: first, it fits the model and learns
# the vocabulary; second, it transforms the training data into feature
# vectors. The input to fit_transform should be a list of strings.
print(clean_train_doks)

train_data_features = vectorizer.fit_transform(clean_train_doks)

# NumPy arrays are easy to work with, so convert the result to an
# array
train_data_features = train_data_features.toarray()

print(train_data_features)
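
# To inspect the learned vocabulary: get_feature_names_out() exists in
# scikit-learn >= 1.0; older releases use get_feature_names() instead.
#vocab = vectorizer.get_feature_names_out()
#print(vocab[:20])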

#print(len(train_data_features))
#for m in train_data_features[1]:
#    print(m)