# Import the pandas package; the labeled training data is loaded into
# a DataFrame further below
import numpy as np
import pandas as pd
#from bs4 import BeautifulSoup
import re
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords # Import the stop word list
from sklearn.linear_model import SGDClassifier
from sklearn import svm
import scipy
from sklearn import preprocessing
import sys
import os
# Get the data directories
directoryIn = sys.argv[1]
directoryTrans = sys.argv[2]
directoryOut = sys.argv[3]
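# Example invocation (script and directory names are placeholders):
#   python preprocess.py rechtsprechung/ trans/ out/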
cwd = os.getcwd()
rechtsprechIn = os.listdir(os.path.join(cwd, directoryIn))
print('Writing every document as one line in a text file')
for rechtsprech in rechtsprechIn:
    with open(os.path.join(cwd, directoryIn, rechtsprech)) as Indok:
        with open(os.path.join(cwd, directoryTrans, 'Trans.txt'), 'a') as Transdok:
            print('processing', rechtsprech)
            lines = []
            for line in Indok:
                lines.append(line.rstrip('\n'))
            print(lines)
            # Write the whole document as a single line, then a newline
            Transdok.write(' '.join(lines))
            Transdok.write('\n')
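# Sanity check (a sketch; assumes Trans.txt was empty before this run):
# the transcript should now contain one line per input document
with open(os.path.join(cwd, directoryTrans, 'Trans.txt')) as Transdok:
    assert sum(1 for _ in Transdok) == len(rechtsprechIn)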
##########################################################################
# Initialize stemmer:
stemmer = SnowballStemmer("german")
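# Note: the stemmer is initialized but not applied below; it could be run
# on each token during cleaning, e.g. (illustrative):
#   stemmer.stem('häuser')  # -> 'haus'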
# Initialize training data: load Trans.txt into a DataFrame,
# one document per line
with open(os.path.join(cwd, directoryTrans, 'Trans.txt')) as Transdok:
    train = pd.DataFrame([line.rstrip('\n') for line in Transdok])
print(train)
print(train.shape)
num_doks = train.size
print(num_doks)
# Define a helper that converts one raw document into a cleaned
# string of words
def dok_to_words(raw_comment):
    # Function to convert a raw comment to a string of words.
    # The input is a single string (a raw comment), and
    # the output is a single string (a preprocessed comment).
    #
    # 1. Remove HTML
    #comment_text = BeautifulSoup(raw_comment, "html.parser").get_text()
    #
    # 2. Remove non-letters (keep German umlauts and ß)
    letters_only = re.sub("[^a-zA-ZäöüÄÖÜß]", " ", raw_comment)
    #
    # 3. Convert to lower case and split into individual words
    words = letters_only.lower().split()
    print('words', words)
    #
    # 4. In Python, searching a set is much faster than searching
    #    a list, so convert the stop words to a set
    #stops = set(stopwords.words("german"))
    #
    # 5. Remove stop words (currently disabled; all words are kept)
    #meaningful_words = [w for w in words if not w in stops]
    meaningful_words = [w for w in words]
    #
    # 6. Join the words back into one string separated by spaces,
    #    and return the result
    return " ".join(meaningful_words)
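# Illustrative example of the cleaning step:
#   dok_to_words("Das Urteil, vom 3. Mai!")  # -> "das urteil vom mai"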
print('first line', train[0].iloc[0])
print("Cleaning and parsing the training set comments...\n")
clean_train_doks = []
for i in range(num_doks):
    # If the index is evenly divisible by 1000, print a progress message
    if (i + 1) % 1000 == 0:
        print("comment %d of %d\n" % (i + 1, num_doks))
    clean_train_doks.append(dok_to_words(str(train[0].iloc[i])))
print(clean_train_doks)
print("Creating the bag of words...\n")
from sklearn.feature_extraction.text import CountVectorizer
# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.
vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words=None,
                             max_features=9000)
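# Toy illustration of the bag-of-words idea (assumed behaviour):
#   CountVectorizer().fit_transform(["das urteil", "das gericht"])
# yields a 2 x 3 count matrix over the vocabulary
#   ["das", "gericht", "urteil"]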
# fit_transform() does two things: first, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of
# strings.
train_data_features = vectorizer.fit_transform(clean_train_doks)
# Numpy arrays are easy to work with, so convert the result to an
# array
train_data_features = train_data_features.toarray()
print(train_data_features)
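# Optional inspection, a sketch assuming scikit-learn >= 1.0 (older
# versions use get_feature_names() instead): print the total count of
# each vocabulary word across the corpus
vocab = vectorizer.get_feature_names_out()
dist = np.sum(train_data_features, axis=0)
for tag, count in zip(vocab, dist):
    print(count, tag)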