# Import the pandas package, then use the "read_csv" function to read
# the labeled training data
import numpy as np
import pandas as pd
#from bs4 import BeautifulSoup
import re
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords  # Import the stop word list
from sklearn.linear_model import SGDClassifier
from sklearn import svm
import scipy
from sklearn import preprocessing
import sys
import os
# Get the data directories
directoryIn = sys.argv[1]
directoryTrans = sys.argv[2]
directoryOut = sys.argv[3]
cwd = os.getcwd()
rechtsprechIn = os.listdir(cwd + '/' + directoryIn)
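# Expected invocation (hypothetical script name; the directory arguments
# must end with '/' because they are concatenated with file names below):
#   python preprocess.py in/ trans/ out/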
print('writing every document as one line in a textfile')
for rechtsprech in rechtsprechIn:
    with open(cwd + '/' + directoryIn + rechtsprech) as Indok:
        with open(cwd + '/' + directoryTrans + 'Trans.txt', 'a') as Transdok:
            print(Indok)
            lines = []
            for line in Indok:
                lines.append(line.rstrip('\n'))  # drop the trailing newline
            print(lines)
            Transdok.write(' '.join(lines))
            #print([lin])
            #print([str(line)[:-1]])
            #print(lines)
            Transdok.write('\n')
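# Example: an input file containing the two lines "Im Namen" and
# "des Volkes" is appended to Trans.txt as the single line
# "Im Namen des Volkes".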
##########################################################################
# Initialize stemmer:
stemmer = SnowballStemmer("german")
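# The stemmer is not applied in this excerpt; for illustration, the German
# Snowball stemmer reduces inflected forms to a common stem, e.g.
# stemmer.stem("laufen") should return "lauf".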
# Initialize training data:
train = pd.read_csv(cwd + '/' + directoryTrans + 'Trans.txt',
                    delimiter='\n', header=None, engine='python')
# Note: newer pandas versions may reject '\n' as a delimiter; reading the
# file line by line with open() would be a more robust alternative.
print(train)
print(train.shape)
num_doks = train.size
print(num_doks)
# Convert a raw document to a cleaned string of words
def dok_to_words(raw_comment):
    # Function to convert a raw comment to a string of words
    # The input is a single string (a raw comment), and
    # the output is a single string (a preprocessed comment)
    #
    # 1. Remove HTML
    #comment_text = BeautifulSoup(raw_comment, "html.parser").get_text()
    #
    # 2. Remove non-letters (keep the German umlauts and ß as well)
    letters_only = re.sub("[^a-zA-ZäöüÄÖÜß]", " ", raw_comment)
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()
    print('words', words)
    #
    # 4. In Python, searching a set is much faster than searching
    # a list, so convert the stop words to a set
    #stops = set(stopwords.words("german"))
    #
    # 5. Remove stop words
    #meaningful_words = [w for w in words if not w in stops]
    meaningful_words = words  # stop-word removal is disabled for now
    #
    # 6. Join the words back into one string separated by space,
    # and return the result.
    return " ".join(meaningful_words)
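# Example call (hypothetical input):
#   dok_to_words("Das Urteil, vom 12.03.") returns "das urteil vom",
#   since digits and punctuation are replaced by spaces before tokenizing.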
print('first line', train[0].iloc[1])
print("Cleaning and parsing the training set comments...\n")
clean_train_doks = []
for i in range(0, num_doks):
    # If the index is evenly divisible by 1000, print a progress message
    if (i + 1) % 1000 == 0:
        print("comment %d of %d\n" % (i + 1, num_doks))
    clean_train_doks.append(dok_to_words(str(train[0].iloc[i])))
print(clean_train_doks)
  82. print("Creating the bag of words...\n")
  83. from sklearn.feature_extraction.text import CountVectorizer
  84. # Initialize the "CountVectorizer" object, which is scikit-learn's
  85. # bag of words tool.
  86. vectorizer = CountVectorizer(analyzer = "word", \
  87. tokenizer = None, \
  88. preprocessor = None, \
  89. stop_words = None, \
  90. max_features = 9000)
# fit_transform() does two things: first, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of
# strings.
print(clean_train_doks)
train_data_features = vectorizer.fit_transform(clean_train_doks)

# Numpy arrays are easy to work with, so convert the result to an
# array
train_data_features = train_data_features.toarray()
print(train_data_features)
#print(len(train_data_features))
#for m in train_data_features[1]:
#    print(m)
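##########################################################################
# For illustration only: a minimal sketch of how these bag-of-words
# features could feed the SGDClassifier imported above. The labels below
# are hypothetical placeholders, not the real training labels:
#
#   labels = np.random.randint(0, 2, size=num_doks)  # fake binary labels
#   clf = SGDClassifier(loss="hinge")   # a linear SVM trained with SGD
#   clf.fit(train_data_features, labels)
#   print(clf.predict(train_data_features[:5]))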