# Import the pandas package; the labeled training data is loaded into
# a DataFrame further below
import numpy as np
import pandas as pd
#from bs4 import BeautifulSoup
import re
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords # Import the stop word list
from sklearn.linear_model import SGDClassifier
from sklearn import svm
import scipy
from sklearn import preprocessing
import sys
import os
# Get the data directories
directoryIn = sys.argv[1]
directoryTrans = sys.argv[2]
directoryOut = sys.argv[3]
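# Example invocation (script and directory names are placeholders):
#   python preprocess.py rechtsprechung/ trans/ out/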
cwd = os.getcwd()
rechtsprechIn = os.listdir(os.path.join(cwd, directoryIn))
print('Writing every document as one line in a text file')
for rechtsprech in rechtsprechIn:
    with open(os.path.join(cwd, directoryIn, rechtsprech)) as Indok:
        with open(os.path.join(cwd, directoryTrans, 'Trans.txt'), 'a') as Transdok:
            print('processing', rechtsprech)
            lines = []
            for line in Indok:
                lines.append(line.rstrip('\n'))
            print(lines)
            # Write the whole document as a single line, then a newline
            Transdok.write(' '.join(lines))
            Transdok.write('\n')
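# Sanity check (a sketch; assumes Trans.txt was empty before this run):
# the transcript should now contain one line per input document
with open(os.path.join(cwd, directoryTrans, 'Trans.txt')) as Transdok:
    assert sum(1 for _ in Transdok) == len(rechtsprechIn)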
##########################################################################
# Initialize stemmer:
stemmer = SnowballStemmer("german")
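# Note: the stemmer is initialized but not applied below; it could be run
# on each token during cleaning, e.g. (illustrative):
#   stemmer.stem('häuser')  # -> 'haus'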
# Initialize training data: load Trans.txt into a DataFrame,
# one document per line
with open(os.path.join(cwd, directoryTrans, 'Trans.txt')) as Transdok:
    train = pd.DataFrame([line.rstrip('\n') for line in Transdok])
print(train)
print(train.shape)
num_doks = train.size
print(num_doks)
# Define a helper that converts one raw document into a cleaned
# string of words
def dok_to_words(raw_comment):
    # Function to convert a raw comment to a string of words.
    # The input is a single string (a raw comment), and
    # the output is a single string (a preprocessed comment).
    #
    # 1. Remove HTML
    #comment_text = BeautifulSoup(raw_comment, "html.parser").get_text()
    #
    # 2. Remove non-letters (keep German umlauts and ß)
    letters_only = re.sub("[^a-zA-ZäöüÄÖÜß]", " ", raw_comment)
    #
    # 3. Convert to lower case and split into individual words
    words = letters_only.lower().split()
    print('words', words)
    #
    # 4. In Python, searching a set is much faster than searching
    #    a list, so convert the stop words to a set
    #stops = set(stopwords.words("german"))
    #
    # 5. Remove stop words (currently disabled; all words are kept)
    #meaningful_words = [w for w in words if not w in stops]
    meaningful_words = [w for w in words]
    #
    # 6. Join the words back into one string separated by spaces,
    #    and return the result
    return " ".join(meaningful_words)
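# Illustrative example of the cleaning step:
#   dok_to_words("Das Urteil, vom 3. Mai!")  # -> "das urteil vom mai"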
print('first line', train[0].iloc[0])
print("Cleaning and parsing the training set comments...\n")
clean_train_doks = []
for i in range(num_doks):
    # If the index is evenly divisible by 1000, print a progress message
    if (i + 1) % 1000 == 0:
        print("comment %d of %d\n" % (i + 1, num_doks))
    clean_train_doks.append(dok_to_words(str(train[0].iloc[i])))
print(clean_train_doks)
print("Creating the bag of words...\n")
from sklearn.feature_extraction.text import CountVectorizer
# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.
vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words=None,
                             max_features=9000)
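# Toy illustration of the bag-of-words idea (assumed behaviour):
#   CountVectorizer().fit_transform(["das urteil", "das gericht"])
# yields a 2 x 3 count matrix over the vocabulary
#   ["das", "gericht", "urteil"]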
# fit_transform() does two things: first, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of
# strings.
train_data_features = vectorizer.fit_transform(clean_train_doks)
# Numpy arrays are easy to work with, so convert the result to an
# array
train_data_features = train_data_features.toarray()
print(train_data_features)
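# Optional inspection, a sketch assuming scikit-learn >= 1.0 (older
# versions use get_feature_names() instead): print the total count of
# each vocabulary word across the corpus
vocab = vectorizer.get_feature_names_out()
dist = np.sum(train_data_features, axis=0)
for tag, count in zip(vocab, dist):
    print(count, tag)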