|
|
- # Import the pandas package, then use the "read_csv" function to read
- # the labeled training data
-
- import numpy as np
- import pandas as pd
- #from bs4 import BeautifulSoup
- import re
- import nltk
-
- from nltk.stem.snowball import SnowballStemmer
- from nltk.corpus import stopwords # Import the stop word list
-
-
- from sklearn.linear_model import SGDClassifier
- from sklearn import svm
- import scipy
- from sklearn import preprocessing
-
- import sys
- import os
-
- # Get the data directories
-
- directoryIn = sys.argv[1]
- directoryTrans = sys.argv[2]
- directoryOut = sys.argv[3]
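- # Example invocation (the script and directory names below are placeholders):
- #   python preprocess.py rechtsprechung_in/ trans/ out/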
-
- cwd = os.getcwd()
-
- rechtsprechIn = os.listdir(cwd + '/' + directoryIn)
-
- print('writing every document as one line in a text file')
- for rechtsprech in rechtsprechIn:
-
-     with open(os.path.join(cwd, directoryIn, rechtsprech)) as Indok:
-         with open(os.path.join(cwd, directoryTrans, 'Trans.txt'), 'a') as Transdok:
-             print('processing', rechtsprech)
-             lines = []
-             for line in Indok:
-                 # strip the trailing newline from every line of the document
-                 lines += [line.rstrip('\n')]
-             print(lines)
-             Transdok.write(' '.join(lines))
-             Transdok.write('\n')
-
-
-
-
-
-
- ##########################################################################
-
-
- # Initialize stemmer:
- stemmer = SnowballStemmer("german")
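- # Note: the stemmer is initialized here but never applied in this section.
- # A minimal sketch (the sample words are made up) of how it could be used
- # on a list of tokens:
- print([stemmer.stem(w) for w in ["dokumente", "gerichte", "urteile"]])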
-
-
- # Initialize training data: every line of Trans.txt becomes one row/document
- train = pd.read_csv(os.path.join(cwd, directoryTrans, 'Trans.txt'), delimiter='\n', header=None, engine='python')
-
- print(train)
- print(train.shape)
-
-
-
-
-
- num_doks = train.size
-
- print(num_doks)
-
-
-
- # Helper that turns one raw document into a cleaned, space-separated
- # string of lower-case words
-
- def dok_to_words(raw_comment):
-     # Function to convert a raw comment to a string of words
-     # The input is a single string (a raw comment), and
-     # the output is a single string (a preprocessed comment)
-     #
-     # 1. Remove HTML
-     #comment_text = BeautifulSoup(raw_comment, "html.parser").get_text()
-     #
-     # 2. Remove non-letters (keep German umlauts and ß as well)
-     letters_only = re.sub("[^a-zA-ZäöüÄÖÜß]", " ", raw_comment)
-     #
-     # 3. Convert to lower case, split into individual words
-     words = letters_only.lower().split()
-
-     print('words', words)
-     #
-     # 4. In Python, searching a set is much faster than searching
-     #    a list, so convert the stop words to a set
-     #stops = set(stopwords.words("german"))
-     #
-     # 5. Remove stop words (currently disabled, so all words are kept)
-     #meaningful_words = [w for w in words if not w in stops]
-     meaningful_words = [w for w in words]
-     #
-     # 6. Join the words back into one string separated by space,
-     #    and return the result.
-     return " ".join(meaningful_words)
-
-
-
-
- print('first row', train[0].iloc[0])
-
-
-
- print("Cleaning and parsing the training set comments...\n")
- clean_train_doks = []
- for i in range(0, num_doks):
-     # If the index is evenly divisible by 1000, print a message
-     if (i+1) % 1000 == 0:
-         print("comment %d of %d\n" % (i+1, num_doks))
-     clean_train_doks.append(dok_to_words(str(train[0].iloc[i])))
-
-
- print(clean_train_doks)
-
- print("Creating the bag of words...\n")
- from sklearn.feature_extraction.text import CountVectorizer
-
- # Initialize the "CountVectorizer" object, which is scikit-learn's
- # bag of words tool.
- vectorizer = CountVectorizer(analyzer="word",
-                              tokenizer=None,
-                              preprocessor=None,
-                              stop_words=None,
-                              max_features=9000)
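- # Hedged toy illustration, separate from the real data, of what the bag of
- # words produces: each row is one document, each column the count of one
- # vocabulary word (the toy_* names and the two mini documents are made up).
- toy_vec = CountVectorizer()
- toy_counts = toy_vec.fit_transform(["das gericht", "das urteil"]).toarray()
- print(sorted(toy_vec.vocabulary_))   # ['das', 'gericht', 'urteil']
- print(toy_counts)                    # [[1 1 0], [1 0 1]]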
-
-
- # fit_transform() does two things: first, it fits the model and learns the
- # vocabulary; second, it transforms our training data into feature vectors.
- # The input to fit_transform should be a list of strings.
-
- train_data_features = vectorizer.fit_transform(clean_train_doks)
-
- # Numpy arrays are easy to work with, so convert the result to an
- # array
- train_data_features = train_data_features.toarray()
-
- print(train_data_features)
- #print(len(train_data_features))
- #for m in train_data_features[1]:
- #    print(m)
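- # A hedged sketch for inspecting the learned vocabulary and the total count
- # of each word across the corpus; get_feature_names_out() needs
- # scikit-learn >= 1.0 (older versions use get_feature_names() instead), and
- # the variable names vocab/dist are made up for this sketch.
- vocab = vectorizer.get_feature_names_out()
- dist = np.sum(train_data_features, axis=0)
- for count, word in sorted(zip(dist, vocab), reverse=True)[:20]:
-     print(count, word)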
|