# Import the pandas package, then use the "read_csv" function to read
# the labeled training data
import numpy as np
import pandas as pd
#from bs4 import BeautifulSoup
import re
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords  # Import the stop word list
from sklearn.linear_model import SGDClassifier
from sklearn import svm
import scipy
from sklearn import preprocessing
import sys
import os
# Get the data directories
directoryIn = sys.argv[1]
directoryTrans = sys.argv[2]
directoryOut = sys.argv[3]
cwd = os.getcwd()
rechtsprechIn = os.listdir(cwd + '/' + directoryIn)
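# Expected invocation (hypothetical script name; the directory arguments
# must end with '/' because they are concatenated with file names below):
#   python preprocess.py in/ trans/ out/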
print('writing every document as one line in a textfile')
for rechtsprech in rechtsprechIn:
    with open(cwd + '/' + directoryIn + rechtsprech) as Indok:
        with open(cwd + '/' + directoryTrans + 'Trans.txt', 'a') as Transdok:
            print(Indok)
            lines = []
            for line in Indok:
                lines.append(line.rstrip('\n'))  # drop the trailing newline
            print(lines)
            Transdok.write(' '.join(lines))
            #print([lin])
            #print([str(line)[:-1]])
            #print(lines)
            Transdok.write('\n')
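# Example: an input file containing the two lines "Im Namen" and
# "des Volkes" is appended to Trans.txt as the single line
# "Im Namen des Volkes".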
##########################################################################
# Initialize stemmer:
stemmer = SnowballStemmer("german")
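# The stemmer is not applied in this excerpt; for illustration, the German
# Snowball stemmer reduces inflected forms to a common stem, e.g.
# stemmer.stem("laufen") should return "lauf".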
# Initialize training data:
train = pd.read_csv(cwd + '/' + directoryTrans + 'Trans.txt',
                    delimiter='\n', header=None, engine='python')
# Note: newer pandas versions may reject '\n' as a delimiter; reading the
# file line by line with open() would be a more robust alternative.
print(train)
print(train.shape)
num_doks = train.size
print(num_doks)
# Convert a raw document to a cleaned string of words
def dok_to_words(raw_comment):
    # Function to convert a raw comment to a string of words
    # The input is a single string (a raw comment), and
    # the output is a single string (a preprocessed comment)
    #
    # 1. Remove HTML
    #comment_text = BeautifulSoup(raw_comment, "html.parser").get_text()
    #
    # 2. Remove non-letters (keep the German umlauts and ß as well)
    letters_only = re.sub("[^a-zA-ZäöüÄÖÜß]", " ", raw_comment)
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()
    print('words', words)
    #
    # 4. In Python, searching a set is much faster than searching
    # a list, so convert the stop words to a set
    #stops = set(stopwords.words("german"))
    #
    # 5. Remove stop words
    #meaningful_words = [w for w in words if not w in stops]
    meaningful_words = words  # stop-word removal is disabled for now
    #
    # 6. Join the words back into one string separated by space,
    # and return the result.
    return " ".join(meaningful_words)
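# Example call (hypothetical input):
#   dok_to_words("Das Urteil, vom 12.03.") returns "das urteil vom",
#   since digits and punctuation are replaced by spaces before tokenizing.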
print('first line', train[0].iloc[1])
print("Cleaning and parsing the training set comments...\n")
clean_train_doks = []
for i in range(0, num_doks):
    # If the index is evenly divisible by 1000, print a progress message
    if (i + 1) % 1000 == 0:
        print("comment %d of %d\n" % (i + 1, num_doks))
    clean_train_doks.append(dok_to_words(str(train[0].iloc[i])))
print(clean_train_doks)
  82. print("Creating the bag of words...\n")
  83. from sklearn.feature_extraction.text import CountVectorizer
  84. # Initialize the "CountVectorizer" object, which is scikit-learn's
  85. # bag of words tool.
  86. vectorizer = CountVectorizer(analyzer = "word", \
  87. tokenizer = None, \
  88. preprocessor = None, \
  89. stop_words = None, \
  90. max_features = 9000)
# fit_transform() does two things: first, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of
# strings.
print(clean_train_doks)
train_data_features = vectorizer.fit_transform(clean_train_doks)

# Numpy arrays are easy to work with, so convert the result to an
# array
train_data_features = train_data_features.toarray()
print(train_data_features)
#print(len(train_data_features))
#for m in train_data_features[1]:
#    print(m)
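##########################################################################
# For illustration only: a minimal sketch of how these bag-of-words
# features could feed the SGDClassifier imported above. The labels below
# are hypothetical placeholders, not the real training labels:
#
#   labels = np.random.randint(0, 2, size=num_doks)  # fake binary labels
#   clf = SGDClassifier(loss="hinge")   # a linear SVM trained with SGD
#   clf.fit(train_data_features, labels)
#   print(clf.predict(train_data_features[:5]))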