# Prototype user interface: match a free-text problem description against a
# precomputed bag-of-words document database and display the best-matching documents.
from sklearn.externals import joblib  # in newer scikit-learn versions, use `import joblib` instead
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import scipy as sc
import tensorflow as tf
import _pickle as cPickle
import hickle as hkl
import os
# Convert a scipy CSR matrix to a TensorFlow sparse tensor so the matmul can run on the GPU
def convert_sparse_matrix_to_sparse_tensor(X):
    coo = sc.sparse.coo_matrix(X)
    indices = np.mat([coo.row, coo.col]).transpose()
    return tf.SparseTensorValue(indices, coo.data, coo.shape)
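# Note: this script targets the TensorFlow 1.x API (tf.Session, tf.SparseTensorValue,
# tf.sparse_tensor_dense_matmul); under TensorFlow 2.x the equivalents would be
# tf.sparse.SparseTensor and tf.sparse.sparse_dense_matmul in eager mode.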
# Load the zeros-and-ones bag-of-words matrix from the database
dbOZ = hkl.load('bagofwords/OnesZerosDB_gzip.hkl')
print(dbOZ.shape)  # report the matrix size (len() is ambiguous for sparse matrices)
# Cast to float32 (CSR astype) so the dtype matches the user input vector
dbOZ = dbOZ.astype('float32')
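# The matrix is assumed to be stored as (number of documents x vocabulary size);
# if it were saved transposed, it would need dbOZ = dbOZ.transpose() here before
# being converted to a sparse tensor below.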
# Get the user input
user_input_words = input("Please describe your problem: ")
user_input_n = int(input("How many documents would you like to display?: "))
# Convert the user input to zeros and ones
user_array = []
user_array.append(user_input_words)
print(user_array)
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("german")
# Stem every word of the input individually; stemming the whole sentence as a
# single token would leave most words unstemmed
user_array = [" ".join(stemmer.stem(word) for word in entry.split()) for entry in user_array]
print(user_array)
# Load the fitted bag-of-words vectorizer and transform the stemmed input into a word-count vector
vectorizer = joblib.load('bagofwords/bagofwords.pkl')
user_input_OnesZeros = vectorizer.transform(user_array)
uOZ = user_input_OnesZeros.toarray()[0].astype(np.float32, copy=False)
# Reshape the user vector into a column vector (vocabulary size x 1)
uiOZ = uOZ[np.newaxis, :]
uiOZ = uiOZ.transpose()
# Calculate the best-matching documents, parallelized with TensorFlow
sess = tf.Session()
with sess.as_default():
    uiOZ_tensor = tf.constant(uiOZ)
    dbOZ_tensor_sparse = convert_sparse_matrix_to_sparse_tensor(dbOZ)
    # Sparse (documents x vocabulary) times dense (vocabulary x 1) yields one
    # word-overlap score per document
    wordCountDoku = tf.sparse_tensor_dense_matmul(dbOZ_tensor_sparse, uiOZ_tensor)
    wCD = np.array(wordCountDoku.eval())
# Pair each document index with its score, then sort by score in descending order
indexedwCD = []
for n in range(len(wCD)):
    indexedwCD.append([n, wCD[n][0]])
indexedwCD = sorted(indexedwCD, key=lambda tup: tup[1], reverse=True)
# Get the ids of the documents that fit best
best_n_documents = []
for n in range(user_input_n):
    best_n_documents.append(indexedwCD[n][0])
cwd = os.getcwd()
# Load the list of document file names corresponding to the rows of the database matrix
rechtsprechIn = hkl.load('bagofwords/rechtsprechIn_gzip.hkl')
# Display the n best-matching documents
from subprocess import call
for n in range(user_input_n):
    call(['nano', cwd + '/' + 'EndDokumente/' + rechtsprechIn[int(best_n_documents[n])]])
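# Opening the files with nano requires an interactive terminal; another pager or
# viewer (e.g. 'less') could be substituted in the call above.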