# Prototype user interface: find the documents that best match a
# free-text problem description, using a bag-of-words index and
# TensorFlow sparse matrix multiplication
from sklearn.externals import joblib
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.snowball import SnowballStemmer
from subprocess import call
import numpy as np
import scipy as sc
import scipy.sparse  # 'import scipy' alone does not expose scipy.sparse
import tensorflow as tf
import hickle as hkl
import os


# Convert a scipy CSR matrix to a tf.SparseTensor so the matmul can run
# on the GPU (tf.SparseTensor rather than tf.SparseTensorValue, which is
# only meant for feed_dict values)
def convert_sparse_matrix_to_sparse_tensor(X):
    coo = sc.sparse.coo_matrix(X)
    indices = np.vstack([coo.row, coo.col]).transpose().astype(np.int64)
    return tf.SparseTensor(indices, coo.data, coo.shape)


# Load the ones/zeros document-term matrix from the database
dbOZ = hkl.load('bagofwords/OnesZerosDB_gzip.hkl')
print(len(dbOZ))
dbOZ = dbOZ.astype('float32')

# Get the user input
user_input_words = input("Please describe your problem: ")
user_input_n = int(input("How many documents would you like to display?: "))

# Convert the user input to ones and zeros: stem each word individually
# (stemming the whole sentence as one string would not stem every word),
# rejoin into a single document, and vectorize with the stored vocabulary
stemmer = SnowballStemmer("german")
user_array = [" ".join(stemmer.stem(word) for word in user_input_words.split())]
print(user_array)
vectorizer = joblib.load('bagofwords/bagofwords.pkl')
user_input_OnesZeros = vectorizer.transform(user_array)
uOZ = user_input_OnesZeros.toarray()[0].astype(np.float32, copy=False)
uiOZ = uOZ[np.newaxis, :].transpose()  # column vector, shape (vocabulary, 1)

# Calculate the best matching documents, parallelized with TensorFlow:
# multiplying the sparse document-term matrix by the query column vector
# gives, per document, the number of query terms it contains
sess = tf.Session()
with sess.as_default():
    uiOZ_tensor = tf.constant(uiOZ)
    dbOZ_tensor_sparse = convert_sparse_matrix_to_sparse_tensor(dbOZ)
    wordCountDoku = tf.sparse_tensor_dense_matmul(dbOZ_tensor_sparse, uiOZ_tensor)
    wCD = np.array(wordCountDoku.eval())

# Get the ids of the documents which fit best: pair each score with its
# row index and sort by score, highest first
indexedwCD = [[n, wCD[n][0]] for n in range(len(wCD))]
indexedwCD = sorted(indexedwCD, key=lambda tup: tup[1], reverse=True)

best_n_documents = [indexedwCD[n][0]
                    for n in range(min(user_input_n, len(indexedwCD)))]

# Display the n best matching documents in an editor
cwd = os.getcwd()
rechtsprechIn = hkl.load('bagofwords/rechtsprechIn_gzip.hkl')  # filename list
for n in range(len(best_n_documents)):
    call(['nano', os.path.join(cwd, 'EndDokumente',
                               rechtsprechIn[int(best_n_documents[n])])])
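

# A minimal self-check of the ranking step (illustrative sketch, not
# called anywhere above; the function name is hypothetical): three tiny
# "documents" over a four-term vocabulary are scored against a query
# containing terms 0 and 2, so document 0 should score 2 and the other
# two should score 1 each.
def _toy_ranking_demo():
    docs = sc.sparse.csr_matrix(np.array([[1, 0, 1, 0],
                                          [0, 1, 1, 1],
                                          [1, 1, 0, 0]], dtype=np.float32))
    query = np.array([[1.0], [0.0], [1.0], [0.0]], dtype=np.float32)
    with tf.Session().as_default():
        scores = tf.sparse_tensor_dense_matmul(
            convert_sparse_matrix_to_sparse_tensor(docs),
            tf.constant(query)).eval()
    print(scores.ravel())  # expected: [2. 1. 1.]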
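

# How 'bagofwords/bagofwords.pkl' and the two hickle files were produced
# is not part of this script; the sketch below is an assumption (the
# function name, paths, and the premise that hickle can serialize the
# CSR matrix are all unverified), shown only to document what the loads
# above expect.  The real build step presumably also stemmed the
# documents with the same German SnowballStemmer before vectorizing.
def _build_index_sketch(doc_dir='EndDokumente'):
    filenames = sorted(os.listdir(doc_dir))
    texts = []
    for name in filenames:
        with open(os.path.join(doc_dir, name), encoding='utf-8') as f:
            texts.append(f.read())
    vec = CountVectorizer(binary=True)   # binary=True -> ones/zeros matrix
    matrix = vec.fit_transform(texts)    # documents x vocabulary, CSR
    joblib.dump(vec, 'bagofwords/bagofwords.pkl')
    hkl.dump(matrix, 'bagofwords/OnesZerosDB_gzip.hkl',
             mode='w', compression='gzip')
    hkl.dump(filenames, 'bagofwords/rechtsprechIn_gzip.hkl',
             mode='w', compression='gzip')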