"""FASTsearch: bag-of-words document search over an in-memory database.

Every DB is represented as arrays held fully in memory, so all documents
are accessible at the same moment.  A scikit-learn CountVectorizer
(trained on the database of lines or docs) turns a query into a BoW
vector, and TensorFlow performs the sparse matrix product on the
available device.

TODO: GPU multithreading has to be implemented.
USAGE: learn a scikit-learn CountVectorizer on a database of lines/docs.
"""

import numpy as np
import scipy as sc
import tensorflow as tf  # TF1-style API (tf.Session, tf.SparseTensorValue)
import _pickle as cPickle
import hickle as hkl
import os

try:
    # sklearn.externals.joblib was removed in scikit-learn 0.23;
    # fall back to the standalone joblib package on newer installs.
    from sklearn.externals import joblib
except ImportError:
    import joblib
from sklearn.feature_extraction.text import CountVectorizer


def convert_sparse_matrix_to_sparse_tensor(X):
    """Convert a scipy-compatible matrix to a tf.SparseTensorValue.

    Goes through COO format so the (row, col) index pairs and the data
    vector can be handed to TensorFlow directly (for working on GPU).

    X: dense array or scipy sparse matrix.
    Returns a tf.SparseTensorValue with the same shape and contents.
    """
    coo = sc.sparse.coo_matrix(X)
    # np.mat is deprecated; build the (nnz, 2) index array directly.
    indices = np.array([coo.row, coo.col]).transpose()
    return tf.SparseTensorValue(indices, coo.data, coo.shape)


class FASTsearch(object):
    """Bag-of-words best-match document search backed by TensorFlow."""

    def __init__(self, DatabaseDir, BoWModelDir):
        """Load the document-term matrix and the fitted vectorizer.

        DatabaseDir: path to an hkl (hickle) file holding the BoW
            matrix, one row per document.
        BoWModelDir: path to a pkl (joblib) dump of a fitted
            CountVectorizer.
        """
        # input has to be hkl format
        self.database = hkl.load(DatabaseDir).astype('float32')
        # input has to be pkl format
        self.vectorizer = joblib.load(BoWModelDir)

    def search(self, string, numberofmatches):
        """Return indices of the documents best matching the query.

        string: the query text.
        numberofmatches: how many document indices to return (capped
            at the number of documents, so over-asking cannot raise).
        Returns a list of row indices into the database, ranked by
        descending shared-word count.
        """
        # Convert user input to the same zeros-and-ones BoW space
        # as the database.
        user_input_OnesZeros = self.vectorizer.transform([string])
        uOZ = user_input_OnesZeros.toarray()[0].astype(np.float32, copy=False)
        # Column vector (n_terms, 1) for the matrix product below.
        uiOZ = uOZ[np.newaxis, :].transpose()

        # BUG FIX: the original passed an undefined name `dbOZ` to the
        # converter (NameError on every call); the database loaded in
        # __init__ is the intended operand.
        # `with` also guarantees the session is closed (no leak).
        with tf.Session() as sess:
            uiOZ_tensor = tf.constant(uiOZ)
            dbOZ_tensor_sparse = convert_sparse_matrix_to_sparse_tensor(
                self.database)
            # (n_docs, n_terms) x (n_terms, 1) -> shared-word count
            # per document.
            wordCountDoku = tf.sparse_tensor_dense_matmul(
                dbOZ_tensor_sparse, uiOZ_tensor)
            wCD = np.array(sess.run(wordCountDoku))

        # Rank documents by descending match count.  (The original's
        # pre-sort [::-1] reversal only scrambled tie order.)
        scored = [(n, wCD[n][0]) for n in range(len(wCD))]
        scored.sort(key=lambda tup: tup[1], reverse=True)

        numberofmatches = min(numberofmatches, len(scored))
        return [scored[n][0] for n in range(numberofmatches)]