commit da193f88897426270bc6e1a5ad909150bd1b7fa8
Author: alpcentaur
Date:   Thu Aug 27 21:05:52 2020 +0200

    First commit of FASTsearch

diff --git a/FASTsearch.py b/FASTsearch.py
new file mode 100644
index 0000000..4858915
--- /dev/null
+++ b/FASTsearch.py
@@ -0,0 +1,69 @@
+
+# FASTsearch: every database can be represented as lists. The whole corpus is
+# held in memory, so all documents are accessible at the same moment.
+
+# TODO: GPU multithreading has to be implemented.
+
+# USAGE: train a scikit-learn CountVectorizer on a database of lines or docs,
+# save the vectorized corpus in hkl format and the fitted vectorizer in pkl
+# format, then query with FASTsearch.search().
+
+import joblib
+from sklearn.feature_extraction.text import CountVectorizer
+
+import numpy as np
+import scipy as sc
+import scipy.sparse
+
+import tensorflow as tf
+
+import hickle as hkl
+
+
+# Convert a scipy sparse (or dense) matrix to a TF sparse tensor so the
+# matrix product below can run on the GPU.
+def convert_sparse_matrix_to_sparse_tensor(X):
+    coo = sc.sparse.coo_matrix(X)
+    indices = np.mat([coo.row, coo.col]).transpose()
+    return tf.SparseTensorValue(indices, coo.data, coo.shape)
+
+
+class FASTsearch(object):
+
+    def __init__(self, DatabaseDir, BoWModelDir):
+
+        # input has to be in hkl format: the bag-of-words matrix of the corpus
+        self.database = hkl.load(DatabaseDir).astype('float32')
+
+        # input has to be in pkl format: the fitted CountVectorizer
+        self.vectorizer = joblib.load(BoWModelDir)
+
+    def search(self, string, numberofmatches):
+
+        # convert the user input to its ones-and-zeros bag-of-words vector
+        user_input_OnesZeros = self.vectorizer.transform([string])
+        uOZ = user_input_OnesZeros.toarray()[0].astype(np.float32, copy=False)
+
+        # shape it as a column vector for the matrix product
+        uiOZ = uOZ[np.newaxis, :].transpose()
+
+        with tf.Session() as sess:
+
+            uiOZ_tensor = tf.constant(uiOZ)
+            dbOZ_tensor_sparse = convert_sparse_matrix_to_sparse_tensor(self.database)
+
+            # one sparse matrix-vector product scores every document at once:
+            # entry n counts how many query words occur in document n
+            wordCountDoku = tf.sparse_tensor_dense_matmul(dbOZ_tensor_sparse, uiOZ_tensor)
+            wCD = np.array(wordCountDoku.eval(session=sess))
+
+        # pair every document index with its match count
+        indexedwCD = [[n, wCD[n][0]] for n in range(len(wCD))]
+
+        # sort by match count, best document first
+        indexedwCD = sorted(indexedwCD, key=lambda tup: tup[1], reverse=True)
+
+        # return the indices of the n best matching documents
+        best_n_documents = [indexedwCD[n][0] for n in range(numberofmatches)]
+
+        return best_n_documents
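
The commit itself does not include the preprocessing step that produces the two input files. A minimal sketch of how they could be built, assuming one document per line of a text file; the names corpus.txt, bow_data.hkl and bow_model.pkl are placeholders, and binary=True is an assumption matching the ones-and-zeros wording in the comments:

# Sketch (not part of the commit): build the two inputs FASTsearch expects.
import joblib
import hickle as hkl
from sklearn.feature_extraction.text import CountVectorizer

with open('corpus.txt') as f:                    # placeholder corpus file
    documents = f.read().splitlines()            # one document per line

vectorizer = CountVectorizer(binary=True)        # ones/zeros bag of words
bow_matrix = vectorizer.fit_transform(documents)

hkl.dump(bow_matrix.toarray(), 'bow_data.hkl')   # database in hkl format
joblib.dump(vectorizer, 'bow_model.pkl')         # fitted vectorizer in pkl format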
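
Querying the resulting index would then look like the sketch below; search() returns row indices into the database, so the caller maps them back to the texts:

# Sketch (not part of the commit): query the index built above.
fs = FASTsearch('bow_data.hkl', 'bow_model.pkl')
best = fs.search('some query words', 5)          # indices of the 5 best matches
print([documents[i] for i in best])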