First commit of FASTsearch
commit da193f8889
1 changed file with 97 additions and 0 deletions
FASTsearch.py (new file, 97 lines)
@@ -0,0 +1,97 @@
# The new class FASTsearch. Every DB can be represented as lists; the brain
# itself is actually constituted of lists. This gives access to all documents
# at the same moment.

# TODO: GPU multithreading has to be implemented.

# USAGE: Fit a scikit-learn CountVectorizer on a database of lines or docs;
# a minimal fitting sketch follows.
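
# A minimal fitting sketch (hypothetical file names, not part of this file):
#
#   from sklearn.feature_extraction.text import CountVectorizer
#   from sklearn.externals import joblib
#   import hickle as hkl
#
#   lines = open('database.txt').read().splitlines()
#   vectorizer = CountVectorizer()
#   bow = vectorizer.fit_transform(lines)       # document-term count matrix
#   joblib.dump(vectorizer, 'bow_model.pkl')    # -> BoWModelDir argument
#   hkl.dump(bow.toarray(), 'database.hkl')     # -> DatabaseDir argument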

from sklearn.externals import joblib
from sklearn.feature_extraction.text import CountVectorizer

import numpy as np
import scipy as sc
import scipy.sparse  # make sure the sparse submodule is actually loaded

import tensorflow as tf

import _pickle as cPickle
import hickle as hkl

import os


# Define a function to convert a scipy CSR matrix to a TF sparse tensor
# for working on the GPU.
def convert_sparse_matrix_to_sparse_tensor(X):
    coo = sc.sparse.coo_matrix(X)
    indices = np.mat([coo.row, coo.col]).transpose()
    return tf.SparseTensorValue(indices, coo.data, coo.shape)
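
# For example (an illustrative sketch, not from the original file): the 2x2
# matrix [[0, 3], [0, 0]] converts to indices [[0, 1]], values [3.] and
# dense_shape (2, 2) -- the triplet that TF 1.x sparse ops expect.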


class FASTsearch(object):

    def __init__(self, DatabaseDir, BoWModelDir):
        # input has to be in hkl format
        self.database = hkl.load(DatabaseDir).astype('float32')

        # input has to be in pkl format
        self.vectorizer = joblib.load(BoWModelDir)

    def search(self, string, numberofmatches):
        # Convert the user input to zeros and ones over the learned
        # vocabulary (bag-of-words counts).
        user_array = []
        user_array.append(string)

        user_input_OnesZeros = self.vectorizer.transform(user_array)
        uOZ = user_input_OnesZeros.toarray()[0].astype(np.float32, copy=False)

        # Shape the query as a column vector so it can right-multiply the
        # document-term matrix.
        uiOZ = uOZ[np.newaxis, :]
        uiOZ = uiOZ.transpose()

        sess = tf.Session()

        with sess.as_default():
            uiOZ_tensor = tf.constant(uiOZ)
            dbOZ_tensor_sparse = convert_sparse_matrix_to_sparse_tensor(self.database)

            # Sparse x dense matmul: entry n of the result is the number of
            # query-vocabulary hits in document n.
            wordCountDoku = tf.sparse_tensor_dense_matmul(dbOZ_tensor_sparse, uiOZ_tensor)

            wCD = np.array(wordCountDoku.eval())

        # Pair each document index with its match count and sort by count,
        # highest first (the initial reversal makes later documents win ties,
        # since sorted() is stable).
        indexedwCD = []
        for n in range(len(wCD)):
            indexedwCD.append([n, wCD[n][0]])

        indexedwCD = sorted(indexedwCD[::-1], key=lambda tup: tup[1], reverse=True)

        # Return the indices of the numberofmatches best-matching documents.
        best_n_documents = []
        for n in range(numberofmatches):
            best_n_documents.append(indexedwCD[n][0])

        return best_n_documents
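

# A minimal usage sketch (assumed file names matching the fitting sketch
# above; not part of the original file):
#
#   fs = FASTsearch('database.hkl', 'bow_model.pkl')
#   best = fs.search('some query words', 5)   # indices of the 5 best docs
#   print(best)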