You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

97 lines
2.6 KiB

  1. # The new class FASTsearch. Every DB can be represented as lists. The Brain is actually constituted from lists. Access to all documents at the same moment.
  2. # TODO: GPU multithreading still has to be implemented.
  3. # USAGE: Fit a scikit-learn CountVectorizer on a database of lines or docs.
  4. from sklearn.externals import joblib
  5. from sklearn.feature_extraction.text import CountVectorizer
  6. import numpy as np
  7. import scipy as sc
  8. import tensorflow as tf
  9. import _pickle as cPickle
  10. import hickle as hkl
  11. import os
  12. # Define function to convert scipy csr matrix to tf tensor for working on gpu
  13. def convert_sparse_matrix_to_sparse_tensor(X):
  14. coo = sc.sparse.coo_matrix(X)
  15. indices = np.mat([coo.row, coo.col]).transpose()
  16. return tf.SparseTensorValue(indices, coo.data, coo.shape)
  17. class FASTsearch(object):
  18. def __init__(self, DatabaseDir, BoWModelDir):
  19. # input has to be hkl format
  20. self.database = hkl.load(DatabaseDir).astype('float32')
  21. # input has to be pkl format
  22. self.vectorizer = joblib.load(BoWModelDir)
  23. def search(self, string , numberofmatches):
  24. numberofmatches = numberofmatches
  25. # Convert user input to Zeros and Ones
  26. user_array = []
  27. user_array.append(string)
  28. user_input_OnesZeros = self.vectorizer.transform(user_array)
  29. uOZ = user_input_OnesZeros.toarray()[0].astype(np.float32, copy=False)
  30. uiOZ = uOZ[np.newaxis, :]
  31. uiOZ = uiOZ.transpose()
  32. sess = tf.Session()
  33. with sess.as_default():
  34. uiOZ_tensor = tf.constant(uiOZ)
  35. dbOZ_tensor_sparse = convert_sparse_matrix_to_sparse_tensor(dbOZ)
  36. #uiOZ_tensor_sparse =tf.contrib.layers.dense_to_sparse(uiOZ_tensor, eos_token=0, outputs_collections=None, scope=None )
  37. #dbOZ_tensor_sparse =tf.contrib.layers.dense_to_sparse(dbOZ_tensor, eos_token=0, outputs_collections=None, scope=None )
  38. #wordCountDoku = tf.matmul(uiOZ_tensor, dbOZ_tensor)
  39. wordCountDoku = tf.sparse_tensor_dense_matmul(dbOZ_tensor_sparse, uiOZ_tensor)
  40. wCD = np.array(wordCountDoku.eval())
  41. indexedwCD = []
  42. for n in range(len(wCD)):
  43. indexedwCD.append([n,wCD[n][0]])
  44. indexedwCD = sorted(indexedwCD[::-1], key=lambda tup: tup[1], reverse=True)
  45. best_n_documents = []
  46. for n in range(numberofmatches):
  47. best_n_documents.append(indexedwCD[n][0])
  48. return best_n_documents