You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

341 lines
11 KiB

4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
# The FASTsearch class. Every database can be represented as lists; the "brain" is in effect constituted from lists, which gives access to all documents at almost the same moment.
# TODO: GPU multithreading still has to be implemented.
# USAGE: fit a scikit-learn CountVectorizer on a database of lines or documents, then search it.
  4. from sklearn.externals import joblib
  5. from sklearn.feature_extraction.text import CountVectorizer
  6. import numpy as np
  7. import scipy as sc
  8. import tensorflow as tf
  9. import _pickle as cPickle
  10. import hickle as hkl
  11. import os
  12. # Define function to convert scipy csr matrix to tf tensor for working on gpu
  13. def convert_sparse_matrix_to_sparse_tensor(X):
  14. coo = sc.sparse.coo_matrix(X)
  15. indices = np.mat([coo.row, coo.col]).transpose()
  16. return tf.SparseTensorValue(indices, coo.data, coo.shape)
# The class is initialised from a database in two-dimensional list format, [['word', 'word2'], [], [], []]; the index of each inner list within the outer list defines its document id.
## Each element of the outer list represents one document; its entries are joined into one string per document.
# This list must be saved as an hkl (hickle) dump, which is then loaded as the database.
  20. def my_tokenizer(s):
  21. return s.split('\+')
  22. class FASTsearch(object):
  23. def __init__(self, DatabaseDir):
  24. self.DatabaseDir = DatabaseDir[:-4]
  25. database = []
  26. hkl_load = hkl.load(DatabaseDir)
  27. for element in hkl_load:
  28. #print('element',element)
  29. #print('joined element', ' '.join(element))
  30. database.append(' '.join(element))
  31. # input has to be hkl format
  32. self.database = database
  33. def Gen_BoW_Model(self, max_features, analyzer, punctuation = False):
  34. print("Creating the bag of words...\n")
  35. from sklearn.feature_extraction.text import CountVectorizer
  36. # Initialize the "CountVectorizer" object, which is scikit-learn's
  37. # bag of words tool.
  38. if punctuation == False:
  39. vectorizer = CountVectorizer(analyzer = analyzer, \
  40. tokenizer = None, \
  41. preprocessor = None, \
  42. stop_words = None, \
  43. max_features = max_features)
  44. if punctuation == True:
  45. vectorizer = CountVectorizer(analyzer = analyzer, \
  46. tokenizer = my_tokenizer, \
  47. preprocessor = None, \
  48. stop_words = None, \
  49. max_features = max_features)
  50. # token_pattern = r'(?u)\w')
  51. # fit_transform() does two functions: First, it fits the model
  52. # and learns the vocabulary; second, it transforms our training data
  53. # into feature vectors. The input to fit_transform should be a list of
  54. # strings.
  55. train_data_features = vectorizer.fit_transform(self.database)
  56. joblib.dump(vectorizer, 'bagofwords' + self.DatabaseDir + '.pkl')
  57. print('dumping the data to hkl format..')
  58. hkl.dump(train_data_features, 'DataBaseOneZeros' + self.DatabaseDir + '.hkl', mode='w', compression='gzip')
  59. print('done')
  60. return vectorizer
  61. def Load_BoW_Model(self, BoWModelDir, DatabaseOneZerosDir):
  62. # input has to be pkl format
  63. self.vectorizer = joblib.load(BoWModelDir)
  64. self.dbOZ = hkl.load(DatabaseOneZerosDir).astype('float32')
  65. return self.vectorizer
  66. # input: string to search for in the documents, the numberofmatches to get the best n documents
  67. # output the numberofmatches documents with their indexes on the database which is searched, the highest accordance number plus index [index, number]
  68. def search(self, string , numberofmatches):
  69. numberofmatches = numberofmatches
  70. # Convert user input to Zeros and Ones
  71. user_array = []
  72. user_array.append(string)
  73. user_input_OnesZeros = self.vectorizer.transform(user_array)
  74. uOZ = user_input_OnesZeros.toarray()[0].astype(np.float32, copy=False)
  75. uiOZ = uOZ[np.newaxis, :]
  76. uiOZ = uiOZ.transpose()
  77. sess = tf.Session()
  78. with tf.device('/gpu:0'):
  79. with sess.as_default():
  80. uiOZ_tensor = tf.constant(uiOZ)
  81. dbOZ_tensor_sparse = convert_sparse_matrix_to_sparse_tensor(self.dbOZ)
  82. #uiOZ_tensor_sparse =tf.contrib.layers.dense_to_sparse(uiOZ_tensor, eos_token=0, outputs_collections=None, scope=None )
  83. #dbOZ_tensor_sparse =tf.contrib.layers.dense_to_sparse(dbOZ_tensor, eos_token=0, outputs_collections=None, scope=None )
  84. #wordCountDoku = tf.matmul(uiOZ_tensor, dbOZ_tensor)
  85. wordCountDoku = tf.sparse_tensor_dense_matmul(dbOZ_tensor_sparse, uiOZ_tensor)
  86. wCD = np.array(wordCountDoku.eval())
  87. indexedwCD = []
  88. for n in range(len(wCD)):
  89. indexedwCD.append([n,wCD[n][0]])
  90. indexedwCD = sorted(indexedwCD[::-1], key=lambda tup: tup[1], reverse=True)
  91. best_n_documents = []
  92. eq_number = 0
  93. for number in uiOZ:
  94. #print(number)
  95. eq_number += number ** 2
  96. #print(eq_number)
  97. n = 0
  98. done = False
  99. while n < len(indexedwCD) and done == False:
  100. n += 1
  101. if indexedwCD[n][1] == eq_number:
  102. best_n_documents = indexedwCD[n][0]
  103. done = True
  104. if indexedwCD[n][1] < eq_number:
  105. best_n_documents = indexedwCD[n - 1][0]
  106. done = True
  107. #for n in range(numberofmatches):
  108. #best_n_documents.append([indexedwCD[n][0], indexedwCD[n][1]])
  109. return best_n_documents, indexedwCD[0]
  110. def search_with_highest_multiplikation_Output(self, string , numberofmatches):
  111. numberofmatches = numberofmatches
  112. # Convert user input to Zeros and Ones
  113. user_array = []
  114. user_array.append(string)
  115. user_input_OnesZeros = self.vectorizer.transform(user_array)
  116. uOZ = user_input_OnesZeros.toarray()[0].astype(np.float32, copy=False)
  117. uiOZ = uOZ[np.newaxis, :]
  118. uiOZ = uiOZ.transpose()
  119. sess = tf.Session()
  120. with tf.device('/gpu:0'):
  121. with sess.as_default():
  122. uiOZ_tensor = tf.constant(uiOZ)
  123. dbOZ_tensor_sparse = convert_sparse_matrix_to_sparse_tensor(self.dbOZ)
  124. #uiOZ_tensor_sparse =tf.contrib.layers.dense_to_sparse(uiOZ_tensor, eos_token=0, outputs_collections=None, scope=None )
  125. #dbOZ_tensor_sparse =tf.contrib.layers.dense_to_sparse(dbOZ_tensor, eos_token=0, outputs_collections=None, scope=None )
  126. #wordCountDoku = tf.matmul(uiOZ_tensor, dbOZ_tensor)
  127. wordCountDoku = tf.sparse_tensor_dense_matmul(dbOZ_tensor_sparse, uiOZ_tensor)
  128. wCD = np.array(wordCountDoku.eval())
  129. indexedwCD = []
  130. for n in range(len(wCD)):
  131. indexedwCD.append([n,wCD[n][0]])
  132. indexedwCD = sorted(indexedwCD[::-1], key=lambda tup: tup[1], reverse=True)
  133. best_n_documents = []
  134. for n in range(numberofmatches):
  135. best_n_documents.append(indexedwCD[n][0])
  136. return best_n_documents, indexedwCD[0]
  137. def searchPatternMatch(self, string , numberofmatches):
  138. numberofmatches = numberofmatches
  139. # Convert user input to Zeros and Ones
  140. user_array = []
  141. user_array.append(string)
  142. user_input_OnesZeros = self.vectorizer.transform(user_array)
  143. uOZ = user_input_OnesZeros.toarray()[0].astype(np.float32, copy=False)
  144. uiOZ = uOZ[np.newaxis, :]
  145. uiOZ = uiOZ.transpose()
  146. sess = tf.Session()
  147. with tf.device('/gpu:0'):
  148. with sess.as_default():
  149. uiOZ_tensor = tf.constant(uiOZ)
  150. dbOZ_tensor_sparse = convert_sparse_matrix_to_sparse_tensor(self.dbOZ)
  151. #uiOZ_tensor_sparse =tf.contrib.layers.dense_to_sparse(uiOZ_tensor, eos_token=0, outputs_collections=None, scope=None )
  152. #dbOZ_tensor_sparse =tf.contrib.layers.dense_to_sparse(dbOZ_tensor, eos_token=0, outputs_collections=None, scope=None )
  153. #wordCountDoku = tf.matmul(uiOZ_tensor, dbOZ_tensor)
  154. wordCountDoku = tf.sparse_tensor_dense_matmul(dbOZ_tensor_sparse, uiOZ_tensor)
  155. wCD = np.array(wordCountDoku.eval())
  156. indexedwCD = []
  157. for n in range(len(wCD)):
  158. indexedwCD.append([n,wCD[n][0]])
  159. # Sort the biggest matches
  160. indexedwCD = sorted(indexedwCD[::-1], key=lambda tup: tup[1], reverse=True)
  161. best_n_documents = []
  162. best_docs_surrounding = []
  163. # Get the number which is result when same words would be in the document as in one grammar scheme
  164. eq_number = 0
  165. for number in uiOZ:
  166. #print(number)
  167. eq_number += number ** 2
  168. print(eq_number)
  169. # Create new array of closest grammar schemes, I have chosen around 3 (in the matchnumber, not regarding words or so)
  170. n = 0
  171. done = False
  172. while n < len(indexedwCD) and done == False:
  173. n += 1
  174. #print('a',indexedwCD)
  175. #print('oo', indexedwCD[n])
  176. if indexedwCD[n][1] == eq_number:
  177. best_docs_surrounding.append(indexedwCD[n][0])
  178. #if indexedwCD[n][1] < eq_number:
  179. #best_docs_surrounding.append(indexedwCD[n][0])
  180. if indexedwCD[n][1] < eq_number :
  181. done = True
  182. # Count for these docs in surrounding the matches of wordnumbers per word
  183. # would be much faster when using the sparse class
  184. best_docs_surrounding_new = []
  185. for doc in best_docs_surrounding:
  186. dok_BoW = self.dbOZ[doc].toarray()[0].astype(np.float32, copy=False)
  187. Number_equal_words = 0
  188. for n in range(len(uiOZ)):
  189. #print(uiOZ[n])
  190. #print(dok_BoW[n])
  191. #print('dok_BoW',dok_BoW)
  192. if uiOZ[n] == dok_BoW[n]:
  193. Number_equal_words += 1
  194. best_docs_surrounding_new.append([doc , Number_equal_words])
  195. # Sort the result again with the original indexes
  196. best_n_documents = sorted(best_docs_surrounding_new[::-1], key=lambda tup: tup[1], reverse=True)
  197. #for n in range(numberofmatches):
  198. #best_n_documents.append([indexedwCD[n][0], indexedwCD[n][1]])
  199. return best_n_documents