# The FASTsearch class: every database can be represented as lists (the brain itself is
# effectively constituted of lists), which gives access to all documents at almost the same moment.
# TODO: GPU multithreading still has to be implemented.
# USAGE: learn a scikit-learn CountVectorizer model on a database of lines or documents.
import joblib
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import scipy as sc
import tensorflow.compat.v1 as tf
tf.disable_eager_execution()
import _pickle as cPickle
import hickle as hkl
import os
# Convert a scipy sparse (csr) matrix to a tf SparseTensorValue so the
# multiplication can run on the GPU.
def convert_sparse_matrix_to_sparse_tensor(X):
    coo = sc.sparse.coo_matrix(X)
    indices = np.array([coo.row, coo.col]).transpose()
    return tf.SparseTensorValue(indices, coo.data, coo.shape)
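
# A minimal sketch of what the conversion produces (the 2x3 matrix below is a made-up
# example, not part of the original code):
#
#   from scipy.sparse import csr_matrix
#   X = csr_matrix([[1, 0, 2], [0, 3, 0]])
#   st = convert_sparse_matrix_to_sparse_tensor(X)
#   # st.indices -> [[0, 0], [0, 2], [1, 1]], st.values -> [1., 2., 3.], st.dense_shape -> (2, 3)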
# The class is initialized with the database in 2-dimensional list format,
# e.g. [['word', 'word2'], [...], ...]; the index of each inner list defines the document id.
# Every inner list is joined into one string, so each document is represented by a single string.
# This list must be saved as a hkl dump, which is then loaded here as the database.
# Tokenizer that splits on the literal ' \+ ' separator (used when punctuation
# should be kept as part of the tokens).
def my_tokenizer(s):
    return s.split(' \+ ')
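
# Hypothetical illustration of the custom tokenizer (the input string is made up):
#
#   my_tokenizer('he \+ said \+ hello!')  ->  ['he', 'said', 'hello!']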
class FASTsearch(object):

    def __init__(self, DatabaseDir):
        # Input has to be a hkl dump; the '.hkl' extension is stripped for later file names.
        self.DatabaseDir = DatabaseDir[:-4]
        database = []
        hkl_load = hkl.load(DatabaseDir)
        for element in hkl_load:
            database.append(' '.join(element))
        self.database = database
    def Gen_BoW_Model(self, max_features, analyzer, punctuation=False):
        print("Creating the bag of words...\n")
        # Initialize the "CountVectorizer" object, which is scikit-learn's
        # bag of words tool.
        if not punctuation:
            vectorizer = CountVectorizer(analyzer=analyzer,
                                         tokenizer=None,
                                         preprocessor=None,
                                         stop_words=None,
                                         max_features=max_features)
        else:
            vectorizer = CountVectorizer(analyzer=analyzer,
                                         tokenizer=my_tokenizer,
                                         preprocessor=None,
                                         stop_words=None,
                                         max_features=max_features)
            # token_pattern = r'(?u)\w')
        # fit_transform() does two things: first, it fits the model and learns the
        # vocabulary; second, it transforms the training data into feature vectors.
        # The input to fit_transform should be a list of strings.
        train_data_features = vectorizer.fit_transform(self.database)
        joblib.dump(vectorizer, 'bagofwords' + self.DatabaseDir + '.pkl')
        print('dumping the data to hkl format..')
        hkl.dump(train_data_features, 'DataBaseOneZeros' + self.DatabaseDir + '.hkl', mode='w', compression='gzip')
        print('done')
        return vectorizer
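
    # Hypothetical one-off model generation; 'myDB.hkl' is an assumed example database
    # dump, the output file names follow the naming scheme used above:
    #
    #   fs = FASTsearch('myDB.hkl')
    #   fs.Gen_BoW_Model(5000, 'word')
    #   # -> writes 'bagofwordsmyDB.pkl' and 'DataBaseOneZerosmyDB.hkl'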
    def Load_BoW_Model(self, BoWModelDir, DatabaseOneZerosDir):
        # BoWModelDir has to be in pkl format, DatabaseOneZerosDir in hkl format.
        self.vectorizer = joblib.load(BoWModelDir)
        self.dbOZ = hkl.load(DatabaseOneZerosDir).astype('float32')
        return self.vectorizer
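
    # Hypothetical loading of a previously generated model (same example file names as above):
    #
    #   fs = FASTsearch('myDB.hkl')
    #   fs.Load_BoW_Model('bagofwordsmyDB.pkl', 'DataBaseOneZerosmyDB.hkl')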
    # Input: the string to search for in the documents and numberofmatches, the number of best documents to return.
    # Output: the index of the document whose score is closest to an exact match of the query,
    # plus the highest scoring [index, score] pair.
    def search(self, string, numberofmatches):
        # Convert the user input to the bag-of-words zeros/ones representation.
        user_array = []
        user_array.append(string)
        user_input_OnesZeros = self.vectorizer.transform(user_array)
        uOZ = user_input_OnesZeros.toarray()[0].astype(np.float32, copy=False)
        uiOZ = uOZ[np.newaxis, :]
        uiOZ = uiOZ.transpose()

        sess = tf.Session()
        with tf.device('/gpu:0'):
            with sess.as_default():
                uiOZ_tensor = tf.constant(uiOZ)
                dbOZ_tensor_sparse = convert_sparse_matrix_to_sparse_tensor(self.dbOZ)
                # Multiply the sparse database matrix with the dense query vector on the GPU.
                wordCountDoku = tf.sparse_tensor_dense_matmul(dbOZ_tensor_sparse, uiOZ_tensor)
                wCD = np.array(wordCountDoku.eval())

        indexedwCD = []
        for n in range(len(wCD)):
            indexedwCD.append([n, wCD[n][0]])
        indexedwCD = sorted(indexedwCD[::-1], key=lambda tup: tup[1], reverse=True)
        best_n_documents = []
        # Score a document would reach if it contained exactly the same words
        # (with the same counts) as the query.
        eq_number = 0
        for number in uiOZ:
            eq_number += number ** 2

        # Walk down the sorted scores: take the first document that matches the query
        # score exactly, otherwise fall back to the last document scoring above it.
        n = 0
        done = False
        while n < len(indexedwCD) and not done:
            if indexedwCD[n][1] == eq_number:
                best_n_documents = indexedwCD[n][0]
                done = True
            elif indexedwCD[n][1] < eq_number:
                best_n_documents = indexedwCD[max(n - 1, 0)][0]
                done = True
            n += 1

        return best_n_documents, indexedwCD[0]
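
    # Hypothetical call (assumes Load_BoW_Model has been called first; the query string
    # is made up, and the second argument is currently unused by search()):
    #
    #   best_index, best_pair = fs.search('she was reading a book', 1)
    #   # best_pair == [index_of_best_doc, score]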
    def search_with_highest_multiplikation_Output(self, string, numberofmatches):
        # Returns the indices of the numberofmatches documents with the highest raw
        # word-count products, plus the best [index, score] pair.
        # Convert the user input to the bag-of-words zeros/ones representation.
        user_array = []
        user_array.append(string)
        user_input_OnesZeros = self.vectorizer.transform(user_array)
        uOZ = user_input_OnesZeros.toarray()[0].astype(np.float32, copy=False)
        uiOZ = uOZ[np.newaxis, :]
        uiOZ = uiOZ.transpose()

        sess = tf.Session()
        with tf.device('/gpu:0'):
            with sess.as_default():
                uiOZ_tensor = tf.constant(uiOZ)
                dbOZ_tensor_sparse = convert_sparse_matrix_to_sparse_tensor(self.dbOZ)
                # Multiply the sparse database matrix with the dense query vector on the GPU.
                wordCountDoku = tf.sparse_tensor_dense_matmul(dbOZ_tensor_sparse, uiOZ_tensor)
                wCD = np.array(wordCountDoku.eval())

        indexedwCD = []
        for n in range(len(wCD)):
            indexedwCD.append([n, wCD[n][0]])
        indexedwCD = sorted(indexedwCD[::-1], key=lambda tup: tup[1], reverse=True)

        best_n_documents = []
        for n in range(numberofmatches):
            best_n_documents.append(indexedwCD[n][0])

        return best_n_documents, indexedwCD[0]
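
    # Hypothetical call returning the three best matching document indices
    # (again assumes a loaded model; the query string is made up):
    #
    #   top_three, best_pair = fs.search_with_highest_multiplikation_Output('she was reading a book', 3)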
    def searchPatternMatch(self, string, numberofmatches):
        # Convert the user input to the bag-of-words zeros/ones representation.
        user_array = []
        user_array.append(string)
        user_input_OnesZeros = self.vectorizer.transform(user_array)
        uOZ = user_input_OnesZeros.toarray()[0].astype(np.float32, copy=False)
        uiOZ = uOZ[np.newaxis, :]
        uiOZ = uiOZ.transpose()

        sess = tf.Session()
        with tf.device('/gpu:0'):
            with sess.as_default():
                uiOZ_tensor = tf.constant(uiOZ)
                dbOZ_tensor_sparse = convert_sparse_matrix_to_sparse_tensor(self.dbOZ)
                # Multiply the sparse database matrix with the dense query vector on the GPU.
                wordCountDoku = tf.sparse_tensor_dense_matmul(dbOZ_tensor_sparse, uiOZ_tensor)
                wCD = np.array(wordCountDoku.eval())

        indexedwCD = []
        for n in range(len(wCD)):
            indexedwCD.append([n, wCD[n][0]])
        # Sort by the biggest matches.
        indexedwCD = sorted(indexedwCD[::-1], key=lambda tup: tup[1], reverse=True)

        best_n_documents = []
        best_docs_surrounding = []

        # Score a document would reach if it contained exactly the same words as one
        # grammar scheme of the query.
        eq_number = 0
        for number in uiOZ:
            eq_number += number ** 2
        print(eq_number)

        # Collect the documents whose scores are closest to that number
        # (closeness refers to the match number only, not the individual words).
        n = 0
        done = False
        while n < len(indexedwCD) and not done:
            if indexedwCD[n][1] == eq_number:
                best_docs_surrounding.append(indexedwCD[n][0])
            if indexedwCD[n][1] < eq_number:
                done = True
            n += 1

        # For the documents in this surrounding, count how many word counts match
        # exactly, word by word (this would be much faster using the sparse class).
        best_docs_surrounding_new = []
        for doc in best_docs_surrounding:
            dok_BoW = self.dbOZ[doc].toarray()[0].astype(np.float32, copy=False)
            Number_equal_words = 0
            for n in range(len(uiOZ)):
                if uiOZ[n][0] == dok_BoW[n]:
                    Number_equal_words += 1
            best_docs_surrounding_new.append([doc, Number_equal_words])

        # Sort the result again, keeping the original indexes.
        best_n_documents = sorted(best_docs_surrounding_new[::-1], key=lambda tup: tup[1], reverse=True)

        return best_n_documents
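

# A minimal, hypothetical end-to-end sketch (the file names and the query are made up;
# it assumes a hkl database dump created as described in the comments above):
if __name__ == '__main__':
    fs = FASTsearch('myDB.hkl')
    # Generate the bag-of-words model once ...
    fs.Gen_BoW_Model(5000, 'word')
    # ... then load it and query the database.
    fs.Load_BoW_Model('bagofwordsmyDB.pkl', 'DataBaseOneZerosmyDB.hkl')
    matches, best = fs.search_with_highest_multiplikation_Output('she was reading a book', 3)
    print(matches, best)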