first commit

2021-09-24 15:13:03 +02:00 · 2021-09-24 15:13:03 +02:00 · be4279eb64
commit be4279eb64
parent 51df51764c
4 changed files with 399 additions and 1 deletions
--- a/build/tf-gpu-FASTsearch/Dockerfile
+++ b/build/tf-gpu-FASTsearch/Dockerfile
@ -0,0 +1,46 @@
+FROM tensorflow/tensorflow:1.12.0-gpu
+
+# NOTE(review): python-dev / python-pip below are the Python 2 packages and this
+# tag looks like the Python 2 flavour of the image, while FASTsearch.py imports
+# `_pickle`, which only exists on Python 3 -- confirm which interpreter is meant.
+# A Python 3.5 source build and a virtualenv were tried here and left disabled;
+# the image's stock `python` / `pip` are used instead.
+
+# All system packages in one layer: a single `apt-get update` that can never be
+# served stale from the build cache, and the package lists are removed again so
+# they do not end up in the image.
+RUN apt-get update && apt-get install -y \
+        build-essential \
+        libssl-dev \
+        nodejs \
+        openssl \
+        python-dev \
+        python-pip \
+        virtualenv \
+        wget \
+    && rm -rf /var/lib/apt/lists/*
+
+# Printed once in the build log to show which interpreter the image provides.
+RUN python --version
+
+# Only the requirements file is copied before the pip layers, so editing the
+# application sources does not force the Python dependencies to be reinstalled.
+COPY requis.txt /home/requis.txt
+
+RUN pip install -r /home/requis.txt && python -m spacy download de
+
+RUN pip install hickle==3.4.9 Twisted joblib
+
+# Application sources last: they change most often.
+COPY Prototyp /home/Prototyp
+
+# For debugging, keep the container alive without starting the server:
+#ENTRYPOINT ["tail"]
+#CMD ["-f","/dev/null"]
+
+# Exec form instead of a shell string wrapped in another `/bin/sh -c`:
+# the server process is started directly from /home/Prototyp and receives the
+# container's signals itself.
+WORKDIR /home/Prototyp
+CMD ["nodejs", "server.js"]
--- a/build/tf-gpu-FASTsearch/FASTsearch.py
+++ b/build/tf-gpu-FASTsearch/FASTsearch.py
@ -0,0 +1,341 @@
+
+# FASTsearch: every database can be represented as lists (the "Brain" itself is made up of lists), which gives access to all documents at almost the same moment.
+
+# TODO GPU Multithreading has to be implemented.
+
+
+
+# USAGE: Learn scikit-learn count vectorizer on a database of lines or docs. 
+
+import joblib
+from sklearn.feature_extraction.text import CountVectorizer
+
+import numpy as np
+import scipy as sc
+
+import tensorflow as tf
+
+
+import _pickle as cPickle
+
+import hickle as hkl
+
+import os
+
+
+# Convert a SciPy sparse matrix into a TensorFlow sparse value so that it can be used in GPU ops.
+def convert_sparse_matrix_to_sparse_tensor(X):
+    """Return *X* as a ``tf.SparseTensorValue``.
+
+    Args:
+        X: Anything ``scipy.sparse.coo_matrix`` accepts (the callers in this
+            file pass the loaded count matrix).
+
+    Returns:
+        ``tf.SparseTensorValue(indices, values, dense_shape)`` where ``indices``
+        is an ``(nnz, 2)`` int64 array of ``[row, col]`` pairs.
+    """
+    coo = sc.sparse.coo_matrix(X)
+    # A plain ndarray instead of np.mat(...): np.matrix is discouraged by NumPy
+    # and the `np.mat` alias no longer exists in NumPy 2.0. int64 is the index
+    # dtype TensorFlow uses for sparse tensors.
+    indices = np.column_stack((coo.row, coo.col)).astype(np.int64, copy=False)
+    return tf.SparseTensorValue(indices, coo.data, coo.shape)
+
+
+
+
+# The class is initialised from a database stored as a two-dimensional list, e.g. [['word','word2'],[],[],[]]; the position of an inner list in the outer list is that document's id.
+## The entries of every inner list are joined with spaces, so each document ends up represented by one string.
+# The list has to be saved as an hkl (hickle) dump; the class loads the database from that dump.
+
+
+def my_tokenizer(s):
+    """Split *s* at every occurrence of the two-character delimiter: a backslash followed by a plus sign.
+
+    ``str.split`` takes a literal separator, not a regular expression, so the
+    backslash is part of the delimiter and a lone plus sign does not split.
+
+    Args:
+        s: Document string handed over by the vectorizer.
+
+    Returns:
+        List of the pieces between the delimiters (``[s]`` when none occurs).
+    """
+    # The delimiter used to be spelled with a single backslash, which is an
+    # invalid escape sequence (DeprecationWarning, SyntaxWarning on newer
+    # Pythons). The escaped spelling below is the very same runtime string.
+    # NOTE(review): if the backslash was meant as a regex escape, the intended
+    # delimiter may be a plain '+' -- confirm against the stored data before
+    # changing it, since dumped models depend on the current tokenisation.
+    return s.split('\\+')
+
+class FASTsearch(object):
+    """Bag-of-words document search scored with a sparse matrix product in TensorFlow.
+
+    Workflow as far as this file shows: create the instance from an hkl dump,
+    build the model files once with ``Gen_BoW_Model``, load them with
+    ``Load_BoW_Model`` and then call one of the ``search*`` methods.
+    """
+
+    def __init__(self, DatabaseDir):
+        """Load the database dump and turn every document into one string.
+
+        Args:
+            DatabaseDir: Path of the hkl dump. The loaded object is iterated and
+                the entries of each element are joined with single spaces.
+        """
+        # The last four characters are dropped and the remainder becomes part
+        # of the file names written by Gen_BoW_Model.
+        # NOTE(review): presumably this strips a ".hkl" suffix -- confirm.
+        self.DatabaseDir = DatabaseDir[:-4]
+
+        # input has to be hkl format
+        self.database = [' '.join(element) for element in hkl.load(DatabaseDir)]
+
+    def Gen_BoW_Model(self, max_features, analyzer, punctuation = False):
+        """Fit a CountVectorizer on the database and dump model and matrix to disk.
+
+        Two files are written to the current working directory:
+        'bagofwords' + self.DatabaseDir + '.pkl' (the vectorizer, via joblib) and
+        'DataBaseOneZeros' + self.DatabaseDir + '.hkl' (the count matrix,
+        gzip-compressed hickle).
+
+        Args:
+            max_features: Passed through to ``CountVectorizer``.
+            analyzer: Passed through to ``CountVectorizer``.
+            punctuation: When truthy, ``my_tokenizer`` is handed to the
+                vectorizer as ``tokenizer``; otherwise no tokenizer is set.
+
+        Returns:
+            The fitted vectorizer. ``self.vectorizer`` is not set here; that
+            happens in ``Load_BoW_Model``.
+        """
+        print("Creating the bag of words...\n")
+
+        # One construction for both cases. The former pair of `== False` /
+        # `== True` branches left `vectorizer` unbound for any other value
+        # (e.g. None) and failed later with an UnboundLocalError.
+        vectorizer = CountVectorizer(analyzer = analyzer,
+                                     tokenizer = my_tokenizer if punctuation else None,
+                                     preprocessor = None,
+                                     stop_words = None,
+                                     max_features = max_features)
+
+        # fit_transform() learns the vocabulary and converts the documents
+        # (a list of strings) into the count matrix in one step.
+        train_data_features = vectorizer.fit_transform(self.database)
+
+        joblib.dump(vectorizer, 'bagofwords' + self.DatabaseDir + '.pkl')
+
+        print('dumping the data to hkl format..')
+        hkl.dump(train_data_features, 'DataBaseOneZeros' + self.DatabaseDir + '.hkl', mode='w', compression='gzip')
+        print('done')
+
+        return vectorizer
+
+    def Load_BoW_Model(self, BoWModelDir, DatabaseOneZerosDir):
+        """Load the vectorizer and the count matrix written by ``Gen_BoW_Model``.
+
+        Args:
+            BoWModelDir: Path of the joblib dump of the vectorizer (pkl).
+            DatabaseOneZerosDir: Path of the hickle dump of the count matrix.
+
+        Returns:
+            The loaded vectorizer (also stored as ``self.vectorizer``; the
+            matrix is stored as float32 in ``self.dbOZ``).
+        """
+        # input has to be pkl format
+        self.vectorizer = joblib.load(BoWModelDir)
+
+        self.dbOZ = hkl.load(DatabaseOneZerosDir).astype('float32')
+
+        return self.vectorizer
+
+    def _vectorize_query(self, string):
+        """Return the word counts of *string* as a float32 column vector of shape (vocabulary, 1)."""
+        counts = self.vectorizer.transform([string])
+        return counts.toarray()[0].astype(np.float32, copy=False)[:, np.newaxis]
+
+    def _ranked_scores(self, query_column):
+        """Score every document against the query and rank the result.
+
+        The score of a document is the dot product of its count row with the
+        query counts, computed as one sparse-dense matrix product.
+
+        Returns:
+            List of ``[document index, score]`` sorted by descending score;
+            among equal scores the higher document index comes first.
+        """
+        # A private graph per call keeps the process-wide default graph from
+        # growing with every search.
+        graph = tf.Graph()
+        with graph.as_default(), tf.device('/gpu:0'):
+            db_sparse = convert_sparse_matrix_to_sparse_tensor(self.dbOZ)
+            product = tf.sparse_tensor_dense_matmul(db_sparse, tf.constant(query_column))
+
+        # The `with` block closes the session again (it used to be leaked on
+        # every call). Soft placement lets the op fall back to the CPU when
+        # '/gpu:0' is not available instead of failing.
+        config = tf.ConfigProto(allow_soft_placement=True)
+        with tf.Session(graph=graph, config=config) as sess:
+            scores = np.array(sess.run(product))
+
+        pairs = [[doc_id, score] for doc_id, score in enumerate(scores[:, 0])]
+        # Reversing before the stable sort is what puts higher indexes first
+        # among ties -- kept exactly as before.
+        return sorted(pairs[::-1], key=lambda pair: pair[1], reverse=True)
+
+    # input: string to search for in the documents, the numberofmatches to get the best n documents
+    # output: the chosen document index plus the top ranked [index, score] pair
+    def search(self, string, numberofmatches):
+        """Return the document whose score is closest to the query's own score from above.
+
+        The reference value is the sum of the squared query counts, i.e. the
+        score of a document with exactly the query's word counts. Walking down
+        the ranking, the first document that hits this value is returned; if
+        the scores drop below it first, the document ranked just before the
+        drop is returned (the top document when even that one is below).
+
+        Args:
+            string: Query text.
+            numberofmatches: Accepted for interface compatibility; not used.
+
+        Returns:
+            Tuple ``(document index, [index, score] of the top ranked document)``.
+
+        Raises:
+            IndexError: If the database contains no documents.
+        """
+        query = self._vectorize_query(string)
+        ranked = self._ranked_scores(query)
+
+        eq_number = np.sum(np.square(query))
+
+        # The former while loop incremented its counter before indexing: rank 0
+        # was never tested and running off the end raised an IndexError.
+        # Only visible difference for inputs that used to work: when several
+        # documents hit the reference value, the first of them is returned now.
+        best_n_documents = []
+        previous_doc = None
+        for doc_id, score in ranked:
+            if score == eq_number:
+                best_n_documents = doc_id
+                break
+            if score < eq_number:
+                best_n_documents = doc_id if previous_doc is None else previous_doc
+                break
+            previous_doc = doc_id
+        else:
+            # Every score lies above the reference value: the last document is
+            # the closest one from above.
+            if previous_doc is not None:
+                best_n_documents = previous_doc
+
+        return best_n_documents, ranked[0]
+
+    def search_with_highest_multiplikation_Output(self, string, numberofmatches):
+        """Return the indexes of the *numberofmatches* highest scoring documents.
+
+        Args:
+            string: Query text.
+            numberofmatches: Maximum number of document indexes to return.
+
+        Returns:
+            Tuple ``(list of document indexes, [index, score] of the top ranked
+            document)``. Fewer indexes are returned when the database holds
+            fewer documents (this used to raise an IndexError).
+
+        Raises:
+            IndexError: If the database contains no documents.
+        """
+        ranked = self._ranked_scores(self._vectorize_query(string))
+
+        best_n_documents = [doc_id for doc_id, _ in ranked[:max(numberofmatches, 0)]]
+
+        return best_n_documents, ranked[0]
+
+    def searchPatternMatch(self, string, numberofmatches):
+        """Rank the documents whose score equals the query's own score.
+
+        Candidates are all documents whose score equals the sum of the squared
+        query counts. Each candidate is then rated by the number of vocabulary
+        positions at which its count equals the query's count (positions where
+        both are zero count as well).
+
+        Args:
+            string: Query text.
+            numberofmatches: Accepted for interface compatibility; not used.
+
+        Returns:
+            List of ``[document index, number of equal positions]`` sorted by
+            descending number; empty when no document hits the reference score.
+        """
+        query = self._vectorize_query(string)
+        ranked = self._ranked_scores(query)
+
+        # Kept as a one-element array so that the diagnostic print below keeps
+        # its previous output format.
+        eq_number = np.sum(query ** 2, axis=0)
+        print(eq_number)
+        target = eq_number[0]
+
+        # Same off-by-one as in search(): rank 0 used to be skipped and the
+        # loop raised an IndexError when no score fell below the target.
+        best_docs_surrounding = []
+        for doc_id, score in ranked:
+            if score < target:
+                break
+            if score == target:
+                best_docs_surrounding.append(doc_id)
+
+        # One vectorised comparison per candidate instead of a Python loop over
+        # the whole vocabulary.
+        query_counts = query[:, 0]
+        best_docs_surrounding_new = []
+        for doc in best_docs_surrounding:
+            dok_BoW = self.dbOZ[doc].toarray()[0].astype(np.float32, copy=False)
+            Number_equal_words = int(np.count_nonzero(query_counts == dok_BoW))
+            best_docs_surrounding_new.append([doc, Number_equal_words])
+
+        # Sort the result again with the original indexes
+        return sorted(best_docs_surrounding_new[::-1], key=lambda pair: pair[1], reverse=True)
+
+
--- a/compose/docker-compose.yml
+++ b/compose/docker-compose.yml
@ -0,0 +1,12 @@
+version: '2.3'
+
+services:
+  # Single service; its port 7000 is published on the host's loopback
+  # interface only, and the container is always restarted.
+  prototype:
+    # NOTE(review): the Dockerfile added in this commit lives in
+    # build/tf-gpu-FASTsearch -- confirm that ../build/tf-gpu-Prototyp exists.
+    build: ../build/tf-gpu-Prototyp
+    container_name: prototype
+    restart: always
+    ports:
+      - "127.0.0.1:7000:7000"
--- a/1
+++ b/1
@ -1 +0,0 @@
-ölaksfd