You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

171 lines
3.3 KiB

4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
  1. # prototype User Interface
  2. from sklearn.externals import joblib
  3. from sklearn.feature_extraction.text import CountVectorizer
  4. import numpy as np
  5. import scipy as sc
  6. import tensorflow as tf
  7. import _pickle as cPickle
  8. import hickle as hkl
  9. import os
  10. # Define function to convert scipy csr matrix to tf tensor for working on gpu
  11. def convert_sparse_matrix_to_sparse_tensor(X):
  12. coo = sc.sparse.coo_matrix(X)
  13. indices = np.mat([coo.row, coo.col]).transpose()
  14. return tf.SparseTensorValue(indices, coo.data, coo.shape)
  15. #Load the zeros and ones from the database
  16. dbOZ = hkl.load('bagofwords/OnesZerosDB_gzip.hkl')
  17. print(len(dbOZ))
  18. # transpose with csr transpose
  19. #dbOZ = dbOZ.transpose()
  20. # as type with csr as type
  21. dbOZ = dbOZ.astype('float32')
  22. #print(type(convert_sparse_matrix_to_sparse_tensor(dbOZ)))
  23. #print('bla',dbOZ)
  24. #dbOZ = dbOZ.transpose()
  25. #dbOZ = np.transpose(np.array(dbOZ).astype(np.float32, copy=False))
  26. #print('bla',dbOZ)
  27. #dbOZ.transpose()
  28. # Get the user input
  29. user_input_words = input("Please describe your problem: ")
  30. user_input_n = int(input("How many dokuments would you like to display?: "))
  31. # Convert user input to Zeros and Ones
  32. user_array = []
  33. user_array.append(user_input_words)
  34. print(user_array)
  35. from nltk.stem.snowball import SnowballStemmer
  36. stemmer = SnowballStemmer("german")
  37. user_array = [stemmer.stem(word) for word in user_array]
  38. print(user_array)
  39. vectorizer = joblib.load('bagofwords/bagofwords.pkl')
  40. user_input_OnesZeros = vectorizer.transform(user_array)
  41. uOZ = user_input_OnesZeros.toarray()[0].astype(np.float32, copy=False)
  42. #uiOZ = uOZ
  43. #uiOZ = np.transpose(uOZ[np.newaxis, :])
  44. uiOZ = uOZ[np.newaxis, :]
  45. uiOZ = uiOZ.transpose()
  46. #print('1', uiOZ)
  47. #print('2', dbOZ)
  48. sess = tf.Session()
  49. with sess.as_default():
  50. uiOZ_tensor = tf.constant(uiOZ)
  51. dbOZ_tensor_sparse = convert_sparse_matrix_to_sparse_tensor(dbOZ)
  52. #uiOZ_tensor_sparse =tf.contrib.layers.dense_to_sparse(uiOZ_tensor, eos_token=0, outputs_collections=None, scope=None )
  53. #dbOZ_tensor_sparse =tf.contrib.layers.dense_to_sparse(dbOZ_tensor, eos_token=0, outputs_collections=None, scope=None )
  54. #wordCountDoku = tf.matmul(uiOZ_tensor, dbOZ_tensor)
  55. wordCountDoku = tf.sparse_tensor_dense_matmul(dbOZ_tensor_sparse, uiOZ_tensor)
  56. wCD = np.array(wordCountDoku.eval())
  57. indexedwCD = []
  58. for n in range(len(wCD)):
  59. indexedwCD.append([n,wCD[n][0]])
  60. #print('0',indexedwCD)
  61. #indexedwCD = np.transpose(np.array(indexedwCD))
  62. #print('1',indexedwCD)
  63. indexedwCD = sorted(indexedwCD[::-1], key=lambda tup: tup[1], reverse=True)
  64. #print('2', indexedwCD)
  65. #print('2',indexedwCD[::-1])
  66. #print(indexedwCD)
  67. #print('Here come the fuckn best %d dokument/s that match your problem:' %(user_input_n))
  68. best_n_documents = []
  69. for n in range(user_input_n):
  70. #print(indexedwCD[n][0])
  71. best_n_documents.append(indexedwCD[n][0])
  72. #print(best_n_documents)
  73. cwd = os.getcwd()
  74. #rechtsprechIn = os.listdir(cwd + '/' + 'EndDokumente')
  75. rechtsprechIn = hkl.load('bagofwords/rechtsprechIn_gzip.hkl')
  76. #print(rechtsprechIn)
  77. from subprocess import call
  78. for n in range(user_input_n):
  79. call(['nano', cwd + '/' + 'EndDokumente/' + rechtsprechIn[int(best_n_documents[n])]])
  80. # Calculate the best matching parallelized with tf
  81. # Get the id of documents which fit the best
  82. # Display the n best matching dokuments