You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

190 lines
6.0 KiB

4 years ago
  1. """Count occurrences of uint64-valued keys."""
  2. from __future__ import division
  3. cimport cython
  4. from libc.math cimport log, exp, sqrt
  5. cdef class PreshCounter:
  6. def __init__(self, initial_size=8):
  7. assert initial_size != 0
  8. assert initial_size & (initial_size - 1) == 0
  9. self.mem = Pool()
  10. self.c_map = <MapStruct*>self.mem.alloc(1, sizeof(MapStruct))
  11. map_init(self.mem, self.c_map, initial_size)
  12. self.smoother = None
  13. self.total = 0
  14. property length:
  15. def __get__(self):
  16. return self.c_map.length
  17. def __len__(self):
  18. return self.c_map.length
  19. def __iter__(self):
  20. cdef int i
  21. for i in range(self.c_map.length):
  22. if self.c_map.cells[i].key != 0:
  23. yield (self.c_map.cells[i].key, <count_t>self.c_map.cells[i].value)
  24. def __getitem__(self, key_t key):
  25. return <count_t>map_get(self.c_map, key)
  26. cpdef int inc(self, key_t key, count_t inc) except -1:
  27. cdef count_t c = <count_t>map_get(self.c_map, key)
  28. c += inc
  29. map_set(self.mem, self.c_map, key, <void*>c)
  30. self.total += inc
  31. return c
  32. def prob(self, key_t key):
  33. cdef GaleSmoother smoother
  34. cdef void* value = map_get(self.c_map, key)
  35. if self.smoother is not None:
  36. smoother = self.smoother
  37. r_star = self.smoother(<count_t>value)
  38. return r_star / self.smoother.total
  39. elif value == NULL:
  40. return 0
  41. else:
  42. return <count_t>value / self.total
  43. def smooth(self):
  44. self.smoother = GaleSmoother(self)
  45. cdef class GaleSmoother:
  46. cdef Pool mem
  47. cdef count_t* Nr
  48. cdef double gradient
  49. cdef double intercept
  50. cdef readonly count_t cutoff
  51. cdef count_t Nr0
  52. cdef readonly double total
  53. def __init__(self, PreshCounter counts):
  54. count_counts = PreshCounter()
  55. cdef double total = 0
  56. for _, count in counts:
  57. count_counts.inc(count, 1)
  58. total += count
  59. # If we have no items seen 1 or 2 times, this doesn't work. But, this
  60. # won't be true in real data...
  61. assert count_counts[1] != 0 and count_counts[2] != 0, "Cannot smooth your weird data"
  62. # Extrapolate Nr0 from Nr1 and Nr2.
  63. self.Nr0 = count_counts[1] + (count_counts[1] - count_counts[2])
  64. self.mem = Pool()
  65. cdef double[2] mb
  66. cdef int n_counts = 0
  67. for _ in count_counts:
  68. n_counts += 1
  69. sorted_r = <count_t*>count_counts.mem.alloc(n_counts, sizeof(count_t))
  70. self.Nr = <count_t*>self.mem.alloc(n_counts, sizeof(count_t))
  71. for i, (count, count_count) in enumerate(sorted(count_counts)):
  72. sorted_r[i] = count
  73. self.Nr[i] = count_count
  74. _fit_loglinear_model(mb, sorted_r, self.Nr, n_counts)
  75. self.cutoff = _find_when_to_switch(sorted_r, self.Nr, mb[0], mb[1],
  76. n_counts)
  77. self.gradient = mb[0]
  78. self.intercept = mb[1]
  79. self.total = self(0) * self.Nr0
  80. for count, count_count in count_counts:
  81. self.total += self(count) * count_count
  82. def __call__(self, count_t r):
  83. if r == 0:
  84. return self.Nr[1] / self.Nr0
  85. elif r < self.cutoff:
  86. return turing_estimate_of_r(<double>r, <double>self.Nr[r-1], <double>self.Nr[r])
  87. else:
  88. return gale_estimate_of_r(<double>r, self.gradient, self.intercept)
  89. def count_count(self, count_t r):
  90. if r == 0:
  91. return self.Nr0
  92. else:
  93. return self.Nr[r-1]
  94. @cython.cdivision(True)
  95. cdef double turing_estimate_of_r(double r, double Nr, double Nr1) except -1:
  96. return ((r + 1) * Nr1) / Nr
  97. @cython.cdivision(True)
  98. cdef double gale_estimate_of_r(double r, double gradient, double intercept) except -1:
  99. cdef double e_nr = exp(gradient * log(r) + intercept)
  100. cdef double e_nr1 = exp(gradient * log(r+1) + intercept)
  101. return (r + 1) * (e_nr1 / e_nr)
  102. @cython.cdivision(True)
  103. cdef void _fit_loglinear_model(double* output, count_t* sorted_r, count_t* Nr,
  104. int length) except *:
  105. cdef double x_mean = 0.0
  106. cdef double y_mean = 0.0
  107. cdef Pool mem = Pool()
  108. x = <double*>mem.alloc(length, sizeof(double))
  109. y = <double*>mem.alloc(length, sizeof(double))
  110. cdef int i
  111. for i in range(length):
  112. r = sorted_r[i]
  113. x[i] = log(<double>r)
  114. y[i] = log(<double>_get_zr(i, sorted_r, Nr[i], length))
  115. x_mean += x[i]
  116. y_mean += y[i]
  117. x_mean /= length
  118. y_mean /= length
  119. cdef double ss_xy = 0.0
  120. cdef double ss_xx = 0.0
  121. for i in range(length):
  122. x_dist = x[i] - x_mean
  123. y_dist = y[i] - y_mean
  124. # SS_xy = sum the product of the distances from the mean
  125. ss_xy += x_dist * y_dist
  126. # SS_xx = sum the squares of the x distance
  127. ss_xx += x_dist * x_dist
  128. # Gradient
  129. output[0] = ss_xy / ss_xx
  130. # Intercept
  131. output[1] = y_mean - output[0] * x_mean
  132. @cython.cdivision(True)
  133. cdef double _get_zr(int j, count_t* sorted_r, count_t Nr_j, int n_counts) except -1:
  134. cdef double r_i = sorted_r[j-1] if j >= 1 else 0
  135. cdef double r_j = sorted_r[j]
  136. cdef double r_k = sorted_r[j+1] if (j+1) < n_counts else (2 * r_i - 1)
  137. return 2 * Nr_j / (r_k - r_i)
  138. @cython.cdivision(True)
  139. cdef double _variance(double r, double Nr, double Nr1) nogil:
  140. return 1.96 * sqrt((r+1)**2 * (Nr1 / Nr**2) * (1.0 + (Nr1 / Nr)))
  141. @cython.cdivision(True)
  142. cdef count_t _find_when_to_switch(count_t* sorted_r, count_t* Nr, double m, double b,
  143. int length) except -1:
  144. cdef int i
  145. cdef count_t r
  146. for i in range(length-1):
  147. r = sorted_r[i]
  148. if sorted_r[i+1] != r+1:
  149. return r
  150. g_r = gale_estimate_of_r(r, m, b)
  151. t_r = turing_estimate_of_r(<double>r, <double>Nr[i], <double>Nr[i+1])
  152. if abs(t_r - g_r) <= _variance(<double>r, <double>Nr[i], <double>Nr[i+1]):
  153. return r
  154. else:
  155. return length - 1