190 lines
6 KiB
Cython
190 lines
6 KiB
Cython
"""Count occurrences of uint64-valued keys."""
|
|
from __future__ import division
|
|
cimport cython
|
|
from libc.math cimport log, exp, sqrt
|
|
|
|
|
|
cdef class PreshCounter:
    """Counter over integer keys, backed by a C hash map.

    The map is allocated through ``self.mem``. Iteration skips cells whose
    key is 0. A smoother can be attached with ``smooth()``; once attached,
    ``prob()`` returns smoothed estimates.
    """

    def __init__(self, initial_size=8):
        """Create an empty counter.

        ``initial_size`` is handed to ``map_init`` and must be a non-zero
        power of two.
        """
        assert initial_size != 0
        # x & (x - 1) clears the lowest set bit, so the result is zero only
        # when a single bit was set, i.e. for a power of two.
        assert initial_size & (initial_size - 1) == 0
        self.total = 0
        self.smoother = None
        self.mem = Pool()
        self.c_map = <MapStruct*>self.mem.alloc(1, sizeof(MapStruct))
        map_init(self.mem, self.c_map, initial_size)

    property length:
        # Exposes the `length` field of the underlying map.
        def __get__(self):
            return self.c_map.length

    def __len__(self):
        """Return the `length` field of the underlying map."""
        return self.c_map.length

    def __iter__(self):
        """Yield ``(key, count)`` for every cell whose key is non-zero."""
        cdef int idx
        for idx in range(self.c_map.length):
            if self.c_map.cells[idx].key == 0:
                continue
            yield (self.c_map.cells[idx].key, <count_t>self.c_map.cells[idx].value)

    def __getitem__(self, key_t key):
        """Return the value stored for ``key``, reinterpreted as a count."""
        cdef void* stored = map_get(self.c_map, key)
        return <count_t>stored

    cpdef int inc(self, key_t key, count_t inc) except -1:
        """Add ``inc`` to the count for ``key`` and to the running total.

        Returns the updated count for ``key``.
        """
        # The count itself is stored in the map's pointer-sized value slot.
        cdef count_t updated = (<count_t>map_get(self.c_map, key)) + inc
        map_set(self.mem, self.c_map, key, <void*>updated)
        self.total += inc
        return updated

    def prob(self, key_t key):
        """Return the estimated probability of ``key``.

        With a smoother attached the result is the smoothed count divided by
        the smoother's total. Otherwise it is 0 when the map holds NULL for
        the key, and ``count / self.total`` in every other case.
        """
        cdef GaleSmoother smoother
        cdef void* value = map_get(self.c_map, key)
        if self.smoother is not None:
            smoother = self.smoother
            return smoother(<count_t>value) / smoother.total
        if value == NULL:
            return 0
        return <count_t>value / self.total

    def smooth(self):
        """Build a GaleSmoother from the current counts and attach it."""
        self.smoother = GaleSmoother(self)
|
|
|
|
|
|
cdef class GaleSmoother:
    """Smoothed count estimates derived from a PreshCounter.

    Calling the instance with a raw count ``r`` returns an adjusted count:
    the Turing estimate for ``0 < r < cutoff`` and the log-linear (Gale)
    estimate from ``cutoff`` upwards. ``total`` is the sum of adjusted
    counts over all observed items plus the extrapolated unseen ones, and is
    used by ``PreshCounter.prob`` as the normaliser.
    """
    # Pool through which the Nr array is allocated.
    cdef Pool mem
    # Nr[i] is the number of keys whose count equals the i-th smallest
    # distinct count. The constructor requires count 1 to be present, so
    # Nr[0] is N_1; Nr[r-1] is N_r while the counts are contiguous from 1.
    cdef count_t* Nr
    # Slope and intercept of the log-linear fit.
    cdef double gradient
    cdef double intercept
    # Value returned by _find_when_to_switch; counts below it use the
    # Turing estimate.
    cdef readonly count_t cutoff
    # Extrapolated number of unseen items (N_0).
    cdef count_t Nr0
    # Sum of adjusted counts, including the unseen mass.
    cdef readonly double total

    def __init__(self, PreshCounter counts):
        """Fit the smoother to ``counts``.

        Raises AssertionError when no key was seen exactly once or no key
        was seen exactly twice.
        """
        cdef double[2] mb
        cdef int n_counts = 0
        cdef count_t* sorted_r
        # Counts-of-counts: maps a count r to the number of keys seen r times.
        cdef PreshCounter count_counts = PreshCounter()
        for _, count in counts:
            count_counts.inc(count, 1)
        # If we have no items seen 1 or 2 times, this doesn't work. But, this
        # won't be true in real data...
        assert count_counts[1] != 0 and count_counts[2] != 0, "Cannot smooth your weird data"
        # Extrapolate Nr0 from Nr1 and Nr2.
        self.Nr0 = count_counts[1] + (count_counts[1] - count_counts[2])
        self.mem = Pool()

        # len(count_counts) reports the map's `length` field, so the number
        # of populated entries is obtained by iterating.
        for _ in count_counts:
            n_counts += 1
        # sorted_r is only needed while fitting, so it is allocated through
        # the temporary counter's pool; Nr is kept and goes through self.mem.
        sorted_r = <count_t*>count_counts.mem.alloc(n_counts, sizeof(count_t))
        self.Nr = <count_t*>self.mem.alloc(n_counts, sizeof(count_t))
        for i, (count, count_count) in enumerate(sorted(count_counts)):
            sorted_r[i] = count
            self.Nr[i] = count_count

        _fit_loglinear_model(mb, sorted_r, self.Nr, n_counts)

        self.cutoff = _find_when_to_switch(sorted_r, self.Nr, mb[0], mb[1],
                                           n_counts)
        self.gradient = mb[0]
        self.intercept = mb[1]
        # Adjusted mass of the unseen items, then of every observed count.
        self.total = self(0) * self.Nr0
        for count, count_count in count_counts:
            self.total += self(count) * count_count

    def __call__(self, count_t r):
        """Return the adjusted count for a raw count ``r``."""
        if r == 0:
            # Turing estimate for r = 0 is N_1 / N_0. Nr is zero-based, so
            # N_1 lives in Nr[0] (Nr[1] would be N_2).
            return self.Nr[0] / self.Nr0
        elif r < self.cutoff:
            return turing_estimate_of_r(<double>r, <double>self.Nr[r-1], <double>self.Nr[r])
        else:
            return gale_estimate_of_r(<double>r, self.gradient, self.intercept)

    def count_count(self, count_t r):
        """Return the stored count-of-counts for ``r`` (Nr0 when r == 0).

        NOTE(review): the lookup is a raw ``Nr[r-1]`` read with no bounds
        check, so it is only meaningful while the distinct counts are
        contiguous from 1 up to ``r``.
        """
        if r == 0:
            return self.Nr0
        else:
            return self.Nr[r-1]
|
|
|
|
|
|
@cython.cdivision(True)
cdef double turing_estimate_of_r(double r, double Nr, double Nr1) except -1:
    # Turing estimate of the adjusted count: (r + 1) * N_{r+1} / N_r, where
    # Nr is the number of items seen r times and Nr1 the number seen r + 1
    # times. C division is in effect, so Nr == 0 is not trapped.
    cdef double scaled_next = (r + 1) * Nr1
    return scaled_next / Nr
|
|
|
|
|
|
@cython.cdivision(True)
cdef double gale_estimate_of_r(double r, double gradient, double intercept) except -1:
    # Adjusted count from the log-linear model log(N_r) = gradient * log(r)
    # + intercept: (r + 1) * E[N_{r+1}] / E[N_r], using the fitted values in
    # place of the observed counts-of-counts.
    cdef double next_r = r + 1
    cdef double fitted_nr = exp(gradient * log(r) + intercept)
    cdef double fitted_nr1 = exp(gradient * log(next_r) + intercept)
    return next_r * (fitted_nr1 / fitted_nr)
|
|
|
|
|
|
@cython.cdivision(True)
cdef void _fit_loglinear_model(double* output, count_t* sorted_r, count_t* Nr,
                               int length) except *:
    # Least-squares fit of log(Z_r) against log(r) over the `length` distinct
    # counts in sorted_r. Writes the slope to output[0] and the intercept to
    # output[1]. Z_r comes from _get_zr.
    cdef int i
    cdef double x_mean = 0.0
    cdef double y_mean = 0.0
    cdef double ss_xy = 0.0
    cdef double ss_xx = 0.0
    cdef double dx
    cdef double dy
    cdef double slope

    # Scratch arrays for the log-transformed points; allocated through a
    # local Pool.
    cdef Pool scratch = Pool()
    cdef double* log_r = <double*>scratch.alloc(length, sizeof(double))
    cdef double* log_z = <double*>scratch.alloc(length, sizeof(double))

    # First pass: transform every point and accumulate the sums for the means.
    for i in range(length):
        log_r[i] = log(<double>sorted_r[i])
        log_z[i] = log(_get_zr(i, sorted_r, Nr[i], length))
        x_mean += log_r[i]
        y_mean += log_z[i]

    x_mean /= length
    y_mean /= length

    # Second pass: sums of products / squares of the deviations from the mean.
    for i in range(length):
        dx = log_r[i] - x_mean
        dy = log_z[i] - y_mean
        ss_xy += dx * dy
        ss_xx += dx * dx

    slope = ss_xy / ss_xx
    # Gradient
    output[0] = slope
    # Intercept
    output[1] = y_mean - slope * x_mean
|
|
|
|
|
|
@cython.cdivision(True)
cdef double _get_zr(int j, count_t* sorted_r, count_t Nr_j, int n_counts) except -1:
    # Averaged count-of-counts Z_r for the j-th distinct count:
    #     Z = 2 * N_j / (r_k - r_i)
    # where r_i and r_k are the neighbouring distinct counts below and above
    # r_j in sorted_r.
    #
    # j         -- index into sorted_r
    # sorted_r  -- distinct counts in ascending order
    # Nr_j      -- number of items whose count is sorted_r[j]
    # n_counts  -- number of entries in sorted_r
    cdef double r_i
    cdef double r_j = sorted_r[j]
    cdef double r_k
    # The first entry has no lower neighbour; use 0.
    if j >= 1:
        r_i = sorted_r[j-1]
    else:
        r_i = 0
    if (j + 1) < n_counts:
        r_k = sorted_r[j+1]
    else:
        # The last entry has no upper neighbour: mirror the lower gap around
        # r_j, so that r_k - r_i == 2 * (r_j - r_i). This is always positive,
        # unlike an extrapolation from r_i alone, which yields a zero
        # denominator when r_i == 1.
        r_k = 2 * r_j - r_i
    return 2 * Nr_j / (r_k - r_i)
|
|
|
|
|
|
@cython.cdivision(True)
cdef double _variance(double r, double Nr, double Nr1) nogil:
    # Despite the name, this returns 1.96 times the square root of
    #     (r + 1)^2 * (N_{r+1} / N_r^2) * (1 + N_{r+1} / N_r)
    # which _find_when_to_switch uses as the tolerance when comparing the
    # Turing and Gale estimates.
    cdef double scale = (r+1)**2
    cdef double spread = Nr1 / Nr**2
    cdef double correction = 1.0 + (Nr1 / Nr)
    return 1.96 * sqrt(scale * spread * correction)
|
|
|
|
|
|
@cython.cdivision(True)
cdef count_t _find_when_to_switch(count_t* sorted_r, count_t* Nr, double m, double b,
                                  int length) except -1:
    # Pick the count from which the log-linear estimate is used instead of
    # the Turing estimate.
    #
    # Walks the distinct counts in ascending order and returns the first r
    # for which either
    #   * the next distinct count is not r + 1 (the Turing estimate needs
    #     N_{r+1}), or
    #   * the Turing and log-linear estimates differ by no more than the
    #     tolerance given by _variance.
    # If neither happens for any of the first length - 1 entries, returns
    # length - 1.
    #
    # m and b are the slope and intercept of the log-linear fit.
    cdef int idx
    cdef count_t r
    cdef double smoothed
    cdef double turing
    cdef double tolerance
    for idx in range(length - 1):
        r = sorted_r[idx]
        if sorted_r[idx + 1] != r + 1:
            return r
        smoothed = gale_estimate_of_r(r, m, b)
        turing = turing_estimate_of_r(<double>r, <double>Nr[idx], <double>Nr[idx + 1])
        tolerance = _variance(<double>r, <double>Nr[idx], <double>Nr[idx + 1])
        if abs(turing - smoothed) <= tolerance:
            return r
    # The loop never breaks, so reaching this point means no switch point
    # was found above.
    return length - 1
|