You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

190 lines
6.0 KiB

"""Count occurrences of uint64-valued keys."""
from __future__ import division
cimport cython
from libc.math cimport log, exp, sqrt
cdef class PreshCounter:
def __init__(self, initial_size=8):
assert initial_size != 0
assert initial_size & (initial_size - 1) == 0
self.mem = Pool()
self.c_map = <MapStruct*>self.mem.alloc(1, sizeof(MapStruct))
map_init(self.mem, self.c_map, initial_size)
self.smoother = None
self.total = 0
property length:
def __get__(self):
return self.c_map.length
def __len__(self):
return self.c_map.length
def __iter__(self):
cdef int i
for i in range(self.c_map.length):
if self.c_map.cells[i].key != 0:
yield (self.c_map.cells[i].key, <count_t>self.c_map.cells[i].value)
def __getitem__(self, key_t key):
return <count_t>map_get(self.c_map, key)
cpdef int inc(self, key_t key, count_t inc) except -1:
cdef count_t c = <count_t>map_get(self.c_map, key)
c += inc
map_set(self.mem, self.c_map, key, <void*>c)
self.total += inc
return c
def prob(self, key_t key):
cdef GaleSmoother smoother
cdef void* value = map_get(self.c_map, key)
if self.smoother is not None:
smoother = self.smoother
r_star = self.smoother(<count_t>value)
return r_star / self.smoother.total
elif value == NULL:
return 0
else:
return <count_t>value / self.total
def smooth(self):
self.smoother = GaleSmoother(self)
cdef class GaleSmoother:
cdef Pool mem
cdef count_t* Nr
cdef double gradient
cdef double intercept
cdef readonly count_t cutoff
cdef count_t Nr0
cdef readonly double total
def __init__(self, PreshCounter counts):
count_counts = PreshCounter()
cdef double total = 0
for _, count in counts:
count_counts.inc(count, 1)
total += count
# If we have no items seen 1 or 2 times, this doesn't work. But, this
# won't be true in real data...
assert count_counts[1] != 0 and count_counts[2] != 0, "Cannot smooth your weird data"
# Extrapolate Nr0 from Nr1 and Nr2.
self.Nr0 = count_counts[1] + (count_counts[1] - count_counts[2])
self.mem = Pool()
cdef double[2] mb
cdef int n_counts = 0
for _ in count_counts:
n_counts += 1
sorted_r = <count_t*>count_counts.mem.alloc(n_counts, sizeof(count_t))
self.Nr = <count_t*>self.mem.alloc(n_counts, sizeof(count_t))
for i, (count, count_count) in enumerate(sorted(count_counts)):
sorted_r[i] = count
self.Nr[i] = count_count
_fit_loglinear_model(mb, sorted_r, self.Nr, n_counts)
self.cutoff = _find_when_to_switch(sorted_r, self.Nr, mb[0], mb[1],
n_counts)
self.gradient = mb[0]
self.intercept = mb[1]
self.total = self(0) * self.Nr0
for count, count_count in count_counts:
self.total += self(count) * count_count
def __call__(self, count_t r):
if r == 0:
return self.Nr[1] / self.Nr0
elif r < self.cutoff:
return turing_estimate_of_r(<double>r, <double>self.Nr[r-1], <double>self.Nr[r])
else:
return gale_estimate_of_r(<double>r, self.gradient, self.intercept)
def count_count(self, count_t r):
if r == 0:
return self.Nr0
else:
return self.Nr[r-1]
@cython.cdivision(True)
cdef double turing_estimate_of_r(double r, double Nr, double Nr1) except -1:
return ((r + 1) * Nr1) / Nr
@cython.cdivision(True)
cdef double gale_estimate_of_r(double r, double gradient, double intercept) except -1:
cdef double e_nr = exp(gradient * log(r) + intercept)
cdef double e_nr1 = exp(gradient * log(r+1) + intercept)
return (r + 1) * (e_nr1 / e_nr)
@cython.cdivision(True)
cdef void _fit_loglinear_model(double* output, count_t* sorted_r, count_t* Nr,
int length) except *:
cdef double x_mean = 0.0
cdef double y_mean = 0.0
cdef Pool mem = Pool()
x = <double*>mem.alloc(length, sizeof(double))
y = <double*>mem.alloc(length, sizeof(double))
cdef int i
for i in range(length):
r = sorted_r[i]
x[i] = log(<double>r)
y[i] = log(<double>_get_zr(i, sorted_r, Nr[i], length))
x_mean += x[i]
y_mean += y[i]
x_mean /= length
y_mean /= length
cdef double ss_xy = 0.0
cdef double ss_xx = 0.0
for i in range(length):
x_dist = x[i] - x_mean
y_dist = y[i] - y_mean
# SS_xy = sum the product of the distances from the mean
ss_xy += x_dist * y_dist
# SS_xx = sum the squares of the x distance
ss_xx += x_dist * x_dist
# Gradient
output[0] = ss_xy / ss_xx
# Intercept
output[1] = y_mean - output[0] * x_mean
@cython.cdivision(True)
cdef double _get_zr(int j, count_t* sorted_r, count_t Nr_j, int n_counts) except -1:
cdef double r_i = sorted_r[j-1] if j >= 1 else 0
cdef double r_j = sorted_r[j]
cdef double r_k = sorted_r[j+1] if (j+1) < n_counts else (2 * r_i - 1)
return 2 * Nr_j / (r_k - r_i)
@cython.cdivision(True)
cdef double _variance(double r, double Nr, double Nr1) nogil:
return 1.96 * sqrt((r+1)**2 * (Nr1 / Nr**2) * (1.0 + (Nr1 / Nr)))
@cython.cdivision(True)
cdef count_t _find_when_to_switch(count_t* sorted_r, count_t* Nr, double m, double b,
int length) except -1:
cdef int i
cdef count_t r
for i in range(length-1):
r = sorted_r[i]
if sorted_r[i+1] != r+1:
return r
g_r = gale_estimate_of_r(r, m, b)
t_r = turing_estimate_of_r(<double>r, <double>Nr[i], <double>Nr[i+1])
if abs(t_r - g_r) <= _variance(<double>r, <double>Nr[i], <double>Nr[i+1]):
return r
else:
return length - 1