in src/ab/plugins/db/sqlite.py [0:0]
def sample(self, table_name: str, total_count: int):
'''
args:
total_count: total row count of target partitions or whole table
returns:
sample_rate, sample_count, sample_data
'''
assert total_count > self.max_count, 'system error, total_count must be greater than sampler max_count'
table_name = Sqlite.escape(table_name)
# step 1: try to get random self.max_count rows
# 可以近似推导出当尝试取(2 * self.max_count + 16)行的时候取出来的行数有99.99%的概率(4个标准差)大于self.max_count行
# 大概推导过程:
# 假设要从n行的表中取m行。m乘以一个系数k,使得尽可能保证取出来的行数大于m行
# 每行是否参与采样的概率p = km / n,每行执行一次,共n次,这是个B(km/n, n)的二项分布。
# 二项分布重复n次,根据中央极限定理,结果总和服从正态分布,期望=km,方差=km * (1 - km / n)
# 且99.993666%的概率在平均数左右四个标准差的范围内
# 即k要满足公式:km - 4 * sqrt(km * (1 - km / n)) >= m
# 当 n -> ∞,可得解:k >= (2 + 16 / m),即 mk >= 2m + 16
mk = 2 * self.max_count + 16
rand = (total_count - mk) / total_count # rand < 0 is ok
sql = 'SELECT * FROM {table_name} WHERE random() > {rand}'.format(table_name=table_name, rand=rand)
sample = self.db.table_sql(sql, table_name)
logger.debug('try to sample {mk} rows'.format(mk=mk))
logger.debug('run sql:', sql)
logger.debug('get sample count:', len(sample))
# step 2: sample self.max_count rows
if len(sample) > self.max_count:
sample = random.sample(sample, self.max_count)
row_count = len(sample)
return 100.0 * row_count / total_count, row_count, sample