def sample()

in src/ab/plugins/db/sqlite.py [0:0]


    def sample(self, table_name: str, total_count: int):
        '''
        args:
            total_count: total row count of target partitions or whole table
        returns:
            sample_rate, sample_count, sample_data
        '''
        assert total_count > self.max_count, 'system error, total_count must be greater than sampler max_count'

        table_name = Sqlite.escape(table_name)

        # step 1: try to get random self.max_count rows
        # 可以近似推导出当尝试取(2 * self.max_count + 16)行的时候取出来的行数有99.99%的概率(4个标准差)大于self.max_count行
        # 大概推导过程:
        #   假设要从n行的表中取m行。m乘以一个系数k,使得尽可能保证取出来的行数大于m行
        #   每行是否参与采样的概率p = km / n,每行执行一次,共n次,这是个B(km/n, n)的二项分布。
        #   二项分布重复n次,根据中央极限定理,结果总和服从正态分布,期望=km,方差=km * (1 - km / n)
        #   且99.993666%的概率在平均数左右四个标准差的范围内
        #   即k要满足公式:km - 4 * sqrt(km * (1 - km / n)) >= m
        #   当 n -> ∞,可得解:k >= (2 + 16 / m),即 mk >= 2m + 16
        mk = 2 * self.max_count + 16
        rand = (total_count - mk) / total_count  # rand < 0 is ok
        sql = 'SELECT * FROM {table_name} WHERE random() > {rand}'.format(table_name=table_name, rand=rand)
        sample = self.db.table_sql(sql, table_name)
        logger.debug('try to sample {mk} rows'.format(mk=mk))
        logger.debug('run sql:', sql)
        logger.debug('get sample count:', len(sample))

        # step 2: sample self.max_count rows
        if len(sample) > self.max_count:
            sample = random.sample(sample, self.max_count)
        row_count = len(sample)
        return 100.0 * row_count / total_count, row_count, sample