in src/ab/plugins/db/odps_helper.py [0:0]
def sample(self, table_name: str, column_names: list, partitions: list, total_count: int):
    """
    Run a sampling query against a table and return at most self.max_count rows.

    The query filters rows with a `sample(ratio) = true` predicate, where
    ratio = total_count // self.max_count, optionally AND-ed with the
    partition filter built by ODPS.join_partitions, and is capped with
    `limit self.max_count`.

    args:
        table_name: name of the table to query
        column_names: column names to select; joined with ', ' into the field list
        partitions: partition specs handed to ODPS.join_partitions to build the partition filter
        total_count: total row count of target partitions or whole table
    returns:
        sample_data: whatever self.db.table_sql returns for the generated sql
    raises:
        AlgorithmException: if total_count is None
        AssertionError: if total_count is not greater than self.max_count
    """
    if total_count is None:
        raise AlgorithmException(data='选择的表过大,仅支持最旧采样')
    # Raised explicitly (rather than via `assert`) so the check is not stripped
    # under `python -O`; without it ratio below could become 0.
    if not total_count > self.max_count:
        raise AssertionError('system error, total_count must be greater than sampler max_count')

    partition_condition = ODPS.join_partitions(partitions)
    ratio = total_count // self.max_count
    sample_condition = 'sample({ratio}) = true'.format(ratio=ratio)
    if partition_condition:
        condition = '({condition}) and {sample_condition}'.format(
            condition=partition_condition, sample_condition=sample_condition)
    else:
        condition = sample_condition

    # The sample predicate is always present, so the where clause is never empty.
    fields = ', '.join(column_names)
    sql = 'select {fields} from {table_name} where {condition} limit {max_count}'.format(
        fields=fields, table_name=table_name, condition=condition, max_count=self.max_count)
    # Lazy %-style argument: the original passed sql without a placeholder,
    # which makes the logging module fail to format the record.
    logger.debug('sample sql: %s', sql)
    return self.db.table_sql(sql, table_name, column_names)