in src/ab/plugins/db/odps_helper.py [0:0]
def sample(self, table_name: str, column_names: list, partitions: list, total_count: int):
    """
    Build (and debug-log) the SQL that draws a per-group random sample.

    The inner query tags every row of ``table_name`` (optionally restricted
    to ``partitions``) with two window columns, both partitioned by
    ``self.column_name``: a ``cluster_sample({ratio}, 1)`` flag and a
    ``row_number()``. The outer query keeps the flagged rows, orders them by
    that row number and limits the result to ``self.max_count`` rows.

    args:
        table_name: table to sample from
        column_names: columns to select; joined with ', ' into the SQL
        partitions: passed to ``ODPS.join_partitions`` to build the
            ``where`` condition; a falsy result means no ``where`` clause
        total_count: total row count of target partitions or whole table
    returns:
        sample_data
        NOTE(review): the statements visible here only build and log the
        SQL; the code that runs the query is presumably outside this
        view -- confirm.
    raises:
        AlgorithmException: if ``total_count`` is None
        AssertionError: if ``total_count`` is not greater than
            ``self.max_count``
    """
    if total_count is None:
        # Message roughly: "the selected table is too large, only
        # oldest-first sampling is supported".
        raise AlgorithmException(data='选择的表过大,仅支持最旧采样')
    # Raised explicitly instead of via ``assert`` so the check survives
    # ``python -O``; otherwise a ratio of 0 or 1 could reach the SQL below.
    if not total_count > self.max_count:
        raise AssertionError('system error, total_count must be greater than sampler max_count')

    fields = ', '.join(column_names)
    condition = ODPS.join_partitions(partitions)
    where = 'where {condition}'.format(condition=condition) if condition else ''
    # Integer ratio of available rows to wanted rows; always >= 1 here.
    # Presumably cluster_sample(ratio, 1) keeps about 1/ratio of each
    # group's rows -- TODO confirm against the MaxCompute documentation.
    ratio = total_count // self.max_count
    sql = '''select {fields} from
(select
{fields},
cluster_sample({ratio}, 1) over (partition by {self.column_name}) as _column_variety_random_sampler_flag,
row_number() over (partition by {self.column_name}) as _column_variety_random_sampler_row_number
from {table_name}
{where}
) a
where _column_variety_random_sampler_flag = true
order by _column_variety_random_sampler_row_number
limit {self.max_count}
'''.format(fields=fields, ratio=ratio, self=self, table_name=table_name, where=where)
    # Use a %s placeholder: logging %-formats the message with its extra
    # arguments, and a message without a placeholder fails to format.
    logger.debug('sample sql: %s', sql)