in src/ab/plugins/db/odps_helper.py [0:0]
def sample(self, table_name: str, column_names: list, partitions: list, total_count: int):
    """
    Tail-sample a table: select the rows whose generated row number is
    greater than ``total_count - self.max_count``.

    args:
        table_name: table to read from; interpolated into the SQL text as-is
        column_names: columns to select; joined with ', ' and interpolated as-is
        partitions: handed to ODPS.join_partitions to build the optional
            WHERE condition of the inner query
        total_count: total row count of target partitions or whole table
    returns:
        sample_data, i.e. whatever self.db.table_sql returns for the
        generated query
    raises:
        AlgorithmException: if total_count is None
        AssertionError: if total_count is not greater than self.max_count
    """
    if total_count is None:
        raise AlgorithmException(data='选择的表过大,仅支持最旧采样')
    # Internal invariant rather than user-input validation: with
    # total_count <= max_count the row-number threshold below would be <= 0
    # and the query would return every row instead of a sample.
    assert total_count > self.max_count, 'system error, total_count must be greater than sampler max_count'

    condition = ODPS.join_partitions(partitions)
    # Only emit a WHERE clause when join_partitions produced a non-empty condition.
    inner_where = ' where {condition}'.format(condition=condition) if condition else ''
    fields = ', '.join(column_names)

    # NOTE(review): table_name and column_names are formatted straight into
    # the statement, so they must be trusted identifiers -- confirm at callers.
    # The window has no ORDER BY, so which rows form the "tail" follows
    # whatever row order the engine assigns.
    # The continuation lines of the literal are kept flush-left so the emitted
    # SQL text stays exactly as before.
    sql = """select {fields} from
(select {fields}, row_number() over (partition by 1) as _xlab_tail_sampling_row_number from {table_name}{inner_where}) a
where _xlab_tail_sampling_row_number > {rn}""".format(
        fields=fields, table_name=table_name, inner_where=inner_where, rn=total_count - self.max_count
    )
    # The message needs a %s placeholder: logging applies `msg % args` lazily,
    # and an argument without a placeholder makes the record fail to format
    # instead of printing the SQL.
    logger.debug('sample sql: %s', sql)
    return self.db.table_sql(sql, table_name, column_names)