in src/ab/plugins/db/odps_helper.py [0:0]
def sample(self, table_name, partitions=None):
    """
    Sample rows from table_name, defaulting to its max partition.

    args:
        table_name: name of the table to sample
        partitions: list of partition specs, e.g. ['p1=v1/p2=v2']; when
            None, the max partition of the table is used, or the whole
            table if it is not partitioned
        self.sampler.max_count: rows to be returned at most; a table (or
            partition set) with no more rows than this is returned in full
    returns:
        a (sample_rate, sample_count, sample_data) tuple
        sample_rate: percentage of the counted rows that is returned;
            100 when nothing had to be sampled, None when the total row
            count is not available
        sample_count: number of rows in sample_data
        sample_data: the rows themselves
    """
    column_names = self.column_names(table_name)
    if partitions is None:
        max_pt = self.max_partition(table_name)
        if max_pt == '':
            # no data
            return 100, 0, []
        if max_pt is not None:
            # use max_pt as default
            partitions = [max_pt]
        # otherwise: not a partitioned table, partitions stays None (full scan)

    total_count = self.count(table_name, partitions)
    max_count = self.sampler.max_count
    # TODO is max_count common among samplers?
    if total_count is not None and total_count <= max_count:
        # return full set, no sampling
        partition_condition = ODPS.join_partitions(partitions)
        where = (' where ' + partition_condition) if partition_condition else ''
        sql = 'select * from {table_name}{where}'.format(table_name=table_name, where=where)
        logger.debug('total_count: %s, max_count: %s', total_count, max_count)
        # logging formats lazily with %-style args, so the placeholder is
        # required for the sql to actually appear in the message
        logger.debug('no need to sample, run sql: %s', sql)
        return 100, total_count, self.table_sql(sql, table_name, column_names)

    sample_data = self.sampler.sample(table_name, column_names, partitions, total_count)
    sample_count = len(sample_data)
    # total_count may be None (unknown) here; the rate cannot be computed then
    sample_rate = 100.0 * sample_count / total_count if total_count else None
    return sample_rate, sample_count, sample_data