def sample()

in src/ab/plugins/db/odps_helper.py [0:0]


    def sample(self, table_name, partitions=None):
        """
        Return a sample of the rows of ``table_name``.

        When ``partitions`` is not given, the partition reported by
        ``self.max_partition`` is used; a non-partitioned table is scanned in
        full. If the selected rows number at most ``self.sampler.max_count``
        they are all returned without sampling, otherwise the work is
        delegated to ``self.sampler.sample``.

        args:
            table_name: name of the table to sample
            partitions: list of partition specs, e.g. ['p1=v1/p2=v2'];
                None means "fall back to the max partition"

        returns:
            a 3-tuple (sample_rate, sample_count, sample_data)
            sample_rate: percentage of the selected rows that is returned
                (100 for an empty table or a full, unsampled result); None
                when the total row count is unavailable (presumably because
                the table is too large to count -- TODO confirm in self.count)
            sample_count: number of rows returned
            sample_data: the returned rows
        """
        column_names = self.column_names(table_name)

        if partitions is None:
            max_pt = self.max_partition(table_name)
            if max_pt == '':
                # no data
                return 100, 0, []
            if max_pt is not None:
                # use max_pt as default
                partitions = [max_pt]
            # otherwise: not a partitioned table, partitions stays None
            # and the whole table is scanned

        total_count = self.count(table_name, partitions)
        max_count = self.sampler.max_count
        # TODO is max_count common among samplers?
        if total_count is not None and total_count <= max_count:
            # small enough: return the full set, no sampling
            partition_condition = ODPS.join_partitions(partitions)
            where = (' where ' + partition_condition) if partition_condition else ''
            sql = 'select * from {table_name}{where}'.format(table_name=table_name, where=where)

            # Lazy %-style arguments: the message is only rendered when DEBUG
            # is enabled, and every argument has a matching placeholder (a
            # missing placeholder makes logging report a formatting error).
            logger.debug('total_count: %s, max_count: %s', total_count, max_count)
            logger.debug('no need to sample, run sql: %s', sql)
            return 100, total_count, self.table_sql(sql, table_name, column_names)

        sample = self.sampler.sample(table_name, column_names, partitions, total_count)
        sample_count = len(sample)
        # A falsy total_count (unknown or zero) makes the rate undefined.
        sample_rate = 100.0 * sample_count / total_count if total_count else None
        return sample_rate, sample_count, sample