def sample()

in src/ab/plugins/db/odps_helper.py [0:0]


    def sample(self, table_name: str, column_names: list, partitions: list, total_count: int):
        """
        Build (and debug-log) the ODPS SQL used to sample rows from a table.

        The inner query selects the requested columns and, per group of
        ``self.column_name``, adds a ``cluster_sample(ratio, 1)`` flag and a
        ``row_number()``. The outer query keeps only flagged rows, orders them
        by that row number and limits the result to ``self.max_count`` rows.

        args:
            table_name: table name, interpolated into the ``from`` clause
            column_names: column names, joined with ', ' to form the select list
            partitions: passed to ``ODPS.join_partitions``; when that yields a
                falsy condition no ``where`` clause is emitted
            total_count: total row count of target partitions or whole table;
                must not be None and must be greater than ``self.max_count``
        returns:
            sample_data
            NOTE(review): the statements that run the query and return its
            result are not visible in this part of the method - confirm.
        raises:
            AlgorithmException: when ``total_count`` is None
            AssertionError: when ``total_count`` is not greater than
                ``self.max_count``
        """
        if total_count is None:
            raise AlgorithmException(data='选择的表过大,仅支持最旧采样')

        # Internal invariant. Raised explicitly instead of via `assert` so the
        # check is not stripped under `python -O` (ratio would otherwise be 0);
        # the exception type and message are the same as before.
        if not (total_count > self.max_count):
            raise AssertionError('system error, total_count must be greater than sampler max_count')

        fields = ', '.join(column_names)
        condition = ODPS.join_partitions(partitions)
        where = 'where {condition}'.format(condition=condition) if condition else ''

        # Integer ratio of available rows to wanted rows; >= 1 thanks to the
        # invariant above. Presumably cluster_sample(ratio, 1) keeps about one
        # row out of every `ratio` per group - confirm against MaxCompute docs.
        ratio = total_count // self.max_count

        # NOTE(review): table/column names are interpolated directly into the
        # SQL text; confirm that callers only pass trusted identifiers.
        sql = '''select {fields} from 
                        (select 
                            {fields}, 
                            cluster_sample({ratio}, 1) over (partition by {self.column_name}) as _column_variety_random_sampler_flag,
                            row_number() over (partition by {self.column_name}) as _column_variety_random_sampler_row_number
                            from {table_name}
                            {where}
                            ) a
                  where _column_variety_random_sampler_flag = true
                  order by _column_variety_random_sampler_row_number
                  limit {self.max_count}
                '''.format(fields=fields, ratio=ratio, self=self, table_name=table_name, where=where)

        # The message needs a %s placeholder: logging applies `msg % args`
        # lazily, and an argument without a placeholder makes formatting fail,
        # so the SQL would never reach the log.
        logger.debug('sample sql: %s', sql)