def sample()

in odps/df/expr/collections.py [0:0]
53 lines of code
47 McCabe index (conditional complexity)

def sample(expr, parts=None, columns=None, i=None, n=None, frac=None, replace=False,
           weights=None, strata=None, random_state=None):
    """
    Sample collection.

    :param expr: collection
    :param parts: how many parts to hash
    :param columns: the columns to sample
    :param i: the part to sample out, can be a list of parts, must be from 0 to parts-1
    :param n: how many rows to sample. If `strata` is specified, `n` should be a dict (or a k-v string) with values in the strata column as dictionary keys and corresponding sample size as values
    :param frac: how many fraction to sample. If `strata` is specified, `frac` should be a dict (or a k-v string) with values in the strata column as dictionary keys and corresponding sample weight as values
    :param replace: whether to perform replace sampling
    :param weights: the column name of weights
    :param strata: the name of strata column
    :param random_state: the random seed when performing sampling
    :return: collection, or None when `expr` is not a collection
    :raises ExpressionError: when the arguments are missing, mutually exclusive or out of range
    :raises ValueError: when `frac` is below 0.01 for plain sampling (no `replace`, `weights` or
        `strata`) while `options.df.use_xflow_sample` is off

    Exactly one of `parts`, `n` and `frac` must be given, except for stratified sampling, where
    `n` and `frac` may be given together as long as they do not share any strata value.

    Note that n, frac, replace, weights, strata and random_state can only be used under Pandas DataFrames or
    XFlow.

    :Example:

    Sampling with parts:

    >>> df.sample(parts=1)
    >>> df.sample(parts=5, i=0)
    >>> df.sample(parts=10, columns=['name'])

    Sampling with count or fraction, replacement option can be specified:

    >>> df.sample(n=100)
    >>> df.sample(frac=0.1)
    >>> df.sample(frac=0.1, replace=True)

    Sampling with weight column:

    >>> df.sample(n=100, weights='weight_col')
    >>> df.sample(n=100, weights='weight_col', replace=True)

    Stratified sampling. Note that currently we do not support stratified sampling with replacement.

    >>> df.sample(strata='category', frac={'Iris Setosa': 0.5, 'Iris Versicolour': 0.4})
    """
    if not isinstance(expr, CollectionExpr):
        # Only collections are handled here; anything else yields None.
        return None

    if n is None and frac is None and parts is None:
        raise ExpressionError('Either n or frac or parts should be provided')
    if i is not None and parts is None:
        raise ExpressionError('`parts` arg is required when `i` arg is specified')
    # Hash-part sampling never mixes with count / fraction sampling.
    if parts is not None and (n is not None or frac is not None):
        raise ExpressionError('You cannot specify `n` or `frac` or `parts` at the same time')
    if strata is None and n is not None and frac is not None:
        # strata can specify different types of strategies on different columns,
        # so `n` together with `frac` is only rejected for non-stratified sampling
        raise ExpressionError('You cannot specify `n` and `frac` at the same time.')
    if weights is not None and strata is not None:
        raise ExpressionError('You cannot specify `weights` and `strata` at the same time.')

    if strata is not None:
        # Stratified sampling: `frac` / `n` are per-strata mappings, optionally
        # passed as k-v strings that are parsed by str_to_kv.
        if frac is not None and not isinstance(frac, (six.string_types, dict)):
            raise ExpressionError('`frac` should be a k-v string or a dictionary object.')
        if isinstance(frac, six.string_types):
            frac = str_to_kv(frac, float)

        if n is not None and not isinstance(n, (six.string_types, dict)):
            raise ExpressionError('`n` should be a k-v string or a dictionary object.')
        if isinstance(n, six.string_types):
            n = str_to_kv(n, int)

        for val in six.itervalues(frac or dict()):
            if val < 0 or val > 1:
                raise ExpressionError('Values in `frac` must be between 0 and 1')
        if n is not None and frac is not None:
            # A strata value may be sampled by count or by fraction, not both.
            collides = set(six.iterkeys(n)).intersection(set(six.iterkeys(frac)))
            if collides:
                raise ExpressionError('Values in `frac` and `n` collides with each other.')
    else:
        if frac is not None and (not isinstance(frac, (six.integer_types, float)) or frac < 0 or frac > 1):
            raise ExpressionError('`frac` must be between 0 and 1')

    if parts is not None:
        # Normalize `i` into a tuple of part indices; the first part is the default.
        if i is None:
            i = (0, )
        elif isinstance(i, list):
            i = tuple(i)
        elif not isinstance(i, tuple):
            i = (i, )

        for it in i:
            if it >= parts or it < 0:
                raise ExpressionError('`i` should be positive numbers that less than `parts`')
    elif not options.df.use_xflow_sample and not replace and weights is None and strata is None:
        # Plain count / fraction sampling with XFlow sampling switched off.
        if frac is not None and frac < 0.01:
            raise ValueError(
                "Does not support sampling less than 1%. Try sampling by count or "
                "set options.df.use_xflow_sample to True."
            )
    elif hasattr(expr, '_xflow_sample'):
        # Delegate to the XFlow implementation when the expression provides one.
        return expr._xflow_sample(n=n, frac=frac, replace=replace, weights=weights, strata=strata,
                                  random_state=random_state)

    return expr.__sample(parts=parts, columns=columns, i=i, n=n, frac=frac, replace=replace,
                         weights=weights, strata=strata, random_state=random_state)