in odps/df/expr/collections.py [0:0]
def sample(expr, parts=None, columns=None, i=None, n=None, frac=None, replace=False,
           weights=None, strata=None, random_state=None):
    """
    Sample collection.

    Three sampling modes are available: hash-based sampling (``parts``),
    count-based sampling (``n``) and fraction-based sampling (``frac``).
    ``parts`` cannot be combined with ``n`` or ``frac``. ``n`` and ``frac``
    can only be combined for stratified sampling (``strata``), where each of
    them covers a different set of strata.

    :param expr: collection
    :param parts: how many parts to hash
    :param columns: the columns to sample
    :param i: the part to sample out, can be a list of parts, must be from 0 to parts-1.
        Defaults to part 0 when `parts` is given
    :param n: how many rows to sample. If `strata` is specified, `n` should be a dict (or a
        k-v string) with values in the strata column as dictionary keys and corresponding
        sample size as values
    :param frac: how many fraction to sample, between 0 and 1. If `strata` is specified,
        `frac` should be a dict (or a k-v string) with values in the strata column as
        dictionary keys and corresponding sample fraction as values
    :param replace: whether to perform replace sampling
    :param weights: the column name of weights, cannot be used together with `strata`
    :param strata: the name of strata column
    :param random_state: the random seed when performing sampling
    :return: collection, or None when `expr` is not a collection
    :raises ExpressionError: when the combination or the values of the arguments are invalid
    :raises ValueError: when `frac` is below 0.01 for plain sampling (no replacement, weights
        or strata) while ``options.df.use_xflow_sample`` is disabled

    Note that n, frac, replace, weights, strata and random_state can only be used under
    Pandas DataFrames or XFlow.

    :Example:

    Sampling with parts:

    >>> df.sample(parts=1)
    >>> df.sample(parts=5, i=0)
    >>> df.sample(parts=10, columns=['name'])

    Sampling with count or fraction, replacement option can be specified:

    >>> df.sample(n=100)
    >>> df.sample(frac=0.1)
    >>> df.sample(frac=0.1, replace=True)

    Sampling with weight column:

    >>> df.sample(n=100, weights='weight_col')
    >>> df.sample(n=100, weights='weight_col', replace=True)

    Stratified sampling. Note that currently we do not support stratified sampling with replacement.

    >>> df.sample(strata='category', frac={'Iris Setosa': 0.5, 'Iris Versicolour': 0.4})
    """
    if not isinstance(expr, CollectionExpr):
        # Only collections can be sampled here; anything else yields None.
        return None

    if n is None and frac is None and parts is None:
        raise ExpressionError('Either n or frac or parts should be provided')
    if i is not None and parts is None:
        raise ExpressionError('`parts` arg is required when `i` arg is specified')
    # Hash-based sampling is exclusive with count/fraction sampling. `n` together
    # with `frac` is deliberately NOT rejected here: a blanket "more than one
    # argument" test would make the strata-aware checks below unreachable.
    if parts is not None and (n is not None or frac is not None):
        raise ExpressionError('You cannot specify `n` or `frac` or `parts` at the same time')
    if strata is None and n is not None and frac is not None:
        # strata can specify different types of strategies on different columns
        raise ExpressionError('You cannot specify `n` and `frac` at the same time.')
    if weights is not None and strata is not None:
        raise ExpressionError('You cannot specify `weights` and `strata` at the same time.')

    if strata is not None:
        # Stratified sampling: `n` / `frac` map stratum value -> size / fraction,
        # given either as a dict or as a k-v string parsed by `str_to_kv`.
        if frac is not None and not isinstance(frac, (six.string_types, dict)):
            raise ExpressionError('`frac` should be a k-v string or a dictionary object.')
        if isinstance(frac, six.string_types):
            frac = str_to_kv(frac, float)

        if n is not None and not isinstance(n, (six.string_types, dict)):
            raise ExpressionError('`n` should be a k-v string or a dictionary object.')
        if isinstance(n, six.string_types):
            n = str_to_kv(n, int)

        for val in six.itervalues(frac or {}):
            if val < 0 or val > 1:
                raise ExpressionError('Values in `frac` must be between 0 and 1')

        if n is not None and frac is not None:
            # A stratum may be sampled by size or by fraction, but not both.
            # NOTE(review): mixing `n` and `frac` passes validation here; confirm the
            # sampling backend accepts both mappings in a single call.
            collides = set(six.iterkeys(n)).intersection(six.iterkeys(frac))
            if collides:
                raise ExpressionError('Values in `frac` and `n` collides with each other.')
    elif frac is not None and (
            not isinstance(frac, (six.integer_types, float)) or frac < 0 or frac > 1):
        raise ExpressionError('`frac` must be between 0 and 1')

    if parts is not None:
        # Normalise `i` to a tuple of part indices, defaulting to the first part.
        if i is None:
            i = (0, )
        elif isinstance(i, list):
            i = tuple(i)
        elif not isinstance(i, tuple):
            i = (i, )

        for it in i:
            if it >= parts or it < 0:
                raise ExpressionError('`i` should be positive numbers that less than `parts`')
    elif not options.df.use_xflow_sample and not replace and weights is None and strata is None:
        # Plain count/fraction sampling without XFlow: fractions under 1% are rejected.
        if frac is not None and frac < 0.01:
            raise ValueError(
                "Does not support sampling less than 1%. Try sampling by count or "
                "set options.df.use_xflow_sample to True."
            )
    elif hasattr(expr, '_xflow_sample'):
        # Replacement / weighted / stratified sampling (or XFlow explicitly enabled)
        # is delegated to the XFlow implementation when the collection provides one.
        return expr._xflow_sample(n=n, frac=frac, replace=replace, weights=weights, strata=strata,
                                  random_state=random_state)

    return expr.__sample(parts=parts, columns=columns, i=i, n=n, frac=frac, replace=replace,
                         weights=weights, strata=strata, random_state=random_state)