in src/python/turicreate/data_structures/sframe.py [0:0]
def groupby(self, key_column_names, operations, *args):
"""
Perform a group-by on the key_column_names, followed by aggregations on
the columns listed in operations.
The operations parameter is a dictionary that indicates which
aggregation operators to use and which columns to use them on. The
available operators are SUM, MAX, MIN, COUNT, AVG, VAR, STDV, CONCAT,
SELECT_ONE, ARGMIN, ARGMAX, and QUANTILE. For convenience, aggregators
MEAN, STD, and VARIANCE are available as synonyms for AVG, STDV, and
VAR. See :mod:`~turicreate.aggregate` for more detail on the aggregators.
Parameters
----------
key_column_names : string | list[string]
Column(s) to group by. Key columns can be of any type other than
dictionary.
operations : dict, list
Dictionary of columns and aggregation operations. Each key is an
output column name and each value is an aggregator. This can also
be a list of aggregators, in which case column names will be
automatically assigned.
*args
Any additional positional arguments are interpreted in the same
way as the operations argument.
Returns
-------
out_sf : SFrame
A new SFrame, with a column for each groupby column and each
aggregation operation.
See Also
--------
aggregate
Notes
-----
* Numeric aggregators (such as sum, mean, and stdev) follow a skip-`None`
policy, i.e. they omit all missing values from the aggregation. As an
example, `sum([None, 5, 10]) = 15` because the `None` value is
skipped.
* Aggregators have a default value when no values (after skipping all
`None` values) are present. Default values are `None` for ['ARGMAX',
'ARGMIN', 'AVG', 'STD', 'MEAN', 'MIN', 'MAX'], `0` for ['COUNT',
'COUNT_DISTINCT'], `[]` for ['CONCAT', 'QUANTILE', 'DISTINCT'], and
`{}` for 'FREQ_COUNT'.
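As a minimal sketch of the skip-`None` policy (the SFrame `sf2` and its
columns are hypothetical, separate from the dataset used below):
>>> import turicreate.aggregate as agg
>>> sf2 = turicreate.SFrame({'k': [1, 1, 2], 'v': [None, 5, 10]})
>>> totals = sf2.groupby('k', {'total': agg.SUM('v')})
Following the skip-`None` policy, the `total` for group `1` is `5` (the
`None` value is omitted) and the `total` for group `2` is `10`.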
Examples
--------
Suppose we have an SFrame with movie ratings by many users.
>>> import turicreate.aggregate as agg
>>> url = 'https://static.turi.com/datasets/rating_data_example.csv'
>>> sf = turicreate.SFrame.read_csv(url)
>>> sf
+---------+----------+--------+
| user_id | movie_id | rating |
+---------+----------+--------+
| 25904 | 1663 | 3 |
| 25907 | 1663 | 3 |
| 25923 | 1663 | 3 |
| 25924 | 1663 | 3 |
| 25928 | 1663 | 2 |
| 25933 | 1663 | 4 |
| 25934 | 1663 | 4 |
| 25935 | 1663 | 4 |
| 25936 | 1663 | 5 |
| 25937 | 1663 | 2 |
| ... | ... | ... |
+---------+----------+--------+
[10000 rows x 3 columns]
Compute the number of occurrences of each user.
>>> user_count = sf.groupby(key_column_names='user_id',
... operations={'count': agg.COUNT()})
>>> user_count
+---------+-------+
| user_id | count |
+---------+-------+
| 62361 | 1 |
| 30727 | 1 |
| 40111 | 1 |
| 50513 | 1 |
| 35140 | 1 |
| 42352 | 1 |
| 29667 | 1 |
| 46242 | 1 |
| 58310 | 1 |
| 64614 | 1 |
| ... | ... |
+---------+-------+
[9852 rows x 2 columns]
Compute the mean and standard deviation of ratings per user.
>>> user_rating_stats = sf.groupby(key_column_names='user_id',
... operations={
... 'mean_rating': agg.MEAN('rating'),
... 'std_rating': agg.STD('rating')
... })
>>> user_rating_stats
+---------+-------------+------------+
| user_id | mean_rating | std_rating |
+---------+-------------+------------+
| 62361 | 5.0 | 0.0 |
| 30727 | 4.0 | 0.0 |
| 40111 | 2.0 | 0.0 |
| 50513 | 4.0 | 0.0 |
| 35140 | 4.0 | 0.0 |
| 42352 | 5.0 | 0.0 |
| 29667 | 4.0 | 0.0 |
| 46242 | 5.0 | 0.0 |
| 58310 | 2.0 | 0.0 |
| 64614 | 2.0 | 0.0 |
| ... | ... | ... |
+---------+-------------+------------+
[9852 rows x 3 columns]
Compute the movie with the minimum rating per user.
>>> chosen_movies = sf.groupby(key_column_names='user_id',
... operations={
... 'worst_movies': agg.ARGMIN('rating','movie_id')
... })
>>> chosen_movies
+---------+--------------+
| user_id | worst_movies |
+---------+--------------+
|  62361  |     1663     |
|  30727  |     1663     |
|  40111  |     1663     |
|  50513  |     1663     |
|  35140  |     1663     |
|  42352  |     1663     |
|  29667  |     1663     |
|  46242  |     1663     |
|  58310  |     1663     |
|  64614  |     1663     |
|   ...   |     ...      |
+---------+--------------+
[9852 rows x 2 columns]
Compute the movie with the max rating per user and also the movie with
the maximum imdb-ranking per user.
>>> sf['imdb-ranking'] = sf['rating'] * 10
>>> chosen_movies = sf.groupby(key_column_names='user_id',
...                            operations={
...                                ('max_rating_movie', 'max_imdb_ranking_movie'):
...                                    agg.ARGMAX(('rating', 'imdb-ranking'), 'movie_id')
...                            })
>>> chosen_movies
+---------+------------------+------------------------+
| user_id | max_rating_movie | max_imdb_ranking_movie |
+---------+------------------+------------------------+
| 62361 | 1663 | 16630 |
| 30727 | 1663 | 16630 |
| 40111 | 1663 | 16630 |
| 50513 | 1663 | 16630 |
| 35140 | 1663 | 16630 |
| 42352 | 1663 | 16630 |
| 29667 | 1663 | 16630 |
| 46242 | 1663 | 16630 |
| 58310 | 1663 | 16630 |
| 64614 | 1663 | 16630 |
| ... | ... | ... |
+---------+------------------+------------------------+
[9852 rows x 3 columns]
Compute the movie with the max rating per user.
>>> chosen_movies = sf.groupby(key_column_names='user_id',
...                            operations={'best_movies': agg.ARGMAX('rating', 'movie_id')})
Compute the count, mean, and standard deviation of ratings per (user,
time), automatically assigning output column names.
>>> sf['time'] = sf.apply(lambda x: (x['user_id'] + x['movie_id']) % 11 + 2000)
>>> user_rating_stats = sf.groupby(['user_id', 'time'],
... [agg.COUNT(),
... agg.AVG('rating'),
... agg.STDV('rating')])
>>> user_rating_stats
+------+---------+-------+---------------+----------------+
| time | user_id | Count | Avg of rating | Stdv of rating |
+------+---------+-------+---------------+----------------+
| 2006 | 61285 | 1 | 4.0 | 0.0 |
| 2000 | 36078 | 1 | 4.0 | 0.0 |
| 2003 | 47158 | 1 | 3.0 | 0.0 |
| 2007 | 34446 | 1 | 3.0 | 0.0 |
| 2010 | 47990 | 1 | 3.0 | 0.0 |
| 2003 | 42120 | 1 | 5.0 | 0.0 |
| 2007 | 44940 | 1 | 4.0 | 0.0 |
| 2008 | 58240 | 1 | 4.0 | 0.0 |
| 2002 | 102 | 1 | 1.0 | 0.0 |
| 2009 | 52708 | 1 | 3.0 | 0.0 |
| ... | ... | ... | ... | ... |
+------+---------+-------+---------------+----------------+
[10000 rows x 5 columns]
The groupby function can take a variable-length list of aggregation
specifiers, so we can compute the count together with the 0.25 and 0.75
quantiles of ratings:
>>> user_rating_stats = sf.groupby(['user_id', 'time'], agg.COUNT(),
... {'rating_quantiles': agg.QUANTILE('rating',[0.25, 0.75])})
>>> user_rating_stats
+------+---------+-------+------------------------+
| time | user_id | Count | rating_quantiles |
+------+---------+-------+------------------------+
| 2006 | 61285 | 1 | array('d', [4.0, 4.0]) |
| 2000 | 36078 | 1 | array('d', [4.0, 4.0]) |
| 2003 | 47158 | 1 | array('d', [3.0, 3.0]) |
| 2007 | 34446 | 1 | array('d', [3.0, 3.0]) |
| 2010 | 47990 | 1 | array('d', [3.0, 3.0]) |
| 2003 | 42120 | 1 | array('d', [5.0, 5.0]) |
| 2007 | 44940 | 1 | array('d', [4.0, 4.0]) |
| 2008 | 58240 | 1 | array('d', [4.0, 4.0]) |
| 2002 | 102 | 1 | array('d', [1.0, 1.0]) |
| 2009 | 52708 | 1 | array('d', [3.0, 3.0]) |
| ... | ... | ... | ... |
+------+---------+-------+------------------------+
[10000 rows x 4 columns]
To put all the items a user rated into one list value per star rating:
>>> user_rating_stats = sf.groupby(["user_id", "rating"],
... {"rated_movie_ids":agg.CONCAT("movie_id")})
>>> user_rating_stats
+--------+---------+----------------------+
| rating | user_id | rated_movie_ids |
+--------+---------+----------------------+
| 3 | 31434 | array('d', [1663.0]) |
| 5 | 25944 | array('d', [1663.0]) |
| 4 | 38827 | array('d', [1663.0]) |
| 4 | 51437 | array('d', [1663.0]) |
| 4 | 42549 | array('d', [1663.0]) |
| 4 | 49532 | array('d', [1663.0]) |
| 3 | 26124 | array('d', [1663.0]) |
| 4 | 46336 | array('d', [1663.0]) |
| 4 | 52133 | array('d', [1663.0]) |
| 5 | 62361 | array('d', [1663.0]) |
| ... | ... | ... |
+--------+---------+----------------------+
[9952 rows x 3 columns]
To put all the items and ratings of a given user together into a
dictionary value:
>>> user_rating_stats = sf.groupby("user_id",
... {"movie_rating":agg.CONCAT("movie_id", "rating")})
>>> user_rating_stats
+---------+--------------+
| user_id | movie_rating |
+---------+--------------+
| 62361 | {1663: 5} |
| 30727 | {1663: 4} |
| 40111 | {1663: 2} |
| 50513 | {1663: 4} |
| 35140 | {1663: 4} |
| 42352 | {1663: 5} |
| 29667 | {1663: 4} |
| 46242 | {1663: 5} |
| 58310 | {1663: 2} |
| 64614 | {1663: 2} |
| ... | ... |
+---------+--------------+
[9852 rows x 2 columns]
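Aggregators mentioned in the Notes but not shown above follow the same
pattern. As a minimal sketch (output omitted; the rows depend on the
data), collecting the distinct ratings given by each user:
>>> distinct_ratings = sf.groupby('user_id',
...                               {'distinct_ratings': agg.DISTINCT('rating')})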
"""
# some basic checking first
# make sure key_column_names is a list
if isinstance(key_column_names, str):
key_column_names = [key_column_names]
# check that every column is a string, and is a valid column name
my_column_names = self.column_names()
key_columns_array = []
for column in key_column_names:
if not isinstance(column, str):
raise TypeError("Column name must be a string")
if column not in my_column_names:
raise KeyError("Column \"" + column + "\" does not exist in SFrame")
if self[column].dtype == dict:
raise TypeError("Cannot group on a dictionary column.")
key_columns_array.append(column)
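# parallel lists describing each requested aggregation: the output
# column name, the input column(s), and the builtin operator name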
group_output_columns = []
group_columns = []
group_ops = []
all_ops = [operations] + list(args)
for op_entry in all_ops:
# if it is not a dict, nor a list, it is just a single aggregator
# element (probably COUNT). wrap it in a list so we can reuse the
# list processing code
operation = op_entry
if not (isinstance(operation, list) or isinstance(operation, dict)):
operation = [operation]
if isinstance(operation, dict):
# now sweep the dict and add to group_columns and group_ops
for key in operation:
val = operation[key]
if type(val) is tuple:
(op, column) = val
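# AVG and SUM on array-typed columns dispatch to the element-wise
# vector variants of the operators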
if op == "__builtin__avg__" and self[column[0]].dtype in [
array.array,
numpy.ndarray,
]:
op = "__builtin__vector__avg__"
if op == "__builtin__sum__" and self[column[0]].dtype in [
array.array,
numpy.ndarray,
]:
op = "__builtin__vector__sum__"
if (
op == "__builtin__argmax__" or op == "__builtin__argmin__"
) and ((type(column[0]) is tuple) != (type(key) is tuple)):
raise TypeError(
"Output column(s) and aggregate column(s) for aggregate operation should be either all tuple or all string."
)
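# a tuple of aggregate columns fans out into one ARGMAX/ARGMIN
# aggregation per (aggregate column, output column) pair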
if (
op == "__builtin__argmax__" or op == "__builtin__argmin__"
) and type(column[0]) is tuple:
for (col, output) in zip(column[0], key):
group_columns = group_columns + [[col, column[1]]]
group_ops = group_ops + [op]
group_output_columns = group_output_columns + [output]
else:
group_columns = group_columns + [column]
group_ops = group_ops + [op]
group_output_columns = group_output_columns + [key]
if op == "__builtin__concat__dict__":
key_column = column[0]
key_column_type = self.select_column(key_column).dtype
if not key_column_type in (int, float, str):
raise TypeError(
"CONCAT key column must be int, float or str type"
)
elif val == aggregate.COUNT:
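# the caller passed agg.COUNT itself rather than an instance;
# instantiate it so it unpacks like the other (op, column) tuples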
group_output_columns = group_output_columns + [key]
val = aggregate.COUNT()
(op, column) = val
group_columns = group_columns + [column]
group_ops = group_ops + [op]
else:
raise TypeError(
"Unexpected type in aggregator definition of output column: "
+ key
)
elif isinstance(operation, list):
# we will be using automatically defined column names
for val in operation:
if type(val) is tuple:
(op, column) = val
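# same vector and ARGMAX/ARGMIN special-casing as the dict branch
# above, but output column names are left empty ("") so they are
# assigned automatically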
if op == "__builtin__avg__" and self[column[0]].dtype in [
array.array,
numpy.ndarray,
]:
op = "__builtin__vector__avg__"
if op == "__builtin__sum__" and self[column[0]].dtype in [
array.array,
numpy.ndarray,
]:
op = "__builtin__vector__sum__"
if (
op == "__builtin__argmax__" or op == "__builtin__argmin__"
) and type(column[0]) is tuple:
for col in column[0]:
group_columns = group_columns + [[col, column[1]]]
group_ops = group_ops + [op]
group_output_columns = group_output_columns + [""]
else:
group_columns = group_columns + [column]
group_ops = group_ops + [op]
group_output_columns = group_output_columns + [""]
if op == "__builtin__concat__dict__":
key_column = column[0]
key_column_type = self.select_column(key_column).dtype
if not key_column_type in (int, float, str):
raise TypeError(
"CONCAT key column must be int, float or str type"
)
elif val == aggregate.COUNT:
group_output_columns = group_output_columns + [""]
val = aggregate.COUNT()
(op, column) = val
group_columns = group_columns + [column]
group_ops = group_ops + [op]
else:
raise TypeError("Unexpected type in aggregator definition.")
# let's validate group_columns and group_ops are valid
for (cols, op) in zip(group_columns, group_ops):
for col in cols:
if not isinstance(col, str):
raise TypeError("Column name must be a string")
if not isinstance(op, str):
raise TypeError("Operation type not recognized.")
# COUNT takes no input column, so skip the column-existence check
if op != aggregate.COUNT()[0]:
for col in cols:
if col not in my_column_names:
raise KeyError("Column " + col + " does not exist in SFrame")
with cython_context():
return SFrame(
_proxy=self.__proxy__.groupby_aggregate(
key_columns_array, group_columns, group_output_columns, group_ops
)
)