in src/python/turicreate/data_structures/sframe.py [0:0]
def groupby(self, key_column_names, operations, *args):
"""
Perform a group-by on the key_column_names, followed by aggregations on
the columns listed in operations.
The operations parameter is a dictionary that indicates which
aggregation operators to use and which columns to use them on. The
available operators are SUM, MAX, MIN, COUNT, AVG, VAR, STDV, CONCAT,
SELECT_ONE, ARGMIN, ARGMAX, and QUANTILE. For convenience, aggregators
MEAN, STD, and VARIANCE are available as synonyms for AVG, STDV, and
VAR. See :mod:`~turicreate.aggregate` for more detail on the aggregators.
Parameters
----------
key_column_names : string | list[string]
Column(s) to group by. Key columns can be of any type other than
dictionary.
operations : dict, list
Dictionary of columns and aggregation operations. Each key is an
output column name and each value is an aggregator. This can also
be a list of aggregators, in which case column names will be
automatically assigned.
*args
Any additional positional arguments are interpreted in the same
way as the operations argument.
Returns
-------
out_sf : SFrame
A new SFrame, with a column for each groupby column and each
aggregation operation.
See Also
--------
aggregate
Notes
-----
* Numeric aggregators (such as sum, mean, and stdev) follow a skip-`None`
policy, i.e. they omit all missing values from the aggregation. As an
example, `sum([None, 5, 10]) = 15` because the `None` value is
skipped.
* Aggregators have a default value when no values (after skipping all
`None` values) are present. Default values are `None` for ['ARGMAX',
'ARGMIN', 'AVG', 'STD', 'MEAN', 'MIN', 'MAX'], `0` for ['COUNT',
'COUNT_DISTINCT'], `[]` for ['CONCAT', 'QUANTILE', 'DISTINCT'], and
`{}` for 'FREQ_COUNT'.
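As a minimal sketch of the skip-`None` policy (the SFrame `sf2` and its
columns are hypothetical, separate from the dataset used below):
>>> import turicreate.aggregate as agg
>>> sf2 = turicreate.SFrame({'k': [1, 1, 2], 'v': [None, 5, 10]})
>>> totals = sf2.groupby('k', {'total': agg.SUM('v')})
Following the skip-`None` policy, the `total` for group `1` is `5` (the
`None` value is omitted) and the `total` for group `2` is `10`.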
Examples
--------
Suppose we have an SFrame with movie ratings by many users.
>>> import turicreate.aggregate as agg
>>> url = 'https://static.turi.com/datasets/rating_data_example.csv'
>>> sf = turicreate.SFrame.read_csv(url)
>>> sf
+---------+----------+--------+
| user_id | movie_id | rating |
+---------+----------+--------+
| 25904 | 1663 | 3 |
| 25907 | 1663 | 3 |
| 25923 | 1663 | 3 |
| 25924 | 1663 | 3 |
| 25928 | 1663 | 2 |
| 25933 | 1663 | 4 |
| 25934 | 1663 | 4 |
| 25935 | 1663 | 4 |
| 25936 | 1663 | 5 |
| 25937 | 1663 | 2 |
| ... | ... | ... |
+---------+----------+--------+
[10000 rows x 3 columns]
Compute the number of occurrences of each user.
>>> user_count = sf.groupby(key_column_names='user_id',
... operations={'count': agg.COUNT()})
>>> user_count
+---------+-------+
| user_id | count |
+---------+-------+
| 62361 | 1 |
| 30727 | 1 |
| 40111 | 1 |
| 50513 | 1 |
| 35140 | 1 |
| 42352 | 1 |
| 29667 | 1 |
| 46242 | 1 |
| 58310 | 1 |
| 64614 | 1 |
| ... | ... |
+---------+-------+
[9852 rows x 2 columns]
Compute the mean and standard deviation of ratings per user.
>>> user_rating_stats = sf.groupby(key_column_names='user_id',
... operations={
... 'mean_rating': agg.MEAN('rating'),
... 'std_rating': agg.STD('rating')
... })
>>> user_rating_stats
+---------+-------------+------------+
| user_id | mean_rating | std_rating |
+---------+-------------+------------+
| 62361 | 5.0 | 0.0 |
| 30727 | 4.0 | 0.0 |
| 40111 | 2.0 | 0.0 |
| 50513 | 4.0 | 0.0 |
| 35140 | 4.0 | 0.0 |
| 42352 | 5.0 | 0.0 |
| 29667 | 4.0 | 0.0 |
| 46242 | 5.0 | 0.0 |
| 58310 | 2.0 | 0.0 |
| 64614 | 2.0 | 0.0 |
| ... | ... | ... |
+---------+-------------+------------+
[9852 rows x 3 columns]
Compute the movie with the minimum rating per user.
>>> chosen_movies = sf.groupby(key_column_names='user_id',
... operations={
... 'worst_movies': agg.ARGMIN('rating','movie_id')
... })
>>> chosen_movies
+---------+--------------+
| user_id | worst_movies |
+---------+--------------+
|  62361  |     1663     |
|  30727  |     1663     |
|  40111  |     1663     |
|  50513  |     1663     |
|  35140  |     1663     |
|  42352  |     1663     |
|  29667  |     1663     |
|  46242  |     1663     |
|  58310  |     1663     |
|  64614  |     1663     |
|   ...   |     ...      |
+---------+--------------+
[9852 rows x 2 columns]
Compute the movie with the max rating per user and also the movie with
the maximum imdb-ranking per user.
>>> sf['imdb-ranking'] = sf['rating'] * 10
>>> chosen_movies = sf.groupby(key_column_names='user_id',
...                            operations={
...                                ('max_rating_movie', 'max_imdb_ranking_movie'):
...                                    agg.ARGMAX(('rating', 'imdb-ranking'), 'movie_id')
...                            })
>>> chosen_movies
+---------+------------------+------------------------+
| user_id | max_rating_movie | max_imdb_ranking_movie |
+---------+------------------+------------------------+
| 62361 | 1663 | 16630 |
| 30727 | 1663 | 16630 |
| 40111 | 1663 | 16630 |
| 50513 | 1663 | 16630 |
| 35140 | 1663 | 16630 |
| 42352 | 1663 | 16630 |
| 29667 | 1663 | 16630 |
| 46242 | 1663 | 16630 |
| 58310 | 1663 | 16630 |
| 64614 | 1663 | 16630 |
| ... | ... | ... |
+---------+------------------+------------------------+
[9852 rows x 3 columns]
Compute the movie with the max rating per user.
>>> chosen_movies = sf.groupby(key_column_names='user_id',
...                            operations={'best_movies': agg.ARGMAX('rating', 'movie_id')})
Compute the count, mean, and standard deviation of ratings per (user,
time), automatically assigning output column names.
>>> sf['time'] = sf.apply(lambda x: (x['user_id'] + x['movie_id']) % 11 + 2000)
>>> user_rating_stats = sf.groupby(['user_id', 'time'],
... [agg.COUNT(),
... agg.AVG('rating'),
... agg.STDV('rating')])
>>> user_rating_stats
+------+---------+-------+---------------+----------------+
| time | user_id | Count | Avg of rating | Stdv of rating |
+------+---------+-------+---------------+----------------+
| 2006 | 61285 | 1 | 4.0 | 0.0 |
| 2000 | 36078 | 1 | 4.0 | 0.0 |
| 2003 | 47158 | 1 | 3.0 | 0.0 |
| 2007 | 34446 | 1 | 3.0 | 0.0 |
| 2010 | 47990 | 1 | 3.0 | 0.0 |
| 2003 | 42120 | 1 | 5.0 | 0.0 |
| 2007 | 44940 | 1 | 4.0 | 0.0 |
| 2008 | 58240 | 1 | 4.0 | 0.0 |
| 2002 | 102 | 1 | 1.0 | 0.0 |
| 2009 | 52708 | 1 | 3.0 | 0.0 |
| ... | ... | ... | ... | ... |
+------+---------+-------+---------------+----------------+
[10000 rows x 5 columns]
The groupby function can take a variable-length list of aggregation
specifiers, so we can compute the count together with the 0.25 and 0.75
quantiles of ratings:
>>> user_rating_stats = sf.groupby(['user_id', 'time'], agg.COUNT(),
... {'rating_quantiles': agg.QUANTILE('rating',[0.25, 0.75])})
>>> user_rating_stats
+------+---------+-------+------------------------+
| time | user_id | Count | rating_quantiles |
+------+---------+-------+------------------------+
| 2006 | 61285 | 1 | array('d', [4.0, 4.0]) |
| 2000 | 36078 | 1 | array('d', [4.0, 4.0]) |
| 2003 | 47158 | 1 | array('d', [3.0, 3.0]) |
| 2007 | 34446 | 1 | array('d', [3.0, 3.0]) |
| 2010 | 47990 | 1 | array('d', [3.0, 3.0]) |
| 2003 | 42120 | 1 | array('d', [5.0, 5.0]) |
| 2007 | 44940 | 1 | array('d', [4.0, 4.0]) |
| 2008 | 58240 | 1 | array('d', [4.0, 4.0]) |
| 2002 | 102 | 1 | array('d', [1.0, 1.0]) |
| 2009 | 52708 | 1 | array('d', [3.0, 3.0]) |
| ... | ... | ... | ... |
+------+---------+-------+------------------------+
[10000 rows x 4 columns]
To put all the items a user rated into one list value per star rating:
>>> user_rating_stats = sf.groupby(["user_id", "rating"],
... {"rated_movie_ids":agg.CONCAT("movie_id")})
>>> user_rating_stats
+--------+---------+----------------------+
| rating | user_id | rated_movie_ids |
+--------+---------+----------------------+
| 3 | 31434 | array('d', [1663.0]) |
| 5 | 25944 | array('d', [1663.0]) |
| 4 | 38827 | array('d', [1663.0]) |
| 4 | 51437 | array('d', [1663.0]) |
| 4 | 42549 | array('d', [1663.0]) |
| 4 | 49532 | array('d', [1663.0]) |
| 3 | 26124 | array('d', [1663.0]) |
| 4 | 46336 | array('d', [1663.0]) |
| 4 | 52133 | array('d', [1663.0]) |
| 5 | 62361 | array('d', [1663.0]) |
| ... | ... | ... |
+--------+---------+----------------------+
[9952 rows x 3 columns]
To put all the items and ratings of a given user together into a
dictionary value:
>>> user_rating_stats = sf.groupby("user_id",
... {"movie_rating":agg.CONCAT("movie_id", "rating")})
>>> user_rating_stats
+---------+--------------+
| user_id | movie_rating |
+---------+--------------+
| 62361 | {1663: 5} |
| 30727 | {1663: 4} |
| 40111 | {1663: 2} |
| 50513 | {1663: 4} |
| 35140 | {1663: 4} |
| 42352 | {1663: 5} |
| 29667 | {1663: 4} |
| 46242 | {1663: 5} |
| 58310 | {1663: 2} |
| 64614 | {1663: 2} |
| ... | ... |
+---------+--------------+
[9852 rows x 2 columns]
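Aggregators mentioned in the Notes but not shown above follow the same
pattern. As a minimal sketch (output omitted; the rows depend on the
data), collecting the distinct ratings given by each user:
>>> distinct_ratings = sf.groupby('user_id',
...                               {'distinct_ratings': agg.DISTINCT('rating')})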
"""
# some basic checking first
# make sure key_column_names is a list
if isinstance(key_column_names, str):
key_column_names = [key_column_names]
# check that every column is a string, and is a valid column name
my_column_names = self.column_names()
key_columns_array = []
for column in key_column_names:
if not isinstance(column, str):
raise TypeError("Column name must be a string")
if column not in my_column_names:
raise KeyError("Column \"" + column + "\" does not exist in SFrame")
if self[column].dtype == dict:
raise TypeError("Cannot group on a dictionary column.")
key_columns_array.append(column)
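# parallel lists describing each requested aggregation: the output
# column name, the input column(s), and the builtin operator name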
group_output_columns = []
group_columns = []
group_ops = []
all_ops = [operations] + list(args)
for op_entry in all_ops:
# if it is not a dict, nor a list, it is just a single aggregator
# element (probably COUNT). wrap it in a list so we can reuse the
# list processing code
operation = op_entry
if not (isinstance(operation, list) or isinstance(operation, dict)):
operation = [operation]
if isinstance(operation, dict):
# now sweep the dict and add to group_columns and group_ops
for key in operation:
val = operation[key]
if type(val) is tuple:
(op, column) = val
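# AVG and SUM on array-typed columns dispatch to the element-wise
# vector variants of the operators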
if op == "__builtin__avg__" and self[column[0]].dtype in [
array.array,
numpy.ndarray,
]:
op = "__builtin__vector__avg__"
if op == "__builtin__sum__" and self[column[0]].dtype in [
array.array,
numpy.ndarray,
]:
op = "__builtin__vector__sum__"
if (
op == "__builtin__argmax__" or op == "__builtin__argmin__"
) and ((type(column[0]) is tuple) != (type(key) is tuple)):
raise TypeError(
"Output column(s) and aggregate column(s) for aggregate operation should be either all tuple or all string."
)
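# a tuple of aggregate columns fans out into one ARGMAX/ARGMIN
# aggregation per (aggregate column, output column) pair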
if (
op == "__builtin__argmax__" or op == "__builtin__argmin__"
) and type(column[0]) is tuple:
for (col, output) in zip(column[0], key):
group_columns = group_columns + [[col, column[1]]]
group_ops = group_ops + [op]
group_output_columns = group_output_columns + [output]
else:
group_columns = group_columns + [column]
group_ops = group_ops + [op]
group_output_columns = group_output_columns + [key]
if op == "__builtin__concat__dict__":
key_column = column[0]
key_column_type = self.select_column(key_column).dtype
if not key_column_type in (int, float, str):
raise TypeError(
"CONCAT key column must be int, float or str type"
)
elif val == aggregate.COUNT:
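# the caller passed agg.COUNT itself rather than an instance;
# instantiate it so it unpacks like the other (op, column) tuples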
group_output_columns = group_output_columns + [key]
val = aggregate.COUNT()
(op, column) = val
group_columns = group_columns + [column]
group_ops = group_ops + [op]
else:
raise TypeError(
"Unexpected type in aggregator definition of output column: "
+ key
)
elif isinstance(operation, list):
# we will be using automatically defined column names
for val in operation:
if type(val) is tuple:
(op, column) = val
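# same vector and ARGMAX/ARGMIN special-casing as the dict branch
# above, but output column names are left empty ("") so they are
# assigned automatically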
if op == "__builtin__avg__" and self[column[0]].dtype in [
array.array,
numpy.ndarray,
]:
op = "__builtin__vector__avg__"
if op == "__builtin__sum__" and self[column[0]].dtype in [
array.array,
numpy.ndarray,
]:
op = "__builtin__vector__sum__"
if (
op == "__builtin__argmax__" or op == "__builtin__argmin__"
) and type(column[0]) is tuple:
for col in column[0]:
group_columns = group_columns + [[col, column[1]]]
group_ops = group_ops + [op]
group_output_columns = group_output_columns + [""]
else:
group_columns = group_columns + [column]
group_ops = group_ops + [op]
group_output_columns = group_output_columns + [""]
if op == "__builtin__concat__dict__":
key_column = column[0]
key_column_type = self.select_column(key_column).dtype
if not key_column_type in (int, float, str):
raise TypeError(
"CONCAT key column must be int, float or str type"
)
elif val == aggregate.COUNT:
group_output_columns = group_output_columns + [""]
val = aggregate.COUNT()
(op, column) = val
group_columns = group_columns + [column]
group_ops = group_ops + [op]
else:
raise TypeError("Unexpected type in aggregator definition.")
# let's validate group_columns and group_ops are valid
for (cols, op) in zip(group_columns, group_ops):
for col in cols:
if not isinstance(col, str):
raise TypeError("Column name must be a string")
if not isinstance(op, str):
raise TypeError("Operation type not recognized.")
# COUNT takes no input column, so skip the column-existence check
if op != aggregate.COUNT()[0]:
for col in cols:
if col not in my_column_names:
raise KeyError("Column " + col + " does not exist in SFrame")
with cython_context():
return SFrame(
_proxy=self.__proxy__.groupby_aggregate(
key_columns_array, group_columns, group_output_columns, group_ops
)
)