def select()

in torcharrow/velox_rt/dataframe_cpu.py [0:0]


    def select(self, *args, **kwargs):
        """
        Analogous to SQL's ``SELECT``.

        Transform a dataframe by selecting old columns and new (computed)
        columns.

        args - positional string arguments
            Column names to keep in the projection. A column name of "*" is a
            shortcut to denote all columns. A column name beginning with "-"
            means remove this column; it is only allowed together with "*".
            Kept columns are gathered in the dataframe's own field order, not
            in the order they are named here.

        kwargs - named value arguments
            New column name expressions to add to the projection. They are
            evaluated after the kept columns have been gathered, so a keyword
            that reuses the name of a kept column replaces that column.

        The special symbol me can be used to refer to self.

        Raises
        ------
        TypeError
            If a positional argument is not a string.
        ValueError
            If "*" is given more than once, a column is included or excluded
            more than once, an argument names no existing column (this
            includes the empty string), a "-column" is given without "*", or
            "*" is combined with explicit column includes.

        Examples
        --------
        >>> from torcharrow import ta
        >>> xf = ta.DataFrame({
        >>>    'A': ['a', 'b', 'a', 'b'],
        >>>    'B': [1, 2, 3, 4],
        >>>    'C': [10,11,12,13]})
        >>> xf.select(*xf.columns,D=me['B']+me['C'])
          index  A      B    C    D
        -------  ---  ---  ---  ---
              0  a      1   10   11
              1  b      2   11   13
              2  a      3   12   15
              3  b      4   13   17
        dtype: Struct([Field('A', string), Field('B', int64), Field('C', int64), Field('D', int64)]), count: 4, null_count: 0

        Using '*' and '-colname':

        >>> xf.select('*','-B',D=me['B']+me['C'])
          index  A      C    D
        -------  ---  ---  ---
              0  a     10   11
              1  b     11   13
              2  a     12   15
              3  b     13   17
        dtype: Struct([Field('A', string), Field('C', int64), Field('D', int64)]), count: 4, null_count: 0
        """

        input_columns = set(self.columns)

        has_star = False
        include = []
        exclude = []
        for arg in args:
            if not isinstance(arg, str):
                raise TypeError("args must be column names")
            if arg == "*":
                if has_star:
                    raise ValueError("select received repeated stars")
                has_star = True
            elif arg in input_columns:
                # Checked before the "-" prefix so that an existing column
                # whose name itself starts with "-" is treated as an include.
                if arg in include:
                    raise ValueError(
                        f"select received a repeated column-include ({arg})"
                    )
                include.append(arg)
            elif arg.startswith("-") and arg[1:] in input_columns:
                # startswith (rather than arg[0]) keeps the empty string from
                # raising IndexError; it falls through to the ValueError below.
                name = arg[1:]
                # `exclude` stores names without the leading "-", so the
                # duplicate test must use the stripped name as well.
                if name in exclude:
                    raise ValueError(
                        f"select received a repeated column-exclude ({name})"
                    )
                exclude.append(name)
            else:
                raise ValueError(f"argument ({arg}) does not denote an existing column")

        # Excludes are only meaningful relative to "*", and "*" cannot be mixed
        # with explicit includes. Together these two checks also rule out a
        # name being both included and excluded.
        if exclude and not has_star:
            raise ValueError("select received column-exclude without a star")
        if has_star and include:
            raise ValueError("select received both a star and column-includes")

        excluded = set(exclude)
        selected = self.columns if has_star else include
        # Build the membership set once instead of scanning a list per field.
        keep = {col for col in selected if col not in excluded}

        res = {}
        # Walk the fields by position so kept columns are inserted into `res`
        # in the dataframe's own order.
        for i in range(self._data.children_size()):
            field = self.dtype.fields[i]
            if field.name in keep:
                res[field.name] = ColumnFromVelox._from_velox(
                    self.device,
                    field.dtype,
                    self._data.child_at(i),
                    True,
                )
        # Computed columns: each expression is evaluated with `me` bound to
        # this dataframe.
        for n, c in kwargs.items():
            res[n] = eval_expression(c, {"me": self})
        return self._fromdata(res, self._mask)