in torcharrow/velox_rt/dataframe_cpu.py [0:0]
def select(self, *args, **kwargs):
    """
    Analogous to SQL's ``SELECT``.

    Transform a dataframe by selecting old columns and new (computed)
    columns.

    Parameters
    ----------
    args - positional string arguments
        Column names to keep in the projection. A column name of "*" is a
        shortcut to denote all columns. A column name beginning with "-"
        means remove this column (only valid together with "*").

    kwargs - named value arguments
        New column name expressions to add to the projection. Each value
        is evaluated with ``eval_expression``; the special symbol ``me``
        can be used to refer to self.

    Returns
    -------
    A new dataframe built from the kept columns, in the order in which
    they appear in this dataframe, followed by the computed columns. A
    computed column whose name equals a kept column replaces that column.

    Raises
    ------
    TypeError
        If a positional argument is not a string.
    ValueError
        If "*" or a column-include or a column-exclude is repeated, if an
        argument does not denote an existing column, if a column-exclude
        is given without "*", or if "*" is combined with column-includes.

    Examples
    --------
    >>> from torcharrow import ta
    >>> xf = ta.DataFrame({
    ...    'A': ['a', 'b', 'a', 'b'],
    ...    'B': [1, 2, 3, 4],
    ...    'C': [10,11,12,13]})
    >>> xf.select(*xf.columns,D=me['B']+me['C'])
      index  A      B    C    D
    -------  ---  ---  ---  ---
          0  a      1   10   11
          1  b      2   11   13
          2  a      3   12   15
          3  b      4   13   17
    dtype: Struct([Field('A', string), Field('B', int64), Field('C', int64), Field('D', int64)]), count: 4, null_count: 0

    Using '*' and '-colname':

    >>> xf.select('*','-B',D=me['B']+me['C'])
      index  A      C    D
    -------  ---  ---  ---
          0  a     10   11
          1  b     11   13
          2  a     12   15
          3  b     13   17
    dtype: Struct([Field('A', string), Field('C', int64), Field('D', int64)]), count: 4, null_count: 0
    """
    input_columns = set(self.columns)

    has_star = False
    include = []
    exclude = []
    for arg in args:
        if not isinstance(arg, str):
            raise TypeError("args must be column names")
        if arg == "*":
            if has_star:
                raise ValueError("select received repeated stars")
            has_star = True
        elif arg in input_columns:
            # Checked before the "-" form so that an existing column whose
            # name itself starts with "-" is still treated as an include.
            if arg in include:
                raise ValueError(
                    f"select received a repeated column-include ({arg})"
                )
            include.append(arg)
        elif arg.startswith("-") and arg[1:] in input_columns:
            # startswith() is safe for the empty string, unlike arg[0].
            name = arg[1:]
            # `exclude` stores names without the leading "-", so the
            # duplicate check must compare the stripped name.
            if name in exclude:
                raise ValueError(
                    f"select received a repeated column-exclude ({name})"
                )
            exclude.append(name)
        else:
            raise ValueError(f"argument ({arg}) does not denote an existing column")

    if exclude and not has_star:
        raise ValueError("select received column-exclude without a star")
    if has_star and include:
        raise ValueError("select received both a star and column-includes")
    # Defensive: the two checks above already rule out having includes and
    # excludes at the same time, so this cannot currently trigger.
    if set(include) & set(exclude):
        raise ValueError(
            "select received overlapping column-includes and column-excludes"
        )

    include_inc_star = self.columns if has_star else include
    excluded = set(exclude)
    output_columns = {col for col in include_inc_star if col not in excluded}

    # Walk the children in field order so the kept columns preserve the
    # column order of this dataframe.
    res = {}
    for i in range(self._data.children_size()):
        field = self.dtype.fields[i]
        if field.name in output_columns:
            res[field.name] = ColumnFromVelox._from_velox(
                self.device,
                field.dtype,
                self._data.child_at(i),
                True,
            )

    # Computed columns are added last; `me` is bound to this dataframe.
    for n, c in kwargs.items():
        res[n] = eval_expression(c, {"me": self})
    return self._fromdata(res, self._mask)