odps/df/expr/strings.py (318 lines of code) (raw):

#!/usr/bin/env python # -*- coding: utf-8 -*- # Copyright 1999-2022 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from .element import ElementWise from .expressions import Expr, StringSequenceExpr, Scalar, StringScalar, SequenceExpr from . import utils from .. import types from ...compat import six class StringOp(ElementWise): __slots__ = () def _init(self, *args, **kwargs): for arg in self._args[1:]: self._init_attr(arg, None) super(StringOp, self)._init(*args, **kwargs) for attr in self._args[1:]: val = getattr(self, attr) if val is not None and not isinstance(val, (Expr, list, tuple)): setattr(self, attr, Scalar(_value=val)) def __getattribute__(self, attr): if attr in ('input', '_input'): return super(StringOp, self).__getattribute__(attr) else: try: return object.__getattribute__(self, attr) except AttributeError as e: err = e if not attr.startswith('_'): private_attr = '_%s' % attr try: scalar = object.__getattribute__(self, private_attr) if isinstance(scalar, Scalar): return scalar.value return scalar except AttributeError: raise err def accept(self, visitor): return visitor.visit_string_op(self) class Capitalize(StringOp): __slots__ = () class CatStr(StringOp): _args = '_input', '_others', '_sep', '_na_rep' _add_args_slots = False @property def node_name(self): return 'Cat' class Contains(StringOp): _args = '_input', '_pat', '_case', '_flags', '_regex' _add_args_slots = False class Count(StringOp): _args = '_input', '_pat', '_flags' _add_args_slots = False class Endswith(StringOp): _args = '_input', '_pat' _add_args_slots = False class Startswith(StringOp): _args = '_input', '_pat' _add_args_slots = False class Extract(StringOp): _args = '_input', '_pat', '_flags', '_group' _add_args_slots = False class Find(StringOp): _args = '_input', '_sub', '_start', '_end' _add_args_slots = False class RFind(StringOp): _args = '_input', '_sub', '_start', '_end' _add_args_slots = False class Replace(StringOp): _args = '_input', '_pat', '_repl', '_n', '_case', '_flags', '_regex' _add_args_slots = False class Get(StringOp): _args = '_input', '_index' _add_args_slots = False class Join(StringOp): _args = '_input', '_sep' _add_args_slots = False class Len(StringOp): _args = '_input', _add_args_slots = False class Ljust(StringOp): _args = '_input', '_width', '_fillchar' _add_args_slots = False class Rjust(StringOp): _args = '_input', '_width', '_fillchar' _add_args_slots = False class Lower(StringOp): _args = '_input', _add_args_slots = False class Upper(StringOp): _args = '_input', _add_args_slots = False class Lstrip(StringOp): _args = '_input', '_to_strip' _add_args_slots = False class Rstrip(StringOp): _args = '_input', '_to_strip' _add_args_slots = False class Strip(StringOp): _args = '_input', '_to_strip' _add_args_slots = False class Pad(StringOp): _args = '_input', '_width', '_side', '_fillchar' _add_args_slots = False def _init(self, *args, **kwargs): super(Pad, self)._init(*args, **kwargs) if self.side not in ('left', 'right', 'both'): raise ValueError('Side should be left, right or both') class Repeat(StringOp): _args = '_input', '_repeats' _add_args_slots = False class Split(StringOp): _args = '_input', '_pat', '_n' _add_args_slots = False class RSplit(StringOp): _args = '_input', '_pat', '_n' _add_args_slots = False class Slice(StringOp): _args = '_input', '_start', '_end', '_step' _add_args_slots = False class Swapcase(StringOp): _args = '_input', _add_args_slots = False class Title(StringOp): _args = '_input', _add_args_slots = False class Zfill(StringOp): _args = '_input', '_width' _add_args_slots = False class Strptime(StringOp): _args = '_input', '_date_format' _add_args_slots = False class Isalnum(StringOp): _args = '_input', _add_args_slots = False class Isalpha(StringOp): _args = '_input', _add_args_slots = False class Isdigit(StringOp): _args = '_input', _add_args_slots = False class Isspace(StringOp): _args = '_input', _add_args_slots = False class Islower(StringOp): _args = '_input', _add_args_slots = False class Isupper(StringOp): _args = '_input', _add_args_slots = False class Istitle(StringOp): _args = '_input', _add_args_slots = False class Isnumeric(StringOp): _args = '_input', _add_args_slots = False class Isdecimal(StringOp): _args = '_input', _add_args_slots = False class StringToDict(StringOp): _args = '_input', '_item_delim', '_kv_delim' _add_args_slots = False def _string_op(expr, output_expr_cls, output_type=None, **kwargs): output_type = output_type or types.string if isinstance(expr, (StringSequenceExpr, StringScalar)): is_sequence = isinstance(expr, StringSequenceExpr) if is_sequence: return output_expr_cls(_data_type=output_type, _input=expr, **kwargs) else: return output_expr_cls(_value_type=output_type, _input=expr, **kwargs) def _capitalize(expr): """ Convert strings in the Sequence or string scalar to be capitalized. Equivalent to str.capitalize(). :param expr: :return: sequence or scalar """ return _string_op(expr, Capitalize) def _cat(expr, others, sep=None, na_rep=None): if isinstance(others, six.string_types): raise ValueError('Did you mean to supply a `sep` keyword?') return _string_op(expr, CatStr, _others=others, _sep=sep, _na_rep=na_rep) def _contains(expr, pat, case=True, flags=0, regex=True): """ Return boolean sequence whether given pattern/regex is contained in each string in the sequence :param expr: sequence or scalar :param pat: Character sequence or regular expression :param case: If True, case sensitive :type case: bool :param flags: re module flags, e.g. re.IGNORECASE :param regex: If True use regex, otherwise use string finder :return: sequence or scalar """ if regex and isinstance(pat, six.string_types): import re try: re.compile(pat, flags=flags) except: raise ValueError('Failed to compile regular expression, ' 'please check re.compile("{0}")'.format(pat)) return _string_op(expr, Contains, output_type=types.boolean, _pat=pat, _case=case, _flags=flags, _regex=regex) def _count(expr, pat, flags=0): """ Count occurrences of pattern in each string of the sequence or scalar :param expr: sequence or scalar :param pat: valid regular expression :param flags: re module flags, e.g. re.IGNORECASE :return: """ return _string_op(expr, Count, output_type=types.int64, _pat=pat, _flags=flags) def _endswith(expr, pat): """ Return boolean sequence or scalar indicating whether each string in the sequence or scalar ends with passed pattern. Equivalent to str.endswith(). :param expr: :param pat: Character sequence :return: sequence or scalar """ return _string_op(expr, Endswith, output_type=types.boolean, _pat=pat) def _startswith(expr, pat): """ Return boolean sequence or scalar indicating whether each string in the sequence or scalar starts with passed pattern. Equivalent to str.startswith(). :param expr: :param pat: Character sequence :return: sequence or scalar """ return _string_op(expr, Startswith, output_type=types.boolean, _pat=pat) def _extract(expr, pat, flags=0, group=0): """ Find group in each string in the Series using passed regular expression. :param expr: :param pat: Pattern or regular expression :param flags: re module, e.g. re.IGNORECASE :param group: if None as group 0 :return: sequence or scalar """ return _string_op(expr, Extract, _pat=pat, _flags=flags, _group=group) def _find(expr, sub, start=0, end=None): """ Return lowest indexes in each strings in the sequence or scalar where the substring is fully contained between [start:end]. Return -1 on failure. Equivalent to standard str.find(). :param expr: :param sub: substring being searched :param start: left edge index :param end: right edge index :return: sequence or scalar """ return _string_op(expr, Find, output_type=types.int64, _sub=sub, _start=start, _end=end) def _rfind(expr, sub, start=0, end=None): """ Return highest indexes in each strings in the sequence or scalar where the substring is fully contained between [start:end]. Return -1 on failure. Equivalent to standard str.rfind(). :param expr: :param sub: :param start: :param end: :return: sequence or scalar """ return _string_op(expr, RFind, output_type=types.int64, _sub=sub, _start=start, _end=end) def _replace(expr, pat, repl, n=-1, case=True, flags=0, regex=True): """ Replace occurrence of pattern/regex in the sequence or scalar with some other string. Equivalent to str.replace() :param expr: :param pat: Character sequence or regular expression :param repl: Replacement :param n: Number of replacements to make from start :param case: if True, case sensitive :param flags: re module flag, e.g. re.IGNORECASE :return: sequence or scalar """ return _string_op(expr, Replace, _pat=pat, _repl=repl, _n=n, _case=case, _flags=flags, _regex=regex) def _get(expr, index): """ Extract element from lists, tuples, or strings in each element in the sequence or scalar :param expr: :param index: Integer index(location) :return: sequence or scalar """ return _string_op(expr, Get, _index=index) def _join(expr, sep): """ Join lists contained as elements in the Series/Index with passed delimiter. Equivalent to str.join(). :param expr: :param sep: Delimiter :return: sequence or scalar """ return _string_op(expr, Join, _sep=sep) def _len(expr): """ Compute length of each string in the sequence or scalar :param expr: :return: lengths """ return _string_op(expr, Len, output_type=types.int64) def _ljust(expr, width, fillchar=' '): """ Filling right side of strings in the sequence or scalar with an additional character. Equivalent to str.ljust(). :param expr: :param width: Minimum width of resulting string; additional characters will be filled with `fillchar` :param fillchar: Additional character for filling, default is whitespace. :return: sequence or scalar """ return _string_op(expr, Ljust, _width=width, _fillchar=fillchar) def _rjust(expr, width, fillchar=' '): """ Filling left side of strings in the sequence or scalar with an additional character. Equivalent to str.rjust(). :param expr: :param width: Minimum width of resulting string; additional characters will be filled with `fillchar` :param fillchar: Additional character for filling, default is whitespace. :return: sequence or scalar """ return _string_op(expr, Rjust, _width=width, _fillchar=fillchar) def _lower(expr): """ Convert strings in the sequence or scalar lowercase. Equivalent to str.lower(). :param expr: :return: sequence or scalar """ return _string_op(expr, Lower) def _upper(expr): """ Convert strings in the sequence or scalar uppercase. Equivalent to str.upper(). :param expr: :return: sequence or scalar """ return _string_op(expr, Upper) def _lstrip(expr, to_strip=None): """ Strip whitespace (including newlines) from each string in the sequence or scalar from left side. Equivalent to str.lstrip(). :param expr: :param to_strip: :return: sequence or sclaar """ return _string_op(expr, Lstrip, _to_strip=to_strip) def _rstrip(expr, to_strip=None): """ Strip whitespace (including newlines) from each string in the sequence or scalar from right side. Equivalent to str.rstrip(). :param expr: :param to_strip: :return: sequence or scalar """ return _string_op(expr, Rstrip, _to_strip=to_strip) def _split(expr, pat=None, n=-1): """ Split each string (a la re.split) in the Series/Index by given pattern, propagating NA values. Equivalent to str.split(). :param expr: :param pat: Separator to split on. If None, splits on whitespace :param n: not supported right now :return: list sequence or scalar """ return _string_op(expr, Split, output_type=types.List(types.string), _pat=pat, _n=n) def _rsplit(expr, pat=None, n=-1): """ Split each string in the Series/Index by the given delimiter string, starting at the end of the string and working to the front. Equivalent to str.rsplit(). :param expr: :param pat: Separator to split on. If None, splits on whitespace :param n: None, 0 and -1 will be interpreted as return all splits :return: sequence or scalar """ return _string_op(expr, RSplit, output_type=types.List(types.string), _pat=pat, _n=n) def _strip(expr, to_strip=None): """ Strip whitespace (including newlines) from each string in the sequence or scalar from left and right sides. Equivalent to str.strip(). :param expr: :param to_strip: :return: sequence or scalar """ return _string_op(expr, Strip, _to_strip=to_strip) def _pad(expr, width, side='left', fillchar=' '): """ Pad strings in the sequence or scalar with an additional character to specified side. :param expr: :param width: Minimum width of resulting string; additional characters will be filled with spaces :param side: {‘left’, ‘right’, ‘both’}, default ‘left’ :param fillchar: Additional character for filling, default is whitespace :return: sequence or scalar """ if not isinstance(fillchar, six.string_types): msg = 'fillchar must be a character, not {0}' raise TypeError(msg.format(type(fillchar).__name__)) if len(fillchar) != 1: raise TypeError('fillchar must be a character, not str') if side not in ('left', 'right', 'both'): raise ValueError('Invalid side') return _string_op(expr, Pad, _width=width, _side=side, _fillchar=fillchar) def _repeat(expr, repeats): """ Duplicate each string in the sequence or scalar by indicated number of times. :param expr: :param repeats: times :return: sequence or scalar """ return _string_op(expr, Repeat, _repeats=repeats) def _slice(expr, start=None, stop=None, step=None): """ Slice substrings from each element in the sequence or scalar :param expr: :param start: int or None :param stop: int or None :param step: int or None :return: sliced """ return _string_op(expr, Slice, _start=start, _end=stop, _step=step) def _getitem(expr, item): if isinstance(item, six.integer_types) or \ (isinstance(item, (SequenceExpr, Scalar)) and isinstance(item.dtype, types.Integer)): return _get(expr, item) elif isinstance(item, slice): return _slice(expr, start=item.start, stop=item.stop, step=item.step) else: raise TypeError('Unknown argument: %r' % item) def _swapcase(expr): """ Convert strings in the sequence or scalar to be swapcased. Equivalent to str.swapcase(). :param expr: :return: converted """ return _string_op(expr, Swapcase) def _title(expr): """ Convert strings in the sequence or scalar to titlecase. Equivalent to str.title(). :param expr: :return: converted """ return _string_op(expr, Title) def _zfill(expr, width): """ Filling left side of strings in the sequence or scalar with 0. Equivalent to str.zfill(). :param expr: :param width: Minimum width of resulting string; additional characters will be filled with 0 :return: filled """ return _string_op(expr, Zfill, _width=width) def _strptime(expr, date_format): """ Return datetimes specified by date_format, which supports the same string format as the python standard library. Details of the string format can be found in python string format doc :param expr: :param date_format: date format string (e.g. “%Y-%m-%d”) :type date_format: str :return: """ return _string_op(expr, Strptime, _date_format=date_format, output_type=types.datetime) def _isalnum(expr): """ Check whether all characters in each string in the sequence or scalar are alphanumeric. Equivalent to str.isalnum(). :param expr: :return: boolean sequence or scalar """ return _string_op(expr, Isalnum, output_type=types.boolean) def _isalpha(expr): """ Check whether all characters in each string in the sequence or scalar are alphabetic. Equivalent to str.isalpha(). :param expr: :return: boolean sequence or scalar """ return _string_op(expr, Isalpha, output_type=types.boolean) def _isdigit(expr): """ Check whether all characters in each string in the sequence or scalar are digits. Equivalent to str.isdigit(). :param expr: :return: boolean sequence or scalar """ return _string_op(expr, Isdigit, output_type=types.boolean) def _isspace(expr): """ Check whether all characters in each string in the sequence or scalar are whitespace. Equivalent to str.isspace(). :param expr: :return: boolean sequence or scalar """ return _string_op(expr, Isspace, output_type=types.boolean) def _islower(expr): """ Check whether all characters in each string in the sequence or scalar are lowercase. Equivalent to str.islower(). :param expr: :return: boolean sequence or scalar """ return _string_op(expr, Islower, output_type=types.boolean) def _isupper(expr): """ Check whether all characters in each string in the sequence or scalar are uppercase. Equivalent to str.isupper(). :param expr: :return: boolean sequence or scalar """ return _string_op(expr, Isupper, output_type=types.boolean) def _istitle(expr): """ Check whether all characters in each string in the sequence or scalar are titlecase. Equivalent to str.istitle(). :param expr: :return: boolean sequence or scalar """ return _string_op(expr, Istitle, output_type=types.boolean) def _isnumeric(expr): """ Check whether all characters in each string in the sequence or scalar are numeric. Equivalent to str.isnumeric(). :param expr: :return: boolean sequence or scalar """ return _string_op(expr, Isnumeric, output_type=types.boolean) def _isdecimal(expr): """ Check whether all characters in each string in the sequence or scalar are decimal. Equivalent to str.isdecimal(). :param expr: :return: boolean sequence or scalar """ return _string_op(expr, Isdecimal, output_type=types.boolean) def _todict(expr, item_delim=',', kv_delim='='): """ Convert the string sequence / expr into a string dict given item and key-value delimiters. :param expr: :param item_delim: delimiter between data items :param kv_delim: delimiter between keys and values :return: dict sequence or scalar """ return _string_op(expr, StringToDict, _item_delim=item_delim, _kv_delim=kv_delim, output_type=types.Dict(types.string, types.string)) _string_methods = dict( capitalize=_capitalize, contains=_contains, count=_count, endswith=_endswith, startswith=_startswith, extract=_extract, find=_find, rfind=_rfind, replace=_replace, get=_get, len=_len, ljust=_ljust, rjust=_rjust, lower=_lower, upper=_upper, lstrip=_lstrip, rstrip=_rstrip, split=_split, strip=_strip, pad=_pad, repeat=_repeat, slice=_slice, __getitem__=_getitem, swapcase=_swapcase, title=_title, zfill=_zfill, strptime=_strptime, isalnum=_isalnum, isalpha=_isalpha, isdigit=_isdigit, isspace=_isspace, islower=_islower, isupper=_isupper, istitle=_istitle, isnumeric=_isnumeric, isdecimal=_isdecimal, todict=_todict, ) utils.add_method(StringSequenceExpr, _string_methods) utils.add_method(StringScalar, _string_methods) utils.add_method(StringScalar, {'cat': _cat})