in tfx/components/example_gen/utils.py [0:0]
def _create_matching_glob_and_regex(
        uri: str, split: example_gen_pb2.Input.Split, is_match_span: bool,
        is_match_date: bool, is_match_version: bool,
        range_config: Optional[range_config_pb2.RangeConfig]
) -> Tuple[str, str]:
    """Builds the glob and regex patterns used to locate span/version files.

    The split pattern is joined onto `uri`. The joined path is used as-is as
    the starting glob pattern, and is passed through `_glob_to_regex` to get
    the starting regex pattern. The span, date and/or version specs found in
    the pattern are then substituted as follows:

    * Span spec (`is_match_span`): replaced by `*` in the glob and by a named
      capture group in the regex. The group matches greedily (`.*`) unless
      the spec carries a width modifier, in which case it matches exactly
      that many characters.
    * Date specs (`is_match_date`, only considered when `is_match_span` is
      False): each entry of `DATE_SPECS` is replaced by `*` in the glob and
      by a fixed-width capture group named `year`, `month` or `day` in the
      regex.
    * Version spec (`is_match_version`): replaced by `*` in the glob and by a
      named capture group in the regex, honouring a width modifier in the
      same way as the span spec.

    When `range_config` has its `static_range` field set, the span spec (or
    the date specs) is instead replaced, in both the glob and the regex, by
    the value(s) returned from `_get_span_replace_glob_and_regex`: the span
    number of the static range (zero padded when a span width modifier is
    given), or that span number mapped back to a calendar date.

    Args:
      uri: The base path from which files will be searched.
      split: An example_gen_pb2.Input.Split object whose `pattern` is the
        split pattern to be searched on.
      is_match_span: True if a span spec is present, False otherwise.
      is_match_date: True if date specs are present, False otherwise.
      is_match_version: True if a version spec is present, False otherwise.
      range_config: An instance of range_config_pb2.RangeConfig which
        specifies which spans to consider when finding the most recent span
        and version. If unset, the search for the latest span number is
        unrestricted.

    Returns:
      A tuple of two strings. The first is a glob pattern identifying the
      relevant files to process; the second is a regex pattern containing
      capture groups for span, date and/or version (whichever of the
      matching flags are set).
    """
    full_pattern = os.path.join(uri, split.pattern)
    glob_pattern = full_pattern
    regex_pattern = _glob_to_regex(full_pattern)

    # Formats a named capture group; shared by the span, date and version
    # substitutions below.
    named_group = '(?P<{}>{})'.format

    if is_match_span:
        # Without a width modifier the span is matched greedily; with one,
        # exactly that many characters are matched.
        span_width = _get_spec_width(SPAN_FULL_REGEX, SPAN_PROPERTY_NAME,
                                     split)
        span_glob = '*'
        span_regex = '.{%s}' % span_width if span_width else '.*'

        # A static range pins the span, so the literal span string replaces
        # the wildcards in both patterns.
        if range_config and range_config.HasField('static_range'):
            span_glob = span_regex = _get_span_replace_glob_and_regex(
                range_config, is_match_span, is_match_date, span_width)

        glob_pattern = re.sub(SPAN_FULL_REGEX, span_glob, glob_pattern)
        regex_pattern = re.sub(
            SPAN_FULL_REGEX, named_group(SPAN_PROPERTY_NAME, span_regex),
            regex_pattern)

    elif is_match_date:
        date_globs = ['*', '*', '*']
        # Fixed character counts for year, month and day (in that order), so
        # that date stamps written without separators can still be split.
        date_regexes = ['.{4}', '.{2}', '.{2}']

        # A static range pins the date, so its tokens replace the wildcards
        # in both patterns.
        if range_config and range_config.HasField('static_range'):
            date_globs = date_regexes = _get_span_replace_glob_and_regex(
                range_config, is_match_span, is_match_date, None)

        for date_spec, wildcard in zip(DATE_SPECS, date_globs):
            glob_pattern = glob_pattern.replace(date_spec, wildcard)

        date_parts = zip(DATE_SPECS, date_regexes, ['year', 'month', 'day'])
        for date_spec, matcher, part_name in date_parts:
            regex_pattern = regex_pattern.replace(
                date_spec, named_group(part_name, matcher))

    if is_match_version:
        # Same width handling as for the span spec: greedy unless a width
        # modifier is present.
        version_width = _get_spec_width(VERSION_FULL_REGEX,
                                        VERSION_PROPERTY_NAME, split)
        version_regex = '.{%s}' % version_width if version_width else '.*'

        glob_pattern = re.sub(VERSION_FULL_REGEX, '*', glob_pattern)
        regex_pattern = re.sub(
            VERSION_FULL_REGEX,
            named_group(VERSION_PROPERTY_NAME, version_regex),
            regex_pattern)

    return glob_pattern, regex_pattern