perfkitbenchmarker/regex_util.py (58 lines of code) (raw):

# Copyright 2014 PerfKitBenchmarker Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utilities for extracting benchmark results using regular expression."""

import re
from typing import Union

_IPV4_REGEX = r'[0-9]+(?:\.[0-9]+){3}'

# From https://docs.python.org/2/library/re.html#simulating-scanf.
FLOAT_REGEX = r'[-+]?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?'


class NoMatchError(ValueError):
    """Raised when no matches for a regex are found within a string."""

    pass


class TooManyMatchesError(ValueError):
    """Raised when a regex matches a string more times than expected."""

    pass


def ExtractGroup(regex, text, group=1, flags=0):
    """Extracts a string from a regular expression matched to 'text'.

    Args:
      regex: string or regexp pattern. Regular expression.
      text: string. Text to search.
      group: int. Group whose matched text is returned. Use '0' for the
        whole match.
      flags: int. Flags to pass to re.search().

    Returns:
      A string matched by 'regex' on 'text'.

    Raises:
      NoMatchError: when 'regex' does not match 'text'.
      IndexError: when 'group' is not present in the match.
    """
    match = re.search(regex, text, flags=flags)
    if not match:
        raise NoMatchError(
            'No match for pattern "{}" in "{}"'.format(regex, text)
        )

    try:
        return match.group(group)
    except IndexError as e:
        # Re-raise with a message that names the offending group and pattern.
        raise IndexError(
            'No such group {} in "{}".'.format(group, regex)
        ) from e


def ExtractFloat(regex, text, group=1, flags=0):
    """Extracts a float from a regular expression matched to 'text'.

    Args:
      regex: string or regexp pattern. Regular expression.
      text: string. Text to search.
      group: int. Group containing the floating point value. Use '0' for the
        whole match.
      flags: int. Flags to pass to re.search().

    Returns:
      The text of 'group' converted with float().

    Raises:
      NoMatchError: when 'regex' does not match 'text'.
      IndexError: when 'group' is not present in the match.
      ValueError: when the matched text cannot be converted to a float.
    """
    return float(ExtractGroup(regex, text, group=group, flags=flags))


def ExtractInt(regex, text, group=1, flags=0):
    """Extracts an int from a regular expression matched to 'text'.

    Args:
      regex: string or regexp pattern. Regular expression.
      text: string. Text to search.
      group: int. Group containing the integer value. Use '0' for the whole
        match.
      flags: int. Flags to pass to re.search().

    Returns:
      The text of 'group' converted with int().

    Raises:
      NoMatchError: when 'regex' does not match 'text'.
      IndexError: when 'group' is not present in the match.
      ValueError: when the matched text cannot be converted to an int.
    """
    return int(ExtractGroup(regex, text, group=group, flags=flags))


def ExtractAllFloatMetrics(
    text, metric_regex=r'\w+', value_regex=FLOAT_REGEX, delimiter_regex='='
):
    """Extracts metrics and their values into a dict.

    Args:
      text: The text to parse to find metric and values.
      metric_regex: A regular expression to find metric names. The metric
        regex should not contain any parenthesized groups.
      value_regex: A regular expression to find float values. By default,
        this works well for floating-point numbers found via scanf.
      delimiter_regex: A regular expression between the metric name and
        value.

    Returns:
      A dict mapping metrics to values. If a metric name occurs more than
      once, the last occurrence wins.

    Raises:
      NotImplementedError: when 'metric_regex' contains a '(' character.
    """
    if '(' in metric_regex:
        raise NotImplementedError(
            'ExtractAllFloatMetrics does not support a metric regex with '
            'groups.'
        )

    # Named groups keep the metric/value lookup independent of any capturing
    # groups that appear inside the delimiter or value expressions.
    pattern = '(?P<metric>%s)%s(?P<value>%s)' % (
        metric_regex,
        delimiter_regex,
        value_regex,
    )
    return {
        found.group('metric'): float(found.group('value'))
        for found in re.finditer(pattern, text)
    }


def ExtractIpv4Addresses(text):
    """Extracts all ipv4 addresses within 'text'.

    Args:
      text: string. Text to search.

    Returns:
      A list of ipv4 strings.

    Raises:
      NoMatchError: when no ipv4 address is found.
    """
    match = re.findall(_IPV4_REGEX, text)
    if not match:
        raise NoMatchError('No match for ipv4 addresses in "{}"'.format(text))
    return match


def ExtractAllMatches(regex: Union[str, re.Pattern[str]], text, flags=0):
    """Extracts all matches from a regular expression matched within 'text'.

    Please note that this function will return a list of strings if regex
    does not contain any capturing groups, matching the behavior of
    re.findall:

    >>> re.findall(r'bar', 'foo foo bar foo bar foo')
    ['bar', 'bar']

    Args:
      regex: string or compiled pattern. Regular expression.
      text: string. Text to search.
      flags: int. Flags to pass to re.findall().

    Returns:
      The result of re.findall(): a list of strings when 'regex' has at most
      one capturing group, otherwise a list of tuples of strings.

    Raises:
      NoMatchError: when 'regex' does not match 'text'.
    """
    match = re.findall(regex, text, flags=flags)
    if not match:
        raise NoMatchError(
            'No match for pattern "{}" in "{}"'.format(regex, text)
        )
    return match


def ExtractExactlyOneMatch(regex, text, flags=0):
    """Extracts exactly one match of a regular expression from 'text'.

    Args:
      regex: string. Regular expression, possibly with capturing group.
      text: string. The text to search.
      flags: int. Flags to pass to re.findall().

    Returns:
      The contents of the capturing group in the regex. If no capturing
      group is present, the text that matched the expression.

    Raises:
      NoMatchError: if 'regex' does not match 'text'.
      TooManyMatchesError: if 'regex' matches 'text' more than once.
    """
    matches = ExtractAllMatches(regex, text, flags=flags)
    if len(matches) > 1:
        raise TooManyMatchesError(
            'Pattern "{}" matched "{}" non-uniquely.'.format(regex, text)
        )
    return matches[0]


def Substitute(pattern, repl, text):
    """Substitute all 'pattern' in 'text' with 'repl'.

    Args:
      pattern: string. Pattern to be replaced.
      repl: string. Replacement pattern.
      text: string. Text to search.

    Returns:
      A string after replacing all patterns with repl.

    Raises:
      NoMatchError: when 'pattern' isn't found in string.
    """
    # Check for a match first so that a missing pattern is always reported as
    # NoMatchError, before 'repl' is ever processed.
    if not re.search(pattern, text):
        raise NoMatchError(
            'No match for pattern "{}" in "{}"'.format(pattern, text)
        )
    return re.sub(pattern, repl, text)