perfkitbenchmarker/regex_util.py (58 lines of code) (raw):
# Copyright 2014 PerfKitBenchmarker Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utilities for extracting benchmark results using regular expression."""
import re
from typing import Union
# Loose matcher for dotted-quad IPv4 addresses: four runs of decimal digits
# separated by dots. Octet values are not range-checked, so strings such as
# '999.1.1.1' also match.
_IPV4_REGEX = r'[0-9]+(?:\.[0-9]+){3}'
# Matches a floating-point literal: optional sign, integer and/or fractional
# digits, and an optional exponent.
# From https://docs.python.org/2/library/re.html#simulating-scanf.
# NOTE: this pattern contains capturing groups of its own, which shifts group
# numbering when it is embedded inside a larger pattern.
FLOAT_REGEX = r'[-+]?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?'
class NoMatchError(ValueError):
  """Signals that a regular expression found nothing in the searched text.

  Subclasses ValueError, so handlers written for ValueError also catch it.
  """
class TooManyMatchesError(ValueError):
  """Signals that a regular expression matched more often than allowed.

  Subclasses ValueError, so handlers written for ValueError also catch it.
  """
def ExtractGroup(regex, text, group=1, flags=0):
  """Returns one group of the first match of 'regex' inside 'text'.

  Args:
    regex: string or compiled pattern. The regular expression to search for.
    text: string. The text that is searched.
    group: int. Index of the group to return; '0' selects the entire matched
      substring.
    flags: int. Flags forwarded to re.search().

  Returns:
    The text captured by the requested group of the first match.

  Raises:
    NoMatchError: if 'regex' matches nowhere in 'text'.
    IndexError: if the match has no group numbered 'group'.
  """
  found = re.search(regex, text, flags=flags)
  if found is None:
    message = 'No match for pattern "{}" in "{}"'.format(regex, text)
    raise NoMatchError(message)

  try:
    captured = found.group(group)
  except IndexError as err:
    # Re-raise with a message naming the offending group and pattern; the
    # original exception stays attached as the cause.
    raise IndexError('No such group {} in "{}".'.format(group, regex)) from err
  return captured
def ExtractFloat(regex, text, group=1, flags=0):
  """Returns a group of the first match of 'regex' in 'text' as a float.

  Args:
    regex: string or compiled pattern. The regular expression to search for.
    text: string. The text that is searched.
    group: int. Index of the group holding the number; '0' selects the entire
      matched substring.
    flags: int. Flags forwarded to ExtractGroup().

  Returns:
    The captured text converted with float().
  """
  captured = ExtractGroup(regex, text, group=group, flags=flags)
  return float(captured)
def ExtractInt(regex, text, group=1, flags=0):
  """Returns a group of the first match of 'regex' in 'text' as an int.

  Args:
    regex: string or compiled pattern. The regular expression to search for.
    text: string. The text that is searched.
    group: int. Index of the group holding the number; '0' selects the entire
      matched substring.
    flags: int. Flags forwarded to ExtractGroup(). Defaults to 0 (no flags),
      mirroring the signature of ExtractFloat().

  Returns:
    The captured text converted with int().
  """
  return int(ExtractGroup(regex, text, group=group, flags=flags))
def ExtractAllFloatMetrics(
    text, metric_regex=r'\w+', value_regex=FLOAT_REGEX, delimiter_regex='='
):
  """Extracts metrics and their values into a dict.

  Every occurrence of "<metric><delimiter><value>" in 'text' contributes one
  entry. If a metric name occurs more than once, the last occurrence wins.

  Args:
    text: The text to parse to find metric and values.
    metric_regex: A regular expression to find metric names. The metric regex
      should not contain any parenthesized groups.
    value_regex: A regular expression to find float values. By default, this
      works well for floating-point numbers found via scanf.
    delimiter_regex: A regular expression between the metric name and value.
      It may contain capturing groups and top-level alternation.

  Returns:
    A dict mapping metrics to values.

  Raises:
    NotImplementedError: if 'metric_regex' contains a '(' character.
  """
  if '(' in metric_regex:
    raise NotImplementedError(
        'ExtractAllFloatMetrics does not support a metric regex with groups.'
    )
  # Named groups pick out the metric and the value no matter how many
  # capturing groups the delimiter or value expressions contain. The delimiter
  # is wrapped in a non-capturing group so that an alternation inside it cannot
  # split the combined pattern.
  pattern = '(?P<pkb_metric>%s)(?:%s)(?P<pkb_value>%s)' % (
      metric_regex,
      delimiter_regex,
      value_regex,
  )
  return {
      found.group('pkb_metric'): float(found.group('pkb_value'))
      for found in re.finditer(pattern, text)
  }
def ExtractIpv4Addresses(text):
  """Finds every ipv4-looking address contained in 'text'.

  Args:
    text: string. The text that is searched.

  Returns:
    A list with each matched address string, in order of appearance.

  Raises:
    NoMatchError: if 'text' contains no ipv4 address.
  """
  addresses = re.findall(_IPV4_REGEX, text)
  if addresses:
    return addresses
  raise NoMatchError('No match for ipv4 addresses in "{}"'.format(text))
def ExtractAllMatches(regex: Union[str, re.Pattern[str]], text, flags=0):
  """Returns every match of 'regex' inside 'text', as re.findall does.

  The shape of the result follows re.findall: when 'regex' has no capturing
  group each element is the whole matched string, for example:

  >>> re.findall(r'bar', 'foo foo bar foo bar foo')
  ['bar', 'bar']

  With exactly one capturing group each element is that group's text, and
  with several groups each element is a tuple of the groups' texts.

  Args:
    regex: string or compiled pattern. The regular expression to search for.
    text: string. The text that is searched.
    flags: int. Flags forwarded to re.findall().

  Returns:
    A non-empty list of matches, shaped as described above.

  Raises:
    NoMatchError: if 'regex' matches nowhere in 'text'.
  """
  found = re.findall(regex, text, flags=flags)
  if found:
    return found
  message = 'No match for pattern "{}" in "{}"'.format(regex, text)
  raise NoMatchError(message)
def ExtractExactlyOneMatch(regex, text):
  """Returns the single match of 'regex' in 'text', insisting it is unique.

  Args:
    regex: string. Regular expression, optionally with a capturing group.
    text: string. The text that is searched.

  Returns:
    The only element produced by ExtractAllMatches(): the capturing group's
    contents if 'regex' has one, otherwise the text matched by the whole
    expression.

  Raises:
    NoMatchError: if 'regex' matches nowhere in 'text'.
    TooManyMatchesError: if 'regex' matches 'text' more than once.
  """
  # ExtractAllMatches() raises NoMatchError itself when nothing is found.
  found = ExtractAllMatches(regex, text)
  if len(found) <= 1:
    return found[0]
  message = 'Pattern "{}" matched "{}" non-uniquely.'.format(regex, text)
  raise TooManyMatchesError(message)
def Substitute(pattern, repl, text):
  """Replaces every occurrence of 'pattern' in 'text' with 'repl'.

  Args:
    pattern: string. The regular expression whose matches are replaced.
    repl: string. The replacement, as accepted by re.sub().
    text: string. The text that is searched.

  Returns:
    A copy of 'text' in which every match of 'pattern' is replaced by 'repl'.

  Raises:
    NoMatchError: if 'pattern' matches nowhere in 'text'.
  """
  # Check for a match first: re.sub() alone would silently return the text
  # unchanged when there is nothing to replace.
  if re.search(pattern, text) is None:
    message = 'No match for pattern "{}" in "{}"'.format(pattern, text)
    raise NoMatchError(message)
  replaced = re.sub(pattern, repl, text)
  return replaced