in Lib/sre_parse.py [0:0]
def _parse(source, state, verbose, nested, first=False):
# parse a simple pattern
subpattern = SubPattern(state)
# precompute constants into local variables
subpatternappend = subpattern.append
sourceget = source.get
sourcematch = source.match
_len = len
_ord = ord
while True:
this = source.next
if this is None:
break # end of pattern
if this in "|)":
break # end of subpattern
sourceget()
if verbose:
# skip whitespace and comments
if this in WHITESPACE:
continue
if this == "#":
while True:
this = sourceget()
if this is None or this == "\n":
break
continue
if this[0] == "\\":
code = _escape(source, this, state)
subpatternappend(code)
elif this not in SPECIAL_CHARS:
subpatternappend((LITERAL, _ord(this)))
elif this == "[":
here = source.tell() - 1
# character set
set = []
setappend = set.append
## if sourcematch(":"):
## pass # handle character classes
if source.next == '[':
import warnings
warnings.warn(
'Possible nested set at position %d' % source.tell(),
FutureWarning, stacklevel=nested + 6
)
negate = sourcematch("^")
# check remaining characters
while True:
this = sourceget()
if this is None:
raise source.error("unterminated character set",
source.tell() - here)
if this == "]" and set:
break
elif this[0] == "\\":
code1 = _class_escape(source, this)
else:
if set and this in '-&~|' and source.next == this:
import warnings
warnings.warn(
'Possible set %s at position %d' % (
'difference' if this == '-' else
'intersection' if this == '&' else
'symmetric difference' if this == '~' else
'union',
source.tell() - 1),
FutureWarning, stacklevel=nested + 6
)
code1 = LITERAL, _ord(this)
if sourcematch("-"):
# potential range
that = sourceget()
if that is None:
raise source.error("unterminated character set",
source.tell() - here)
if that == "]":
if code1[0] is IN:
code1 = code1[1][0]
setappend(code1)
setappend((LITERAL, _ord("-")))
break
if that[0] == "\\":
code2 = _class_escape(source, that)
else:
if that == '-':
import warnings
warnings.warn(
'Possible set difference at position %d' % (
source.tell() - 2),
FutureWarning, stacklevel=nested + 6
)
code2 = LITERAL, _ord(that)
if code1[0] != LITERAL or code2[0] != LITERAL:
msg = "bad character range %s-%s" % (this, that)
raise source.error(msg, len(this) + 1 + len(that))
lo = code1[1]
hi = code2[1]
if hi < lo:
msg = "bad character range %s-%s" % (this, that)
raise source.error(msg, len(this) + 1 + len(that))
setappend((RANGE, (lo, hi)))
else:
if code1[0] is IN:
code1 = code1[1][0]
setappend(code1)
set = _uniq(set)
# XXX: <fl> should move set optimization to compiler!
if _len(set) == 1 and set[0][0] is LITERAL:
# optimization
if negate:
subpatternappend((NOT_LITERAL, set[0][1]))
else:
subpatternappend(set[0])
else:
if negate:
set.insert(0, (NEGATE, None))
# charmap optimization can't be added here because
# global flags still are not known
subpatternappend((IN, set))
elif this in REPEAT_CHARS:
# repeat previous item
here = source.tell()
if this == "?":
min, max = 0, 1
elif this == "*":
min, max = 0, MAXREPEAT
elif this == "+":
min, max = 1, MAXREPEAT
elif this == "{":
if source.next == "}":
subpatternappend((LITERAL, _ord(this)))
continue
min, max = 0, MAXREPEAT
lo = hi = ""
while source.next in DIGITS:
lo += sourceget()
if sourcematch(","):
while source.next in DIGITS:
hi += sourceget()
else:
hi = lo
if not sourcematch("}"):
subpatternappend((LITERAL, _ord(this)))
source.seek(here)
continue
if lo:
min = int(lo)
if min >= MAXREPEAT:
raise OverflowError("the repetition number is too large")
if hi:
max = int(hi)
if max >= MAXREPEAT:
raise OverflowError("the repetition number is too large")
if max < min:
raise source.error("min repeat greater than max repeat",
source.tell() - here)
else:
raise AssertionError("unsupported quantifier %r" % (char,))
# figure out which item to repeat
if subpattern:
item = subpattern[-1:]
else:
item = None
if not item or item[0][0] is AT:
raise source.error("nothing to repeat",
source.tell() - here + len(this))
if item[0][0] in _REPEATCODES:
raise source.error("multiple repeat",
source.tell() - here + len(this))
if item[0][0] is SUBPATTERN:
group, add_flags, del_flags, p = item[0][1]
if group is None and not add_flags and not del_flags:
item = p
if sourcematch("?"):
subpattern[-1] = (MIN_REPEAT, (min, max, item))
else:
subpattern[-1] = (MAX_REPEAT, (min, max, item))
elif this == ".":
subpatternappend((ANY, None))
elif this == "(":
start = source.tell() - 1
group = True
name = None
add_flags = 0
del_flags = 0
if sourcematch("?"):
# options
char = sourceget()
if char is None:
raise source.error("unexpected end of pattern")
if char == "P":
# python extensions
if sourcematch("<"):
# named group: skip forward to end of name
name = source.getuntil(">", "group name")
if not name.isidentifier():
msg = "bad character in group name %r" % name
raise source.error(msg, len(name) + 1)
elif sourcematch("="):
# named backreference
name = source.getuntil(")", "group name")
if not name.isidentifier():
msg = "bad character in group name %r" % name
raise source.error(msg, len(name) + 1)
gid = state.groupdict.get(name)
if gid is None:
msg = "unknown group name %r" % name
raise source.error(msg, len(name) + 1)
if not state.checkgroup(gid):
raise source.error("cannot refer to an open group",
len(name) + 1)
state.checklookbehindgroup(gid, source)
subpatternappend((GROUPREF, gid))
continue
else:
char = sourceget()
if char is None:
raise source.error("unexpected end of pattern")
raise source.error("unknown extension ?P" + char,
len(char) + 2)
elif char == ":":
# non-capturing group
group = None
elif char == "#":
# comment
while True:
if source.next is None:
raise source.error("missing ), unterminated comment",
source.tell() - start)
if sourceget() == ")":
break
continue
elif char in "=!<":
# lookahead assertions
dir = 1
if char == "<":
char = sourceget()
if char is None:
raise source.error("unexpected end of pattern")
if char not in "=!":
raise source.error("unknown extension ?<" + char,
len(char) + 2)
dir = -1 # lookbehind
lookbehindgroups = state.lookbehindgroups
if lookbehindgroups is None:
state.lookbehindgroups = state.groups
p = _parse_sub(source, state, verbose, nested + 1)
if dir < 0:
if lookbehindgroups is None:
state.lookbehindgroups = None
if not sourcematch(")"):
raise source.error("missing ), unterminated subpattern",
source.tell() - start)
if char == "=":
subpatternappend((ASSERT, (dir, p)))
else:
subpatternappend((ASSERT_NOT, (dir, p)))
continue
elif char == "(":
# conditional backreference group
condname = source.getuntil(")", "group name")
if condname.isidentifier():
condgroup = state.groupdict.get(condname)
if condgroup is None:
msg = "unknown group name %r" % condname
raise source.error(msg, len(condname) + 1)
else:
try:
condgroup = int(condname)
if condgroup < 0:
raise ValueError
except ValueError:
msg = "bad character in group name %r" % condname
raise source.error(msg, len(condname) + 1) from None
if not condgroup:
raise source.error("bad group number",
len(condname) + 1)
if condgroup >= MAXGROUPS:
msg = "invalid group reference %d" % condgroup
raise source.error(msg, len(condname) + 1)
state.checklookbehindgroup(condgroup, source)
item_yes = _parse(source, state, verbose, nested + 1)
if source.match("|"):
item_no = _parse(source, state, verbose, nested + 1)
if source.next == "|":
raise source.error("conditional backref with more than two branches")
else:
item_no = None
if not source.match(")"):
raise source.error("missing ), unterminated subpattern",
source.tell() - start)
subpatternappend((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
continue
elif char in FLAGS or char == "-":
# flags
flags = _parse_flags(source, state, char)
if flags is None: # global flags
if not first or subpattern:
import warnings
warnings.warn(
'Flags not at the start of the expression %r%s' % (
source.string[:20], # truncate long regexes
' (truncated)' if len(source.string) > 20 else '',
),
DeprecationWarning, stacklevel=nested + 6
)
if (state.flags & SRE_FLAG_VERBOSE) and not verbose:
raise Verbose
continue
add_flags, del_flags = flags
group = None
else:
raise source.error("unknown extension ?" + char,
len(char) + 1)
# parse group contents
if group is not None:
try:
group = state.opengroup(name)
except error as err:
raise source.error(err.msg, len(name) + 1) from None
sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and
not (del_flags & SRE_FLAG_VERBOSE))
p = _parse_sub(source, state, sub_verbose, nested + 1)
if not source.match(")"):
raise source.error("missing ), unterminated subpattern",
source.tell() - start)
if group is not None:
state.closegroup(group, p)
subpatternappend((SUBPATTERN, (group, add_flags, del_flags, p)))
elif this == "^":
subpatternappend((AT, AT_BEGINNING))
elif this == "$":
subpatternappend((AT, AT_END))
else:
raise AssertionError("unsupported special character %r" % (char,))
# unpack non-capturing groups
for i in range(len(subpattern))[::-1]:
op, av = subpattern[i]
if op is SUBPATTERN:
group, add_flags, del_flags, p = av
if group is None and not add_flags and not del_flags:
subpattern[i: i+1] = p
return subpattern