in preprocess/WikiExtractor.py [0:0]
def findMatchingBraces(text, ldelim=0):
    """
    Locate balanced top-level brace/bracket constructs in wiki markup.

    This is a generator.  Scanning restarts after each reported (or
    abandoned) construct, so the yielded spans never overlap and nested
    constructs are reported only through their outermost opening.

    :param text: the markup to scan.
    :param ldelim: number of braces to match. 0 means match [[]], {{}} and {{{}}}.
        When non-zero, only runs of at least ``ldelim`` opening braces start
        a construct and square brackets are ignored altogether.
    :return: yields ``(start, end)`` index pairs such that ``text[start:end]``
        is one balanced construct (``end`` is exclusive; stray closing
        characters left over from a longer run are excluded from the span).
        Iteration stops as soon as an opening has no balancing close.
    """
    # Parsing is done with respect to pairs of double braces {{..}} delimiting
    # a template, and pairs of triple braces {{{..}}} delimiting a tplarg.
    # If double opening braces are followed by triple closing braces or
    # conversely, this is taken as delimiting a template, with one left-over
    # brace outside it, taken as plain text. For any pattern of braces this
    # defines a set of templates and tplargs such that any two are either
    # separate or nested (not overlapping).
    # Unmatched double rectangular closing brackets can be in a template or
    # tplarg, but unmatched double rectangular opening brackets cannot.
    # Unmatched double or triple closing braces inside a pair of
    # double rectangular brackets are treated as plain text.
    # Other formulation: in ambiguity between template or tplarg on one hand,
    # and a link on the other hand, the structure with the rightmost opening
    # takes precedence, even if this is the opening of a link without any
    # closing, so not producing an actual link.
    # In the case of more than three opening braces the last three are assumed
    # to belong to a tplarg, unless there is no matching triple of closing
    # braces, in which case the last two opening braces are assumed to
    # belong to a template.
    # We must skip individual { like in:
    # {{#ifeq: {{padleft:|1|}} | { | | }}
    # We must resolve ambiguities like this:
    # {{{{ }}}} -> { {{{ }}} }
    # {{{{{ }}}}} -> {{ {{{ }}} }}
    # {{#if:{{{{{#if:{{{nominee|}}}|nominee|candidate}}|}}}|...}}
    # {{{!}} {{!}}}
    # Handle:
    # {{{{{|safesubst:}}}#Invoke:String|replace|{{{1|{{{{{|safesubst:}}}PAGENAME}}}}}|%s+%([^%(]-%)$||plain=false}}
    # as well as expressions with stray }:
    # {{{link|{{ucfirst:{{{1}}}}}} interchange}}}

    # Raw strings: '\[' in a plain literal is an invalid escape sequence.
    if ldelim:  # 2-3
        reOpen = re.compile(r'[{]{%d,}' % ldelim)  # at least ldelim
        reNext = re.compile(r'[{]{2,}|}{2,}')  # at least 2
    else:
        reOpen = re.compile(r'{{2,}|\[{2,}')
        reNext = re.compile(r'{{2,}|}{2,}|\[{2,}|]{2,}')  # at least 2

    cur = 0
    while True:
        m1 = reOpen.search(text, cur)
        if not m1:
            return
        lmatch = m1.end() - m1.start()
        # Stack of still-open run lengths: positive for a run of '{',
        # negative for a run of '['.
        if m1.group()[0] == '{':
            stack = [lmatch]
        else:
            stack = [-lmatch]
        end = m1.end()
        while True:
            m2 = reNext.search(text, end)
            if not m2:
                return  # unbalanced
            end = m2.end()
            brac = m2.group()[0]
            lmatch = m2.end() - m2.start()

            if brac == '{':
                stack.append(lmatch)
            elif brac == '}':
                # Closing braces may only consume brace openings.  When the
                # innermost open structure is a [[ link, the braces are plain
                # text (see the rules above), so the stack is left untouched.
                while stack and stack[-1] > 0:
                    openCount = stack.pop()  # opening span
                    if lmatch >= openCount:
                        lmatch -= openCount
                        if lmatch <= 1:  # either close or stray }
                            break
                    else:
                        # put back unmatched
                        stack.append(openCount - lmatch)
                        break
                if not stack:
                    # Everything is closed; lmatch now counts left-over stray
                    # closers, which are excluded from the reported span.
                    yield m1.start(), end - lmatch
                    cur = end
                    break
                elif len(stack) == 1 and 0 < stack[0] < ldelim:
                    # ambiguous {{{{{ }}} }}: fewer than ldelim opening braces
                    # remain, so give up on this opening without yielding.
                    cur = end
                    break
            elif brac == '[':  # [[
                stack.append(-lmatch)
            else:  # ]]
                while stack and stack[-1] < 0:  # matching [[
                    openCount = -stack.pop()
                    if lmatch >= openCount:
                        lmatch -= openCount
                        if lmatch <= 1:  # either close or stray ]
                            break
                    else:
                        # put back unmatched (negative)
                        stack.append(lmatch - openCount)
                        break
                if not stack:
                    yield m1.start(), end - lmatch
                    cur = end
                    break
                # unmatched ]] are discarded