in preprocess/WikiExtractor.py [0:0]
_LIST_MARKERS = '*#;:'  # wikitext characters that may prefix a list item


def _close_lists(page, levels):
    """Append the HTML closing tag of every level in *levels*, innermost first.

    Does nothing unless ``options.toHTML`` is set.
    """
    if options.toHTML:
        for marker in reversed(levels):
            page.append(listClose[marker])


def _emit_open_sections(page, headers):
    """Flush the pending section headers into *page* and forget them.

    The headers are written (ordered by level) only when
    ``options.keepSections`` is set; they are cleared in either case.
    """
    if options.keepSections:
        for lev in sorted(headers):
            page.append("Section::::" + headers[lev])
    headers.clear()


def compact(text):
    """Deal with headers, lists, empty sections, residuals of tables.

    The text is processed line by line.  Output depends on the module-level
    ``options`` object (``toHTML``, ``keepLists``, ``keepSections``) and on
    the module-level ``section`` pattern and ``listOpen`` / ``listClose`` /
    ``listItem`` tables.

    NOTE(review): the block nesting below was reconstructed from a listing
    whose indentation had been stripped -- confirm against the original file.

    :param text: newline-separated text to process.
    :return: list of output lines (paragraphs, list items, section markers
        and, when ``options.toHTML`` is set, HTML tags).
    """
    page = []             # list of paragraph
    headers = {}          # pending headers of sections with no output yet
    emptySection = False  # empty sections are discarded
    listLevel = []        # marker char of each open list level, outermost first
    listCount = []        # item counter per open level (same length as listLevel)

    for line in text.split('\n'):
        if not line:  # collapse empty lines
            if listLevel:
                # an empty line terminates every open list
                page.append(line)
                _close_lists(page, listLevel)
                listLevel = []
                listCount = []
                emptySection = False
            elif page and page[-1]:
                page.append('')
            continue

        # Handle section titles
        m = section.match(line)
        if m:
            title = m.group(2)
            lev = len(m.group(1))  # header level
            if options.toHTML:
                page.append("<h%d>%s</h%d>" % (lev, title, lev))
            if title and title[-1] not in '!?':
                title += '.'  # terminate sentence.
            headers[lev] = title
            # drop pending headers that are nested below the new one
            for deeper in [k for k in headers if k > lev]:
                del headers[deeper]
            emptySection = True
            # open lists are forgotten here; no closing tags are emitted
            listLevel = []
            listCount = []
            continue

        # Handle page title
        if line.startswith('++'):
            title = line[2:-2]
            if title:
                if title[-1] not in '!?':
                    title += '.'
                page.append(title)
        # handle indents
        elif line[0] == ':':
            continue
        # handle lists
        elif line[0] in _LIST_MARKERS:
            # Isolate the marker prefix first, so that characters of the item
            # text are never mistaken for list markers.
            depth = 0
            while depth < len(line) and line[depth] in _LIST_MARKERS:
                depth += 1
            markers = line[:depth]

            # Number of leading open levels that the new prefix keeps.
            # ';' and ':' are treated as compatible with any marker.
            keep = 0
            for c, n in zip(listLevel, markers):
                if c != n and c not in ';:' and n not in ';:':
                    break
                keep += 1

            # Close everything below the kept levels (innermost first) ...
            _close_lists(page, listLevel[keep:])
            listLevel = listLevel[:keep]
            listCount = listCount[:keep]
            # ... then open the levels the prefix adds.
            for n in markers[keep:]:
                listLevel.append(n)
                listCount.append(0)
                if options.toHTML:
                    page.append(listOpen[n])

            n = markers[-1]  # last list char
            line = line[depth:].strip()
            if line:
                if options.keepLists:
                    # emit open sections
                    _emit_open_sections(page, headers)
                    # the section now has content, so later text must be kept
                    emptySection = False
                    # use item count for #-lines
                    listCount[depth - 1] += 1
                    bullet = 'BULLET::::%d. ' % listCount[depth - 1] if n == '#' else 'BULLET::::- '
                    page.append('{0:{1}s}'.format(bullet, len(listLevel)) + line)
                elif options.toHTML:
                    if n not in listItem:
                        n = '*'
                    page.append(listItem[n] % line)
        elif listLevel:
            # a non-list line terminates every open list
            _close_lists(page, listLevel)
            listLevel = []
            listCount = []
            page.append(line)
        # Drop residuals of lists
        elif line[0] in '{|' or line[-1] == '}':
            continue
        # Drop irrelevant lines
        elif (line[0] == '(' and line[-1] == ')') or line.strip('.-') == '':
            continue
        elif headers:
            _emit_open_sections(page, headers)
            page.append(line)  # first line
            emptySection = False
        elif not emptySection:
            # Drop preformatted
            if line[0] != ' ':  # dangerous
                page.append(line)
    return page