def compact()

in preprocess/WikiExtractor.py [0:0]


def compact(text):
    """Deal with headers, lists, empty sections, residuals of tables.
    :param text: convert to HTML.
    """

    page = []             # list of paragraph
    headers = {}          # Headers for unfilled sections
    emptySection = False  # empty sections are discarded
    listLevel = []        # nesting of lists
    listCount = []        # count of each list (it should be always in the same length of listLevel)
    for line in text.split('\n'):
        if not line:            # collapse empty lines
            # if there is an opening list, close it if we see an empty line
            if len(listLevel):
                page.append(line)
                if options.toHTML:
                    for c in reversed(listLevel):
                        page.append(listClose[c])
                listLevel = []
                listCount = []
                emptySection = False
            elif page and page[-1]:
                page.append('')
            continue
        # Handle section titles
        m = section.match(line)
        if m:
            title = m.group(2)
            lev = len(m.group(1)) # header level
            if options.toHTML:
                page.append("<h%d>%s</h%d>" % (lev, title, lev))
            if title and title[-1] not in '!?':
                title += '.'    # terminate sentence.
            headers[lev] = title
            # drop previous headers
            for i in list(headers.keys()):
                if i > lev:
                    del headers[i]
            emptySection = True
            listLevel = []
            listCount = []
            continue
        # Handle page title
        elif line.startswith('++'):
            title = line[2:-2]
            if title:
                if title[-1] not in '!?':
                    title += '.'
                page.append(title)
        # handle indents
        elif line[0] == ':':
            # page.append(line.lstrip(':*#;'))
            continue
        # handle lists
        elif line[0] in '*#;:':
            i = 0
            # c: current level char
            # n: next level char
            for c, n in zip_longest(listLevel, line, fillvalue=''):
                if not n or n not in '*#;:': # shorter or different
                    if c:
                        if options.toHTML:
                            page.append(listClose[c])
                        listLevel = listLevel[:-1]
                        listCount = listCount[:-1]
                        continue
                    else:
                        break
                # n != ''
                if c != n and (not c or (c not in ';:' and n not in ';:')):
                    if c:
                        # close level
                        if options.toHTML:
                            page.append(listClose[c])
                        listLevel = listLevel[:-1]
                        listCount = listCount[:-1]
                    listLevel += n
                    listCount.append(0)
                    if options.toHTML:
                        page.append(listOpen[n])
                i += 1
            n = line[i - 1]  # last list char
            line = line[i:].strip()
            if line:  # FIXME: n is '"'
                if options.keepLists:
                    if options.keepSections:
                        # emit open sections
                        items = sorted(headers.items())
                        for _, v in items:
                            page.append("Section::::" + v)
                    headers.clear()
                    # use item count for #-lines
                    listCount[i - 1] += 1
                    bullet = 'BULLET::::%d. ' % listCount[i - 1] if n == '#' else 'BULLET::::- '
                    page.append('{0:{1}s}'.format(bullet, len(listLevel)) + line)
                elif options.toHTML:
                    if n not in listItem: 
                        n = '*'
                    page.append(listItem[n] % line)
        elif len(listLevel):
            if options.toHTML:
                for c in reversed(listLevel):
                    page.append(listClose[c])
            listLevel = []
            listCount = []
            page.append(line)

        # Drop residuals of lists
        elif line[0] in '{|' or line[-1] == '}':
            continue
        # Drop irrelevant lines
        elif (line[0] == '(' and line[-1] == ')') or line.strip('.-') == '':
            continue
        elif len(headers):
            if options.keepSections:
                items = sorted(headers.items())
                for i, v in items:
                    page.append("Section::::" + v)
            headers.clear()
            page.append(line)  # first line
            emptySection = False
        elif not emptySection:
            # Drop preformatted
            if line[0] != ' ':  # dangerous
                page.append(line)
    return page