'''
asfgenid
===================================
Generates heading IDs, element IDs, and permalinks.
First, find all specified IDs and classes; ensure each ID and permalink is unique.
Next, find all headings missing IDs; ensure each ID and permalink is unique.
Finally, generate a Table of Contents.
'''

# from __future__ import unicode_literals

import sys
import traceback
import re
import unicodedata

from bs4 import BeautifulSoup, Comment

import pelican.contents
import pelican.plugins.signals

'''
Based on
https://github.com/waylan/Python-Markdown/blob/master/markdown/extensions/headerid.py
Which is BSD licensed, but is very much rewritten.
'''

# Default plugin settings, installed under the 'ASF_GENID' settings key by init_default_config().
ASF_GENID = {
    'unsafe_tags': True,        # fix script, style, and iframe html that gfm filters as unsafe
    'metadata': True,           # expand {{ metadata }} references found in the html.
    'elements': True,           # process {#id} and {.class} annotations.
    'headings': True,           # add slugified id to headings missing id. Can be overridden by page metadata.
    'headings_re': r'^h[1-6]',  # regex for which headings to check.
    'permalinks': True,         # add permalinks to elements and headings when id is added.
    'toc': True,                # check for [TOC] and add Table of Contents if present.
    'toc_headers': r'h[1-6]',   # regex for which headings to include in the [TOC]
    'tables': True,             # add class="table" for tables missing class.
    'debug': False              # print extra diagnostics while processing.
}

# (compiled pattern, replacement) pairs that turn the escaped opening and
# closing script/style/iframe tags GFM emits as text back into real HTML tags.
FIXUP_UNSAFE = [
    (re.compile('&lt;' + slash + tag_name), '<' + slash + tag_name)
    for tag_name in ('script', 'style', 'iframe')
    for slash in ('', '/')
]

# Find {{ metadata }} inclusions; the key is captured in the named group 'meta'.
METADATA_RE = re.compile(r'{{\s*(?P<meta>[-_:a-zA-Z0-9]+)\s*}}')

# Find {#id} or {.class} elementid annotations that are followed by a newline
# or the end of the string. Square brackets are accepted in place of the braces.
# Group 'type' is '#' or '.', group 'id' is the id or class text.
ELEMENTID_RE = re.compile(r'(?:[ \t]*[{\[][ \t]*(?P<type>[#.])(?P<id>[-._:a-zA-Z0-9 ]+)[}\]])(\n|$)')

# ID duplicates match: splits an id ending in '_<digits>' into stem and counter
IDCOUNT_RE = re.compile(r'^(.*)_([0-9]+)$')

# For permalinks: the text of the generated anchor
LINK_CHAR = '¶'

# str.translate() map that strips permalink chars from headings for ToC
PARA_MAP = {
    ord(LINK_CHAR): None
}

# Find table tags - to check for ones without class attribute.
TABLE_RE = re.compile(r'^table')


# An item in a Table of Contents - from toc.py
class HtmlTreeNode(object):
    """One entry in the Table of Contents tree.

    The root node is created with ``parent=None`` and renders as the
    ``<div id='toc'>`` wrapper. Every other node renders as an ``<li>``
    holding a link to its heading, followed by a nested ``<ul>`` of children.
    """

    def __init__(self, parent, header, level, id):
        """Create a node.

        parent -- the HtmlTreeNode above this one, or None for the root
        header -- heading text shown in the ToC
        level  -- heading tag name such as 'h2'; levels are compared as strings
        id     -- the heading's id attribute, used as the link target
        """
        self.children = []
        self.parent = parent
        self.header = header
        self.level = level
        self.id = id

    def add(self, new_header):
        """Insert a heading element into the tree, relative to this node.

        A deeper heading becomes a child of this node, a heading of the same
        level becomes a sibling, and a shallower heading is handed to the
        parent to place. Returns ``(new_node, new_header)``.
        """
        new_level = new_header.name
        new_string = new_header.string
        new_id = new_header.attrs.get('id')

        if not new_string:
            # no single direct string: roll up all nested text, skipping comments
            new_string = ''.join(new_header.find_all(
                text=lambda t: not isinstance(t, Comment),
                recursive=True))
        # drop the permalink character that may have been appended to the heading
        new_string = new_string.translate(PARA_MAP)

        if self.level < new_level:
            new_node = HtmlTreeNode(self, new_string, new_level, new_id)
            self.children.append(new_node)
            return new_node, new_header
        if self.level == new_level:
            new_node = HtmlTreeNode(self.parent, new_string, new_level, new_id)
            self.parent.children.append(new_node)
            return new_node, new_header
        # shallower heading: climb until a node of lower level is found
        return self.parent.add(new_header)

    def __str__(self):
        """Render this node and its descendants as an HTML fragment."""
        from html import escape

        ret = ''
        if self.parent:
            # The heading text and id are plain text; escape them so quotes,
            # '<' and '&' cannot break the single-quoted attributes or markup.
            target = escape(str(self.id), quote=True)
            label = escape(str(self.header), quote=True)
            ret = "<a class='toc-href' href='#{0}' title='{1}'>{1}</a>".format(
                target, label)

        if self.children:
            ret += "<ul>{}</ul>".format(''.join(str(child) for child in self.children))

        if self.parent:
            return "<li>{}</li>".format(ret)
        return "<div id='toc'>{}</div>".format(ret)


# make sure the plugin's default settings are registered
def init_default_config(pelican):
    """Install the ASF_GENID defaults wherever they are not already set.

    The defaults go into Pelican's DEFAULT_CONFIG and, when a pelican object
    is supplied, into that object's settings. Existing values are kept.
    """
    from pelican.settings import DEFAULT_CONFIG

    DEFAULT_CONFIG.setdefault('ASF_GENID', ASF_GENID)
    if not pelican:
        return
    pelican.settings.setdefault('ASF_GENID', ASF_GENID)


# from Apache CMS markdown/extensions/headerid.py - slugify in the same way as the Apache CMS
def slugify(value, separator):
    """Slugify a string, to make it URL friendly.

    The value is decomposed (NFKD) and reduced to ASCII, stripped of anything
    that is not a word character, whitespace or hyphen, trimmed and
    lower-cased. Each run of whitespace and/or separator characters is then
    collapsed into a single ``separator``.

    value     -- the text to slugify
    separator -- the string placed between words
    Returns the slug as a str.
    """
    ascii_text = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
    cleaned = re.sub(r'[^\w\s-]', '', ascii_text).strip().lower()
    # Escape the separator so characters such as '^', ']' or '\' are taken
    # literally inside the character class, and substitute through a function
    # so the separator is never parsed as a replacement template.
    collapse_re = '[%s\\s]+' % re.escape(separator)
    return re.sub(collapse_re, lambda _match: separator, cleaned)


# Make an id unique within a set of ids by appending '_1', '_2'... as needed
def unique(id, ids):
    """Return a variant of ``id`` not yet in ``ids`` and record it there.

    An empty id, or one already present, gets a '_<n>' suffix whose counter is
    bumped until the result is free. A message is printed for each rejected
    candidate. ``ids`` is modified in place.
    """
    candidate = id
    while candidate in ids or not candidate:
        counted = IDCOUNT_RE.match(candidate)
        print(f'id="{candidate}" is a duplicate')
        if counted is None:
            # no counter yet: start at 1
            candidate = '{}_{}'.format(candidate, 1)
        else:
            stem, number = counted.group(1), counted.group(2)
            candidate = '{}_{}'.format(stem, int(number) + 1)
    ids.add(candidate)
    return candidate


# add a permalink anchor as the last child of an element
def permalink(soup, mod_element):
    """Append an ``<a class="headerlink">`` pointing at ``mod_element``'s own id.

    ``soup`` creates the new tag; ``mod_element`` must already have an 'id'.
    """
    anchor = soup.new_tag('a', href='#' + mod_element['id'])
    for attribute, value in (('class', 'headerlink'), ('title', 'Permalink')):
        anchor[attribute] = value
    anchor.string = LINK_CHAR
    mod_element.append(anchor)


# fixup cmark content - note that this may be too hungry. It may need to occur later and skipped in codeblock and pre tags.
def fixup_content(content):
    """Restore the script/style/iframe tags that were escaped into text.

    Each FIXUP_UNSAFE replacement is applied to ``content._content``; the
    attribute is written back only when at least one replacement happened.
    """
    html_text = content._content
    changed = False
    for pattern, replacement in FIXUP_UNSAFE:
        html_text, hits = pattern.subn(replacement, html_text)
        if hits:
            changed = True
    if changed:
        content._content = html_text


# expand metadata found in {{ key }}
def expand_metadata(tag, metadata):
    """Replace every ``{{ key }}`` reference in a text node with its metadata value.

    tag      -- the text node to process; its ``.string`` is replaced in place
                when at least one reference was handled
    metadata -- mapping of names to values; each reference is resolved with
                ``'{key}'.format(**metadata)``

    A reference that cannot be resolved is rewritten as ``{key}`` (single
    braces) so it is not matched again. Each expansion is reported on stdout.
    """
    this_string = str(tag.string)
    modified = False
    # NOTE: the whole string is rescanned after every replacement, so
    # references contained in an expanded value are expanded as well; a value
    # that contains a reference to itself would therefore never terminate.
    m = METADATA_RE.search(this_string)
    while m:
        this_data = m.group(1).strip()
        format_string = '{{{0}}}'.format(this_data)
        try:
            new_string = format_string.format(**metadata)
            print(f'{{{{{m.group(1)}}}}} -> {new_string}')
        except Exception:
            # the data expression was not found (or could not be formatted)
            print(f'{{{{{m.group(1)}}}}} is not found')
            new_string = format_string
        # Splice the value in literally. Passing it to re.sub() as the
        # replacement would interpret backslashes in the value as escapes or
        # group references and can raise re.error.
        this_string = this_string[:m.start()] + new_string + this_string[m.end():]
        modified = True
        m = METADATA_RE.search(this_string)
    if modified:
        tag.string.replace_with(this_string)


# do elementid transformation for {#id} and {.class} attribute annotations.
def elementid_transform(ids, soup, tag, permalinks, perma_set, debug):
    """Turn a trailing ``{#id}`` or ``{.class}`` annotation into an attribute.

    ids        -- set of ids already used in the document (updated in place)
    soup       -- the document, used to create permalink tags
    tag        -- the text node containing the annotation
    permalinks -- when true, a permalink is appended to elements given an id
    perma_set  -- set of permalink ids created so far (updated in place)
    debug      -- when true, print diagnostics

    The attribute is set on the text node's parent element and the text is
    truncated at the start of the annotation. Text directly in the document
    root or inside ``code``/``pre`` elements is left untouched.
    """
    tagnav = tag.parent
    this_string = str(tag.string)
    if debug:
        print(f'name = {tagnav.name}, string = {this_string}')
    if tagnav.name in ('[document]', 'code', 'pre'):
        return
    m = ELEMENTID_RE.search(this_string)
    if not m:
        return
    # The pattern allows spaces inside the braces; trim them so the id or
    # class value does not start or end with whitespace.
    value = m.group('id').strip()
    # this replacement could be better it truncates and likely drops additional annotations
    tag.string.replace_with(this_string[:m.start()])
    if m.group('type') == '#':
        # id attribute annotation
        tagnav['id'] = unique(value, ids)
        if permalinks:
            permalink(soup, tagnav)
            # inform if there is a duplicate permalink
            unique(tagnav['id'], perma_set)
        if debug:
            print(f'# insertion {tagnav}')
    else:
        # class attribute annotation (regex only recognizes the two types)
        tagnav['class'] = value
        if debug:
            # report the element that received the class, not the text node
            print(f'Class {tagnav.name} : {tagnav["class"]}')


# give a heading an id derived from its text
def headingid_transform(ids, soup, tag, permalinks, perma_set):
    """Assign a unique slug id to ``tag`` and optionally append a permalink.

    The slug comes from the heading's direct string or, when there is none,
    from all nested text excluding comments. ``ids`` and ``perma_set`` are
    updated in place.
    """
    heading_text = tag.string
    if not heading_text:
        # no immediate string: gather the text of all descendants
        pieces = tag.find_all(
            text=lambda t: not isinstance(t, Comment),
            recursive=True)
        heading_text = ''.join(pieces)

    # build the id from the text and make it unique in the document
    tag['id'] = unique(slugify(heading_text, '-'), ids)
    if not permalinks:
        return
    permalink(soup, tag)
    # reports a duplicate permalink, if any
    unique(tag['id'], perma_set)


# build the table of contents from the headings that follow [TOC]
def generate_toc(content, tags, title, toc_headers):
    """Replace the ``[TOC]`` markers with a generated table of contents.

    content     -- content object; ``content.toc`` is set when a ToC is built
    tags        -- the ``[TOC]`` marker elements (must not be empty)
    title       -- page title, stored on the root node of the tree
    toc_headers -- regex selecting the heading tag names to include

    Headings are collected after the last marker. The first marker is
    replaced with the ToC (or with nothing when no heading was found) and all
    further markers are removed.
    """
    root = HtmlTreeNode(None, title, 'h0', '')
    cursor = root
    # the headings are taken from after the final [TOC]
    last_marker = tags[-1]
    heading_re = re.compile(toc_headers)
    found_heading = False
    for header in last_marker.findAllNext(heading_re):
        found_heading = True
        cursor, _unused = cursor.add(header)
    replacement = ''
    if found_heading:
        print('  ToC')
        # render the tree and parse it into Beautiful Soup
        replacement = BeautifulSoup(str(root), 'html.parser')
        # Make the ToC available to the theme's template
        content.toc = replacement.decode(formatter='html')
    for marker in tags:
        marker.replaceWith(replacement)
        # only the first [TOC] receives the table; the rest become empty
        replacement = ''


# create breadcrumb html
def make_breadcrumbs(rel_source_path, title):
    """Build the breadcrumb trail HTML for a page.

    rel_source_path -- '/'-separated source path; the final part (the
                       filename) is not turned into a crumb
    title           -- page title, used as the text of the last crumb

    Returns a string of links: Home, one capitalized link per directory, and
    a final '#' link carrying the title.
    """
    # &raquo; needs its terminating semicolon to be a valid character reference
    separator = '&nbsp;&raquo;&nbsp;'
    crumbs = [f'<a href="/">Home</a>{separator}']
    url = '/'
    # don't process the filename part
    for part in rel_source_path.split('/')[:-1]:
        url = f'{url}{part}/'
        crumbs.append(f'<a href="{url}">{part.capitalize()}</a>{separator}')
    crumbs.append(f'<a href="#">{title}</a>')
    return ''.join(crumbs)
    

# merge the asfdata metadata into the content's metadata
def add_data(content):
    """Mix the ASF_DATA 'metadata' mapping into ``content.metadata``.

    Nothing is done when the content's 'reader' metadata is 'asf' (the asf
    metadata is already in place during the asfreader plugin) or when the
    settings carry no ASF_DATA metadata.
    """
    if content.metadata.get('reader') == 'asf':
        return
    asf_data = content.settings.get('ASF_DATA', {})
    extra = asf_data.get('metadata')
    if extra:
        content.metadata.update(extra)


# main worker transforming the html
def generate_id(content):
    """Apply the enabled asfgenid transformations to one content object.

    Static content is skipped. Otherwise ``content._content`` is parsed,
    transformed and written back, ``content.metadata`` gains
    'relative_source_path' and 'breadcrumbs', and ``content.toc`` is set when
    a table of contents is generated. Each step is gated by a key of the
    ``ASF_GENID`` settings dict; the heading step can be overridden per page
    with the 'asf_headings' metadata value. Progress is printed to stdout.
    """
    if isinstance(content, pelican.contents.Static):
        return

    # get plugin settings
    asf_genid = content.settings['ASF_GENID']
    debug = asf_genid['debug']
    # asf_headings setting may be overridden by the page; the default is
    # stringified so that both sources can be compared against 'True' below
    asf_headings = content.metadata.get('asf_headings', str(asf_genid['headings']))

    # show active plugins
    if debug:
        print('asfgenid:\nshow plugins in case one is processing before this one')
        for name in content.settings['PLUGINS']:
            print(f'plugin: {name}')

    # track the id tags
    ids = set()
    # track permalinks
    permalinks = set()

    # step 1 - fixup html that cmark marks unsafe - move to later?
    if asf_genid['unsafe_tags']:
        fixup_content(content)

    # step 2 - prepare for genid processes
    # parse html content into BeautifulSoup4
    soup = BeautifulSoup(content._content, 'html.parser')
    # page title
    title = content.metadata.get('title', 'Title')
    # assure relative source path is in the metadata
    rel_source_path = content.relative_source_path
    content.metadata['relative_source_path'] = rel_source_path
    # create breadcrumb html
    breadcrumbs = make_breadcrumbs(rel_source_path, title)
    content.metadata['breadcrumbs'] = breadcrumbs
    # display output path and title
    print(f'{content.relative_source_path} - {title}')
    # if debug display breadcrumb html
    if debug:
        print(f'    {breadcrumbs}')
    # enhance metadata if done by asfreader
    add_data(content)

    # step 3 - metadata expansion
    if asf_genid['metadata']:
        if debug:
            print(f'metadata expansion: {content.relative_source_path}')
        for tag in soup.findAll(string=METADATA_RE):
            expand_metadata(tag, content.metadata)

    # step 4 - find all id attributes already present
    for tag in soup.findAll(id=True):
        # existing ids are only recorded, never changed
        unique(tag['id'], ids)

    # step 5 - find all {#id} and {.class} text and assign attributes
    if asf_genid['elements']:
        if debug:
            print(f'elementid: {content.relative_source_path}')
        for tag in soup.findAll(string=ELEMENTID_RE):
            elementid_transform(ids, soup, tag, asf_genid['permalinks'], permalinks, debug)

    # step 6 - find all headings w/o ids already present or assigned with {#id} text
    if asf_headings == 'True':
        if debug:
            print(f'headings: {content.relative_source_path}')
        # Find heading tags
        heading_re = re.compile(asf_genid['headings_re'])
        for tag in soup.findAll(heading_re, id=False):
            headingid_transform(ids, soup, tag, asf_genid['permalinks'], permalinks)

    # step 7 - find all tables without class
    if asf_genid['tables']:
        if debug:
            print(f'tables: {content.relative_source_path}')
        # BeautifulSoup's keyword for the class attribute is `class_`; any
        # other spelling is treated as a different attribute name and would
        # match (and overwrite the class of) every table.
        for tag in soup.findAll(TABLE_RE, class_=False):
            tag['class'] = 'table'

    # step 8 - find TOC tag and generate Table of Contents
    if asf_genid['toc']:
        tags = soup('p', text='[TOC]')
        if tags:
            generate_toc(content, tags, title, asf_genid['toc_headers'])

    # step 9 - reset the html content
    content._content = soup.decode(formatter='html')

    # step 10 - output all of the permalinks created
    for tag in permalinks:
        print(f'    #{tag}')


def tb_connect(pel_ob):
    """Run generate_id() and report any exception before Pelican swallows it.

    On failure a banner, the source path and the traceback go to stderr and
    the process exits with status 4.
    """
    try:
        generate_id(pel_ob)
    except Exception:
        err = sys.stderr
        print('-----', file=err)
        print('FATAL: %s' % (pel_ob.relative_source_path), file=err)
        traceback.print_exc(file=err)
        # stop here: an error in this module must not be allowed to erase the site
        raise SystemExit(4)


def register():
    """Plugin entry point: install the default ASF_GENID settings when Pelican initializes."""
    pelican.plugins.signals.initialized.connect(init_default_config)


# Connected at module level, so the handler is attached when this module is
# imported rather than when register() is called.
pelican.plugins.signals.content_object_init.connect(tb_connect)
