"""
Parse committee data to generate JSON files

N.B. Must be run from scripts/cronjobs directory

Reads:
../../data/committees.xml
committee-info.txt from Whimsy

Updates:
../../site/json/foundation/committees.json
../../site/json/foundation/committees-retired.json

"""

import errtee
import re
import json
import sys
if sys.hexversion < 0x03000000:
    raise ImportError("This script requires Python 3")
import io
import os
import os.path
import xml.etree.ElementTree as ET
import xml.dom.minidom as minidom
import datetime
import sendmail

from urlutils import URLopen, URLexists

sys.path.append("..") # module committee_info is in parent directory
import committee_info

# LDAP group ids not matching committee id; convert group to committeeId
group_ids = {
    'ws': 'webservices'
}

# homepages not matching http://<committee id>.apache.org/ and not defined in committee-info.json / index.html
homepages = {
    'comdev': 'http://community.apache.org/', # temporary (accidentally used https: in site.rb local table)
    'whimsy': 'http://whimsical.apache.org/', # incorrect in index.html because actual site does not yet exist
}

# Print to log and send an email (intended for WARN messages)
def printMail(msg,body=''):
    print(msg)
    try:
        sendmail.sendMail(msg,body)
    except ConnectionRefusedError:
        print("*** Failed to send the email")

# compress a string: trim it and replace multiple whitespace with a single space
def compress(s):
    return re.sub(r"""\s+""", ' ', s.strip())

def handleChild(el):
    retval = None
    hasKids = False
    for child in list(el):
        hasKids = True
    attribs = {}
    for key in el.attrib:
        xkey = re.sub(r"\{.+\}", "", key)
        attribs[xkey] = el.attrib[key]
    tag = re.sub(r"\{.+\}", "", el.tag)
    value = attribs['resource'] if 'resource' in attribs else el.text
    if not hasKids:
        retval = value
    else:
        retval = {}
        for child in list(el):
            k, v = handleChild(child)
            retval[k] = v
    return tag, retval

pmcs = {}
pmcDataUrls = {} # id -> url

skipImageTest = len(sys.argv) >= 2 and sys.argv[1] == '--skipImageTest' # speeds up testing considerably

# get PMC Data from /data/committees.xml
print("Reading PMC Data (/data/committees.xml)")
with open("../../data/committees.xml", "r") as f:
    xmldoc = minidom.parseString(f.read())
    f.close()

print("Extracting PMC DOAP file data for json/foundation/committees.json")
for loc in xmldoc.getElementsByTagName('location') :
    url = loc.childNodes[0].data
    try:
        if url.startswith('http'):
            rdf = URLopen(url).read()
        else:
            with open("../../data/%s" % url, 'r', encoding='utf-8') as f:
                rdf = f.read()
            url = "https://svn.apache.org/repos/asf/comdev/projects.apache.org/trunk/data/%s" % url
        rdfxml = ET.fromstring(rdf)
        rdfdata = rdfxml[0]
        expected = '{http://projects.apache.org/ns/asfext#}pmc'
        if not rdfdata.tag == expected:
            print("ERROR: unexpected tag value '%s' in '%s' (expecting %s)" % (rdfdata.tag, url, expected), file=sys.stderr)
            continue # No point proceeding further
        committeeId = rdfdata.attrib['{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about']
        if re.match("https?:", committeeId):
            print("ERROR: unexpected rdf:about value '%s' in '%s'" % (committeeId, url), file=sys.stderr)
            continue # No point proceeding further
        pmcDataUrls[committeeId] = url

        # transform PMC data RDF to json
        pmcjson = {
            'rdf': url
        }
        pmcname = None
        for el in rdfdata:
            k, v = handleChild(el)
            if k in pmcjson:
                # merge multiple values
                if type(pmcjson[k]) is str:
                    pmcjson[k] = "%s, %s" % (pmcjson[k], v)
                else:
                    for xk in v:
                        pmcjson[k][xk] = v[xk]
            else:
                pmcjson[k] = v

        pmcs[committeeId] = pmcjson

    except Exception as err:
        printMail("ERROR: %s processing %s" % (err, url))

committeeCount = 0
committeesList = []
committeesMap = {}
addedCommittees = [] # new committees detected
rebootedCommittees = [] # retired committees restarted

# temporary fix to ensure comparisons of generated files work better
# The original code relied on the order in the physical file
def keyorder(s):
#     print("key=%s" % s)
    if s == 'apr':
        return 'portableruntime'
    if s == 'climate':
        return 'openclimate'
    if s == 'comdev':
        return 'communitydevelopment'
    if s == 'httpd':
        return 'http' # so it sorts before HTTP Components (it's wrong in CI)
    if s == 'ws':
        return 'webservices'
    return s

# extract committees composition
print("Reading committee-info")
committees = committee_info.committees()

print("Writing generated doap/<committeeId>/pmc.rdf...")
for group in sorted(committees, key=keyorder):

#     if group == 'apr' or group == 'whimsy':
#         print("DEBUG: see what happens when CI entry %s is removed" % group)
#         continue
#     print(group)
    ctte = committees[group]
    fullName = ctte['fullname'] # Full name including Apache prefix
    if ctte['pmc']: # we only want PMCs
        if ctte['established']: # only want ones with entries in section 3
            # Fix up name where PMC RDF does not agree with LDAP group
            if group in group_ids:
                committeeId = group_ids[group]
            else:
                committeeId = group

            img = "http://www.apache.org/logos/res/%s/default.png" % committeeId
            if not skipImageTest and not URLexists(img):
                print("WARN: could not find logo: %s" % (img))
                
            committeeCount += 1
            committee={}
            committee['id'] = committeeId
            try:
                committee['chair'] = ctte['chair']['nick']
            except TypeError: # no chair present
                committee['chair'] = ''
            try:
                committee['reporting'] = ctte['reporting']
            except KeyError:
                pass
            committee['group'] = group
            committee['name'] = fullName
            committee['established'] = ctte['established']
            committee['roster'] = ctte['roster']
            homepage = None
            if group in homepages:
                homepage = homepages[group]
            else:
                if ctte['site']:
                    homepage = ctte['site']
                else:
                    homepage = 'http://%s.apache.org/' % group
            committee['homepage'] = homepage

            if ctte['description']:
                committee['shortdesc'] = ctte['description']
            else:
                # N.B. Whimsy parses index.html to generate the description entry in committee-info.json
                printMail("WARN: %s (%s) missing from http://www.apache.org/index.html#projects-list" % (group, fullName))

            if committeeId in pmcDataUrls:
                committee['rdf'] = pmcDataUrls[committeeId]
            else:
                printMail("WARN: %s (%s) missing from /data/committees.xml" % (fullName, committeeId))
                if os.path.isfile("../../data/committees/%s.rdf" % committeeId):
                    print("INFO: %s.rdf exists in data/committees/ but is not in /data/committees.xml" % committeeId)

            if committeeId in pmcs:
                if 'charter' in pmcs[committeeId]:
                    committee['charter'] = compress(pmcs[committeeId]['charter'])

            committeesList.append(committee)
            committeesMap[committeeId] = committee;
        else:
            print("INFO: %s ignored - not yet in section 3" % fullName)
    else:
        # Special Committee (Officer's, President's or Board)
        print("INFO: %s ignored - not a PMC" % fullName)


# detect retired committees to add to committees-retired.json
with open("../../site/json/foundation/committees-retired.json", "r") as f:
    committeesRetired = json.loads(f.read())
    f.close()
committeesRetiredIds = [item['id'] for item in committeesRetired]

with open("../../site/json/foundation/committees.json", "r") as f:
    committeesPrevious = json.loads(f.read())
    f.close()
committeesPreviousIds = [item['id'] for item in committeesPrevious]

for currId in committeesMap:
    if currId not in committeesPreviousIds:
        addedCommittees.append(currId)
    if currId in committeesRetiredIds: # detect rebooted committees
      rebootedCommittees.append(currId)

print("found %s new committees from %s committees in committee_info.txt" % (len(addedCommittees), committeeCount))
addedCommittees.sort()
for added in addedCommittees:
    print("- %s" % added)

print("found %s rebooted committees from %s committees in committee_info.txt" % (len(rebootedCommittees), committeeCount))
rebootedCommittees.sort()
for added in rebootedCommittees:
    print("- %s" % added)

# Drop the rebooted committees from the retired list
committeesRetired = [x for x in committeesRetired if x['id'] not in rebootedCommittees]

for previous in committeesPrevious:
    prevId = previous['id']
    if prevId not in committeesMap:
        print("found retired committee: %s %s" % (prevId, previous['name']))
        previous['retired'] = datetime.date.today().strftime('%Y-%m')
        # remove data that is not useful in a retired committee
        previous.pop('chair', None)
        previous.pop('group', None)
        previous.pop('rdf', None)
        previous.pop('reporting', None)
        committeesRetired.append(previous)

print("Writing json/foundation/committees.json...")
with open("../../site/json/foundation/committees.json", "w") as f:
    json.dump(committeesList, f, sort_keys=True, indent=0)
    f.close()

print("Writing json/foundation/committees-retired.json...")
with open("../../site/json/foundation/committees-retired.json", "w") as f:
    json.dump(committeesRetired, f, sort_keys=True, indent=0)
    f.close()

print("All done")