pelican/plugins/asfdata.py

#!/usr/bin/python -B
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
#
#
# asfdata.py -- Pelican plugin that processes a yaml specification of data into a setting directory
#

import os.path
import sys
import subprocess
import datetime
import random
import json
import re
import traceback
import operator
import pprint

import requests
import yaml
import ezt

import xml.dom.minidom
import xml.parsers.expat

import pelican.plugins.signals
import pelican.utils

from bs4 import BeautifulSoup

FIXUP_HTML = [
    (re.compile(r'&lt;'), '<'),
    (re.compile(r'&gt;'), '>'),
]

REQUESTS_TIMEOUT = 5  # timeout for requests calls

# Format of svn ls -v output: Jan 1 1970
SVN_DATE_FORMAT = "%b %d %Y"


# read the asfdata configuration in order to get data load and transformation instructions.
def read_config(config_yaml, debug):
    with pelican.utils.pelican_open(config_yaml) as text:
        config_data = yaml.safe_load(text)
    if debug:
        pp = pprint.PrettyPrinter(indent=2)
        pp.pprint(config_data)
    return config_data


# load yaml and json data sources.
def load_data(path, content, debug):
    parts = path.split('/')
    extension = os.path.splitext(parts[-1])[1]  # split off ext, keep ext
    if debug:
        print(f'Loading {extension} from {path}')
    if extension == '.json':
        load = json.loads(content)
    elif extension == '.yaml':
        load = yaml.safe_load(content)
    else:
        load = { }
    return load


# load data source from a url.
def url_data(url, debug):
    print("url_data", url, debug)
    return load_data(url,
                     requests.get(url, timeout=REQUESTS_TIMEOUT).text,
                     debug)


# load data source from a file.
def file_data(rel_path, debug):
    return load_data(rel_path,
                     open(rel_path, 'r').read(),
                     debug)


# remove parts of a data source we don't want to access
def remove_part(reference, part):
    for refs in reference:
        if refs == part:
            del reference[part]
            return
        elif isinstance(reference[refs], dict):
            remove_part(reference[refs], part)


# trim out parts of a data source that don't match part = True
def where_parts(reference, part):
    # currently only works on True parts
    # if we trim as we go we invalidate the iterator. Instead create a deletion list.
    filtered = [ ]
    # first find the list that needs to be trimmed.
    for refs in reference:
        if not reference[refs][part]:
            filtered.append(refs)
    # remove the parts to be trimmed.
    for refs in filtered:
        del reference[refs]


# perform alphabetization. HTTP Server is special and is put before 'A'
def alpha_part(reference, part):
    for refs in reference:
        name = reference[refs][part]
        if name == 'HTTP Server':
            # when sorting by letter HTTP Server is wanted first
            letter = ' '
        else:
            letter = name[0].upper()
        reference[refs]['letter'] = letter
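
# Illustrative sketch (not part of the plugin): the helpers above mutate
# committee-style dictionaries keyed by project id, in place. The data below is made up.
#
#   committees = {
#       'httpd':   {'display_name': 'HTTP Server', 'pmc': True},
#       'tomcat':  {'display_name': 'Tomcat', 'pmc': True},
#       'retired': {'display_name': 'Retired', 'pmc': False},
#   }
#   where_parts(committees, 'pmc')           # drops 'retired' because its 'pmc' value is False
#   alpha_part(committees, 'display_name')   # adds 'letter': ' ' for httpd and 'T' for tomcat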

# convert a list singleton into a name and availid (e.g. chair and roster for officer positions)
def asfid_part(reference, part):
    for refs in reference:
        fix = reference[refs][part]
        for k in fix:
            availid = k
            name = fix[k]['name']
        reference[refs][part] = name
        reference[refs]['availid'] = availid


# add logo attribute with HEAD check for existence. If nonexistent use default.
def add_logo(reference, part):
    # split between logo pattern and default.
    parts = part.split(',')
    for item in reference:
        # the logo pattern includes a place to insert the project/podling key
        logo = (parts[0].format(item.key_id))
        # HEAD request
        try:
            response = requests.head('https://www.apache.org/' + logo, timeout=REQUESTS_TIMEOUT)
            if response.status_code != 200:
                # logo not found - use the default logo
                logo = parts[1]
        except requests.exceptions.Timeout:
            logo = parts[1]
        # save the logo path as an attribute
        setattr(item, 'logo', logo)
    return reference


# convert a dictionary into a sequence (list)
def sequence_dict(seq, reference):
    sequence = [ ]
    for refs in reference:
        # converting dicts into objects with attributes. Ignore non-dict content.
        if isinstance(reference[refs], dict):
            # put the key of the dict into the dictionary
            reference[refs]['key_id'] = refs
            for item in reference[refs]:
                if isinstance(reference[refs][item], bool):
                    # fixup any boolean values to be ezt.boolean - essentially True -> "yes"
                    reference[refs][item] = ezt.boolean(reference[refs][item])
            # convert the dict into an object with attributes and append to the sequence
            sequence.append(type(seq, (), reference[refs]))
    return sequence


# convert a list into a sequence. convert dictionary items into objects.
def sequence_list(seq, reference):
    sequence = [ ]
    for refs in reference:
        # only convert dicts into objects
        if isinstance(refs, dict):
            for item in refs:
                if isinstance(refs[item], bool):
                    # fixup any boolean values to be ezt.boolean - essentially True -> "yes"
                    refs[item] = ezt.boolean(refs[item])
                elif isinstance(refs[item], list):
                    # recursively convert sub-lists
                    refs[item] = sequence_list(item, refs[item])
            # convert the dict into an object with attributes and append to the sequence
            sequence.append(type(f'{seq}', (), refs))
    return sequence
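
# Illustrative sketch (not part of the plugin): sequence_dict() above turns a keyed dict
# into a list of anonymous classes whose attributes are the dict's values, with booleans
# converted via ezt.boolean (essentially True -> "yes", per the comment in the code).
# The sample data is made up.
#
#   people = {'jdoe': {'name': 'Jane Doe', 'member': True}}
#   seq = sequence_dict('people', people)
#   seq[0].key_id   # 'jdoe'
#   seq[0].name     # 'Jane Doe'
#   seq[0].member   # ezt.boolean(True)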

# split a list into equal sized columns. Adds letter breaks in the alphabetical sequence.
def split_list(metadata, seq, reference, split):
    # copy sequence
    sequence = list(reference)
    # sort the copy
    sequence.sort(key=lambda x: (x.letter, x.display_name))
    # size of list
    size = len(sequence)
    # size of columns
    percol = int((size + 26 + split - 1) / split)
    # positions
    start = nseq = nrow = 0
    letter = ' '
    # create each column
    for column in range(split):
        subsequence = [ ]
        end = min(size + 26, start + percol)
        while nrow < end:
            if letter < sequence[nseq].letter:
                # new letter - add a letter break into the column. If a letter has no content it is skipped
                letter = sequence[nseq].letter
                subsequence.append(type(seq, (), {'letter': letter, 'display_name': letter}))
            else:
                # add the project into the sequence
                subsequence.append(sequence[nseq])
                nseq = nseq + 1
            nrow = nrow + 1
        # save the column sequence in the metadata
        metadata[f'{seq}_{column}'] = subsequence
        start = end
    if nseq < size:
        print(f'WARNING: {seq} not all of sequence consumed: short {size-nseq} projects')
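
# Illustrative sketch (assumption, not from this file): an entry in the asfdata yaml that
# drives process_sequence() below might combine the keys handled there, e.g.
#
#   committees:
#     url: https://example.apache.org/committee-info.json   # hypothetical source
#     pmcs:
#       path: committees
#       where: pmc
#       alpha: display_name
#       split: 3
#
# which would load the json, keep entries whose `pmc` value is true, add a `letter`
# attribute and publish three column sequences (pmcs_0, pmcs_1, pmcs_2).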

# process sequencing transformations to the data source
def process_sequence(metadata, seq, sequence, load, debug):
    reference = load

    # has been converted to a sequence
    is_sequence = False
    # has been converted to a dictionary - won't be made into a sequence
    is_dictionary = False
    # save metadata at the end
    save_metadata = True

    # description
    if debug and 'description' in sequence:
        print(f'{seq}: {sequence["description"]}')

    # select sub dictionary
    if 'path' in sequence:
        if debug:
            print(f'path: {sequence["path"]}')
        parts = sequence['path'].split('.')
        for part in parts:
            reference = reference[part]

    # filter dictionary by attribute value. if filter is false discard
    if 'where' in sequence:
        if debug:
            print(f'where: {sequence["where"]}')
        where_parts(reference, sequence['where'])

    # remove irrelevant keys
    if 'trim' in sequence:
        if debug:
            print(f'trim: {sequence["trim"]}')
        parts = sequence['trim'].split(',')
        for part in parts:
            remove_part(reference, part)

    # transform roster and chair patterns
    if 'asfid' in sequence:
        if debug:
            print(f'asfid: {sequence["asfid"]}')
        asfid_part(reference, sequence['asfid'])

    # add first letter for alphabetic categories
    if 'alpha' in sequence:
        if debug:
            print(f'alpha: {sequence["alpha"]}')
        alpha_part(reference, sequence['alpha'])

    # this dictionary is derived from sub-dictionaries
    if 'dictionary' in sequence:
        if debug:
            print(f'dictionary: {sequence["dictionary"]}')
        reference = { }
        paths = sequence['dictionary'].split(',')
        # create a dictionary from the keys in one or more sub-dictionaries
        for path in paths:
            for key in load[path]:
                reference[key] = load[path][key]
        # dictionary result, do not sequence
        is_dictionary = True

    # this sequence is derived from another sequence
    if 'sequence' in sequence:
        if debug:
            print(f'sequence: {sequence["sequence"]}')
        reference = metadata[sequence['sequence']]
        # sequences derived from prior sequences do not need to be converted to a sequence
        is_sequence = True

    # this sequence is a random sample of another sequence
    if 'random' in sequence:
        if debug:
            print(f'random: {sequence["random"]}')
        if is_sequence:
            reference = random.sample(reference, sequence['random'])
        else:
            print(f'{seq} - random requires an existing sequence to sample')

    # for a project or podling see if the logo exists w/HEAD and set the relative path.
    if 'logo' in sequence:
        if debug:
            print(f'logo: {sequence["logo"]}')
        if is_sequence:
            # determine the project or podling logo
            reference = add_logo(reference, sequence['logo'])
            if seq == 'featured_pods':
                # for podlings strip "Apache" from the beginning and "(incubating)" from the end.
                # this is Sally's request
                for item in reference:
                    setattr(item, 'name', ' '.join(item.name.split(' ')[1:-1]))
        else:
            print(f'{seq} - logo requires an existing sequence')

    # this sequence is a sorted list divided into multiple columns
    if 'split' in sequence:
        if debug:
            print(f'split: {sequence["split"]}')
        if is_sequence:
            # create a sequence for each column
            split_list(metadata, seq, reference, sequence['split'])
            # created column sequences are already saved to metadata so do not do so later
            save_metadata = False
        else:
            print(f'{seq} - split requires an existing sequence to split')

    # round an integer value down to a multiple
    if 'truncate' in sequence:
        multiple = int(sequence["truncate"])
        reference = int(reference / multiple) * multiple

    # if this is not already a sequence or dictionary then convert to a sequence
    if not is_sequence and not is_dictionary:
        # convert the dictionary/list to a sequence of objects
        if debug:
            print(f'{seq}: create sequence')
        if isinstance(reference, dict):
            reference = sequence_dict(seq, reference)
        elif isinstance(reference, list):
            reference = sequence_list(seq, reference)

    # save sequence in metadata
    if save_metadata:
        metadata[seq] = reference
        try:
            metadata[f'{seq}_size'] = len(reference)
        except TypeError:
            # allow for integer
            pass


# create metadata sequences and dictionaries from a data load
def process_load(metadata, value, load, debug):
    for seq in value:
        if seq not in ('url', 'file'):
            # one or more sequences
            sequence = value[seq]
            process_sequence(metadata, seq, sequence, load, debug)


# convert byte count to human-readable (1k 2m 3g etc)
def bytesto(bytecount, to, bsize=1024):
    a = {'k': 1, 'm': 2, 'g': 3, 't': 4, 'p': 5, 'e': 6}
    r = float(bytecount)
    return r / (bsize ** a[to])


# open a subprocess
def os_popen(args):
    return subprocess.Popen(args, stdout=subprocess.PIPE, universal_newlines=True)
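
# Illustrative note (assumption, not from this file): `svn ls -Rv` output looks roughly like
#
#   54321 someuser       123456 Mar 01 12:34 project-1.2.3/project-1.2.3-src.tar.gz
#   43210 someuser        65432 Jan 01  2021 project-1.0.0/project-1.0.0-bin.tar.gz
#
# i.e. revision, committer, size, date (a time for entries from the past year, a year
# otherwise) and path; process_distributions() below relies on that layout.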

# retrieve the release distributions for a project from svn
def process_distributions(project, src, sort_revision, debug):
    if debug:
        print(f'releases: {project}')
    # current date information will help process svn ls results
    gatherDate = datetime.datetime.utcnow()
    gatherYear = gatherDate.year
    # information to accumulate
    signatures = {}
    checksums = {}
    fsizes = {}
    dtms = {}
    versions = {}
    revisions = {}
    keys = None  # ensure defined before use
    # read the output from svn ls -Rv
    url = f'https://dist.apache.org/repos/dist/release/{project}'
    if debug:
        print(f'releases: {url}')
    with os_popen(['svn', 'ls', '-Rv', url]) as s:
        for line in s.stdout:
            line = line.strip()
            listing = line.split(' ')
            if line[-1:] == '/':
                # skip directories
                continue
            if sort_revision:
                revision = int(listing[0])
            else:
                revision = 0
            # user = listing[1]
            if listing[-6] == '':
                # dtm in the past year
                dtm1 = datetime.datetime.strptime(" ".join(listing[-4:-2]) + " " + str(gatherYear), SVN_DATE_FORMAT)
                if dtm1 > gatherDate:
                    dtm1 = datetime.datetime.strptime(" ".join(listing[-4:-2]) + " " + str(gatherYear - 1), SVN_DATE_FORMAT)
                fsize = listing[-5]
            else:
                # dtm older than one year
                dtm1 = datetime.datetime.strptime(" ".join(listing[-5:-1]), SVN_DATE_FORMAT)
                fsize = listing[-6]
            # date is close enough
            dtm = dtm1.strftime("%m/%d/%Y")
            # convert to number of MB
            if float(fsize) > 524288:
                fsize = ('%.2f' % bytesto(fsize, 'm')) + ' MB'
            else:
                fsize = ('%.2f' % bytesto(fsize, 'k')) + ' KB'
            # line is path
            line = listing[-1]
            # fields are parts of the path
            fields = line.split('/')
            # filename is the final part
            filename = fields[-1]
            # parts includes the whole path
            parts = line.split('.')
            # use the path as a key for each release
            release = line
            if filename:
                if re.search('KEYS(\\.txt)?$', filename):
                    # save the KEYS file url
                    keys = f'https://downloads.apache.org/{project}/{line}'
                elif re.search('\\.(asc|sig)$', filename, flags=re.IGNORECASE):
                    # we key a release off of a signature. remove the extension
                    release = '.'.join(parts[:-1])
                    signatures[release] = filename
                    # the path to the signature is used as the version
                    versions[release] = '/'.join(fields[:-1])
                    # we use the revision for sorting
                    revisions[release] = revision
                    if re.search(src, filename):
                        # put source distributions in the front (it is a reverse sort)
                        revisions[release] = revision + 100000
                elif re.search('\\.(sha512|sha1|sha256|sha|md5|mds)$', filename, flags=re.IGNORECASE):
                    # some projects checksum their signatures
                    part0 = ".".join(line.split('.')[-2:-1])
                    if part0 == "asc":
                        # skip files that are hashes of signatures
                        continue
                    # strip the extension to get the release name
                    release = '.'.join(parts[:-1])
                    checksums[release] = filename
                else:
                    # for the released file save the size and dtm
                    fsizes[release] = fsize
                    dtms[release] = dtm

    # separate versions.
    each_version = {}
    for rel in signatures:
        version = versions[rel]
        if version not in each_version:
            each_version[version] = []
        release = rel[len(version) + 1:]
        try:
            each_version[version].append(
                Distribution(release=release,
                             revision=revisions[rel],
                             signature=signatures[rel],
                             checksum=checksums[rel],
                             dtm=dtms[rel],
                             fsize=fsizes[rel]))
        except Exception:
            traceback.print_exc()

    distributions = []
    for version in each_version:
        each_version[version].sort(key=lambda x: (-x.revision, x.release))
        distributions.append(
            Version(version=version,
                    name=' '.join(version.split('/')),
                    revision=each_version[version][0].revision,
                    release=each_version[version]))
    distributions.sort(key=lambda x: (-x.revision, x.version))
    return keys, distributions


# get xml text node
def get_node_text(nodelist):
    """http://www.python.org/doc/2.5.2/lib/minidom-example.txt"""
    rc = ''
    for node in nodelist:
        if node.nodeType in [node.CDATA_SECTION_NODE, node.TEXT_NODE]:
            rc = rc + node.data
    return rc


# get xml element's text nodes.
def get_element_text(entry, child):
    elements = entry.getElementsByTagName(child)
    return get_node_text(elements[0].childNodes)


# truncate html text to a number of words.
def truncate_words(text, words):
    content_text = ' '.join(text.split(' ')[:words]) + "..."
    for regex, replace in FIXUP_HTML:
        m = regex.search(content_text)
        if m:
            content_text = re.sub(regex, replace, content_text)
    tree_soup = BeautifulSoup(content_text, 'html.parser')
    content_text = tree_soup.prettify()
    return content_text
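
# Illustrative sketch (not part of the plugin): truncate_words() above keeps the first N
# words, unescapes &lt;/&gt;, and lets BeautifulSoup close any dangling tags, e.g.
#
#   truncate_words('&lt;p&gt;Latest news from the project blog feed&lt;/p&gt;', 4)
#   # -> roughly '<p>\n Latest news from the...\n</p>' after prettify()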

# retrieve blog posts from an Atom feed.
def process_blog(feed, count, words, debug):
    if debug:
        print(f'blog feed: {feed}')
    # See INFRA-23636: cannot check the page status, so just catch parsing errors
    try:
        content = requests.get(feed, timeout=REQUESTS_TIMEOUT).text
        dom = xml.dom.minidom.parseString(content)
        # dive into the dom to get 'entry' elements
        entries = dom.getElementsByTagName('entry')
        # we only want count many from the beginning
        entries = entries[:count]
    except xml.parsers.expat.ExpatError:
        entries = []
    except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
        entries = []
    v = [ ]
    for entry in entries:
        if debug:
            print(entry.tagName)
        # we may want content
        content_text = ''
        if words:
            content_text = truncate_words(get_element_text(entry, 'content'), words)
        # we want the title and href
        v.append({
            'id': get_element_text(entry, 'id'),
            'title': get_element_text(entry, 'title'),
            'content': content_text
        })
    if debug:
        for s in v:
            print(s)
    return [Blog(href=s['id'], title=s['title'], content=s['content']) for s in v]


# to be updated from hidden location. (Need to discuss local.)
def twitter_auth():
    authtokens = os.path.join(os.path.expanduser('~'), '.authtokens')
    try:
        for line in open(authtokens).readlines():
            if line.startswith('twitter:'):
                token = line.strip().split(':')[1]
                # do not print or display token as it is a secret
                return token
    except Exception:
        traceback.print_exc()
    return None


# retrieve from twitter
def connect_to_endpoint(url, headers):
    response = requests.request('GET', url, headers=headers, timeout=REQUESTS_TIMEOUT)
    if response.status_code != 200:
        # TODO: choose better exception
        raise Exception(response.status_code, response.text)  # pylint: disable=broad-exception-raised
    return response.json()


# retrieve the last count recent tweets from the handle.
def process_twitter(handle, count, debug):
    if debug:
        print(f'-----\ntwitter feed: {handle}')
    bearer_token = twitter_auth()
    if not bearer_token:
        print('WARN: no bearer token for Twitter')
        return sequence_list('twitter', [{'text': 'To retrieve tweets supply a valid twitter bearer token in ~/.authtokens'}])
    # do not print or display bearer_token as it is a secret
    query = f'from:{handle}'
    tweet_fields = 'tweet.fields=author_id'
    url = f'https://api.twitter.com/2/tweets/search/recent?query={query}&{tweet_fields}'
    headers = {'Authorization': f'Bearer {bearer_token}'}
    try:
        load = connect_to_endpoint(url, headers)
    except Exception as e:
        print(f'ERROR: Cannot connect to Twitter for {handle}: {e}')
        return sequence_list('twitter', [{'text': 'Cannot connect to Twitter at present'}])
    result_count = load['meta']['result_count']
    if result_count == 0:
        print(f'WARN: No recent tweets for {handle}')
        return sequence_list('twitter', [{'text': 'No recent tweets found'}])
    if 'data' not in load:
        print('WARN: "data" not in Twitter response')
        print(load)  # DEBUG; should not happen if result_count > 0
        return sequence_list('twitter', [{'text': 'Unable to extract Twitter data'}])
    reference = sequence_list('twitter', load['data'])
    if result_count < count:
        v = reference
    else:
        v = reference[:count]
    return v
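
# Illustrative note (assumption about the local setup): twitter_auth() above expects a
# line of the form
#
#   twitter:<bearer token>
#
# in ~/.authtokens; the token itself is a secret and is never printed.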

# create sequence of sequences of ASF ECCN data.
def process_eccn(fname, debug):
    if debug:
        print('-----\nECCN:', fname)
    if fname.startswith("https://"):
        j = yaml.safe_load(requests.get(fname, timeout=REQUESTS_TIMEOUT).text)
    else:
        j = yaml.safe_load(open(fname))

    # versions have zero or more controlled sources
    def make_sources(sources):
        return [Source(href=s['href'],
                       manufacturer=s['manufacturer'],
                       why=s['why'])
                for s in sources]

    # products have one or more versions
    def make_versions(vsns):
        return [Version(version=v['version'],
                        eccn=v['eccn'],
                        source=make_sources(v.get('source', [ ])),
                        )
                for v in sorted(vsns, key=operator.itemgetter('version'))]

    # projects have one or more products
    def make_products(prods):
        return [Product(name=p['name'],
                        versions=make_versions(p['versions']),
                        )
                for p in sorted(prods, key=operator.itemgetter('name'))]

    # eccn matrix has one or more projects
    return [Project(name=proj['name'],
                    href=proj['href'],
                    contact=proj['contact'],
                    product=make_products(proj['product']))
            for proj in sorted(j['eccnmatrix'], key=operator.itemgetter('name'))]


# object wrappers
class wrapper:
    def __init__(self, **kw):
        vars(self).update(kw)


# Improve the names when failures occur.
class Source(wrapper):
    pass


class Version(wrapper):
    pass


class Product(wrapper):
    pass


class Project(wrapper):
    pass


class Blog(wrapper):
    pass


class Distribution(wrapper):
    pass
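
# Illustrative sketch (assumption, not from this file): process_eccn() above expects a
# yaml document shaped roughly like
#
#   eccnmatrix:
#     - name: Example Project
#       href: https://example.apache.org/
#       contact: dev@example.apache.org
#       product:
#         - name: Example Product
#           versions:
#             - version: 1.0.0
#               eccn: 5D002
#               source:
#                 - href: https://example.org/crypto
#                   manufacturer: Example Org
#                   why: crypto routines
#
# All names and values here are made up; only the key structure matches the code.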

# create metadata according to instructions.
def config_read_data(pel_ob):
    print('-----\nasfdata')

    asf_data = pel_ob.settings.get('ASF_DATA')
    if not asf_data:
        print('This Pelican installation is not using ASF_DATA')
        return

    debug = asf_data['debug']
    if debug:
        for key in asf_data:
            print(f'config: [{key}] = {asf_data[key]}')

    # This must be present in ASF_DATA. It contains data for use
    # by our plugins, and possibly where we load/inject data from
    # other sources.
    metadata = asf_data['metadata']

    # Lift data from ASF_DATA['data'] into METADATA
    if 'data' in asf_data:
        if debug:
            print(f'Processing {asf_data["data"]}')
        config_data = read_config(asf_data['data'], debug)
        for key in config_data:
            # first check for data that is a singleton with special handling
            if key == 'eccn':
                # process eccn data
                fname = config_data[key]['file']
                metadata[key] = v = process_eccn(fname, debug)
                if debug:
                    print('ECCN V:', v)
                continue
            if key == 'twitter':
                # process twitter data
                # if we decide to have multiple twitter feeds available then move next to blog below
                handle = config_data[key]['handle']
                count = config_data[key]['count']
                metadata[key] = v = process_twitter(handle, count, debug)
                if debug:
                    print('TWITTER V:', v)
                continue

            value = config_data[key]
            if isinstance(value, dict):
                # dictionaries may have multiple data structures that are processed with a sequence of actions
                # into multiple sequences and dictionaries.
                if debug:
                    print(f'-----\n{key} creates one or more sequences')
                    print(value)
                # special cases that are multiple are processed first
                if 'blog' in value:
                    # process blog feed
                    feed = config_data[key]['blog']
                    count = config_data[key]['count']
                    if 'content' in config_data[key].keys():
                        words = config_data[key]['content']
                    else:
                        words = None
                    metadata[key] = v = process_blog(feed, count, words, debug)
                    if debug:
                        print('BLOG V:', v)
                    continue
                elif 'release' in value:
                    # retrieve active release distributions
                    src = config_data[key]['src']
                    revision = config_data[key]['revision']
                    project = config_data[key]['release']
                    keys, distributions = process_distributions(project, src, revision, debug)
                    metadata[key] = v = distributions
                    metadata[f"{key}-keys"] = keys
                    metadata[f"{key}-project"] = project
                    if debug:
                        print('RELEASE V:', v)
                elif 'url' in value:
                    # process a url based data source
                    load = url_data(value['url'], debug)
                    process_load(metadata, value, load, debug)
                elif 'file' in value:
                    # process a file from within the site tree
                    load = file_data(value['file'], debug)
                    process_load(metadata, value, load, debug)
                else:
                    # should probably be an error but doesn't matter
                    metadata[key] = value
            else:
                # simple metadata values - either an int or str
                if debug:
                    print(f'{key} = {value}')
                metadata[key] = value

    # display asfdata metadata or metadata type
    print('-----')
    for key in metadata:
        if debug:
            print(f'metadata[{key}] =')
            pp = pprint.PrettyPrinter(indent=2)
            pp.pprint(metadata[key])
            print('-----')
        elif isinstance(metadata[key], str):
            print(f'metadata[{key}] = "{metadata[key]}"')
        elif isinstance(metadata[key], int):
            print(f'metadata[{key}] = {metadata[key]}')
        elif isinstance(metadata[key], list):
            print(f'metadata[{key}] is a sequence.')
        elif isinstance(metadata[key], dict):
            print(f'metadata[{key}] is a dictionary.')
        else:
            keytype = type(metadata[key])
            print(f'metadata[{key}] is a {keytype}')
    print('-----')


def tb_initialized(pel_ob):
    """ Print any exception, before Pelican chews it into nothingness."""
    try:
        config_read_data(pel_ob)
    except Exception:
        print('-----', file=sys.stderr)
        traceback.print_exc()
        # exceptions here stop the build
        raise


def register():
    # Hook the "initialized" signal, to load our custom data.
    pelican.plugins.signals.initialized.connect(tb_initialized)
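
# Illustrative sketch (assumption, not from this file): a pelicanconf.py enables the
# plugin with a setting along these lines; the yaml filename is hypothetical.
#
#   ASF_DATA = {
#       'data': 'asfdata.yaml',  # instructions read by read_config()/config_read_data()
#       'metadata': {},          # populated with the generated sequences and values
#       'debug': False,
#   }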