scripts/cronjobs/parsereleases.py

#!/usr/bin/env python3

import errtee # pylint: disable=unused-import
from collections import defaultdict
import gzip
import json
from os.path import dirname, join, realpath

from urlutils import UrlCache

"""
Reads the list of files in https://downloads.apache.org/zzz/find-ls2.txt.gz

Creates:
../../site/json/foundation/releases.json
Format: { top-level dir: { release-id: date}, ... }
The release id is derived from the filename by removing common suffixes etc, see cleanFilename()
The date comes from the first entry seen for that release id

../../site/json/foundation/releases-files.json
Format: { top-level dir: { release-id: [list of files for that release-id]}, ... }
"""

# Listing generated by find(1) -ls
FIND_LS = 'https://downloads.apache.org/zzz/find-ls2.txt.gz'

# key: committee-id, value: dict(key: release version, value: date)
releases = defaultdict(dict)

# key: committee-id, value: dict(key: release version, value: list of file names for the release)
files = defaultdict(lambda: defaultdict(list))

def cleanFilename(filename):
    """
        Attempts to determine the release id to which a file belongs.
        Strips extensions such as .tgz etc, then suffixes such as -sources.
        Replaces qualifiers such as -assembly-, -parent- by '-'.
        Returns the simplified filename.
        N.B. the ordering is significant, as the list is only scanned once
    """
    for suffix in ['-all', '-src', '_src', '.src', '-sources', '_sources', '-source', '-bin', '-dist',
                   '-source-release', '-source-relase', '-tests', '-test', '-debug', '-uber', '-macosx',
                   '-distribution', '-example', '-native', '-win', '-win32', '-linux', '-pack', '-packaged',
                   '-current', '-embedded', '-py', '-py2', '-py2.6', '-py2.7', '-no', 'unix-distro',
                   'windows-distro', 'with', '-dep', '-standalone', '-webapp', '-dom', '-om', '-32bit',
                   '-i386', '_i386', '.i386', '-minimal', '-jettyconfig', '-py2.py3-none-any', 'newkey',
                   'oldkey', 'jars', '-jre13', '-hadoop1', '-hadoop2', '-project', '-with-dependencies',
                   '-client', '-server', 'server-webapps', '-full', '-all', '-standard', '-for-javaee',
                   '-for-tomcat', 'hadoop1-scala2', '-deployer', '-fulldocs', '-embed', '-apps', '-app',
                   '-ref', '-installer', '-bundle', '-java']:
        # The above list could be simplified further
        if filename.endswith(suffix):
            filename = filename[0:len(filename)-len(suffix)]
    for repl in ['-assembly-', '-minimal-', '-doc-', '-src-', '-webapp-', '-standalone-', '-parent-',
                 '-project-', '-win32-']:
        filename = filename.replace(repl, '-')
    return filename

def cleanReleases(committeeId):
    if len(releases[committeeId]) == 0:
        del releases[committeeId]
        del files[committeeId]

# all source releases must be one of these
VALID_TYPES = ['tgz', 'gz', 'zip', 'xz', 'bz2']

# for gz, xz and bz2, the next extension must be tar
TAR_TYPES = ['gz', 'xz', 'bz2']

# file name stems that finish with these strings are not source archives:
NON_SOURCE_ENDS = ['-amd64', '-aarch64', '-arm64', '.bin', '-bin', '-binary', '-deps', '-docs', '-javadoc',
                   '-doc', '-lib', '-lib-debug', '-manual', '-site', '-x64', '-x86', 'x86_64', '-ia32',
                   '-i64', '-war', '-64bit', '-arm64bit', '-doc', '-apidocs', '-bundle']

# stems that match these strings are not source archives:
NON_SOURCE_MATCH = ['-bin-', '-binary-', '-docs-', 'x86-windows', 'x64-windows'] # Warning: beware of accidentally matching Maven plugins!
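
# A minimal self-check sketch for cleanFilename(); it is not part of the cron job and is never
# invoked by this script. The sample artifact names below are hypothetical, not taken from the
# download listing; they only illustrate the suffix-stripping and qualifier-replacement steps.
def _checkCleanFilename():
    # '-src' suffix is stripped
    assert cleanFilename('apache-foo-1.2.3-src') == 'apache-foo-1.2.3'
    # '-source-release' suffix is stripped, then '-parent-' collapses to '-'
    assert cleanFilename('bar-parent-2.0-source-release') == 'bar-2.0'
    # '-bin' suffix is stripped, then '-assembly-' collapses to '-'
    assert cleanFilename('baz-assembly-3.1-bin') == 'baz-3.1'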
# filters for dirs, matches and ends that may only apply to certain PMCs
CTTEE_FILTERS = {
    "solr": {
        "ENDS": ['-slim'],
        "MATCH": [],
        "DIRS": ['helm-charts']
    }
}

# Don't visit these directories
SKIP_DIRS = ['META', 'aarch64current', 'bin', 'binaries', 'binary', 'changes', 'cpp', 'css', 'doc', 'docs',
             'eclipse', 'features', 'hidden', 'images', 'issuesfixed', 'notes', 'patches', 'php', 'py', 'py3',
             'repos', 'ruby', 'stable', 'stable1', 'stable2', 'styles', 'tmp', 'updatesite', 'website', 'wikipages']

def parseFile(committeeId, file, date, path):
    parts = file.split('.')
    ext = parts.pop() # final extension
    if ext not in VALID_TYPES or (ext in TAR_TYPES and parts.pop() != 'tar'):
        return
    stem = ".".join(parts) # the filename stem without the archive suffix(es)
    if (any(stem.endswith(end) for end in NON_SOURCE_ENDS + CTTEE_FILTERS.get(committeeId,{}).get('ENDS',[])) or
            any(mat in stem for mat in NON_SOURCE_MATCH + CTTEE_FILTERS.get(committeeId,{}).get('MATCH',[]))):
        return
    filename = cleanFilename(stem)
    if len(filename) > 1:
        if filename not in releases[committeeId]:
            releases[committeeId][filename] = date
            files[committeeId][filename] = []
            print(f" - {filename}\t\t\t{file}")
        files[committeeId][filename].append(path)

def main():
    uc = UrlCache(silent=True)
    find_ls = uc.get(FIND_LS, name='find-ls2.txt.gz')
    # Sample listing line and the field indices after split():
    # -rw-rw-r-- 1 svnwc svnwc 479 2022-06-17 12:55 UTC ./.htaccess
    #     0      1   2     3    4      5        6    7      8
    with gzip.open(find_ls, mode='rt') as r:
        for l in r:
            fields = l.split() # split the find line (the split drops the final LF)
            if not fields[0].startswith('-'): # only want plain files
                continue
            path = fields[8][2:] # last entry on line is the path; also drop the ./ prefix
            segs = path.split('/')
            if len(segs) == 1: # ignore top level files
                continue
            file = segs.pop() # basename
            # Ignore invisible files
            if file.startswith('.') or file in ['favicon.ico', 'META']:
                continue
            committeeId = segs[0]
            if any(seg in SKIP_DIRS + CTTEE_FILTERS.get(committeeId,{}).get('DIRS',[]) for seg in segs):
                # print('SKIP', segs)
                continue
            if committeeId in ['zzz']:
                continue
            if committeeId == 'incubator':
                podling = segs[1]
                committeeId = f'incubator-{podling}'
            # Now store the info
            stamp = fields[5]
            parseFile(committeeId, file, stamp, path)

if __name__ == '__main__':
    mypath = realpath(__file__)
    assert '/scripts/cronjobs/' in mypath, "Expected this source file to be under scripts/cronjobs !"
    myhome = dirname(dirname(dirname(mypath))) # home dir is ../..
    jsondir = join(myhome, 'site', 'json', 'foundation') # where the JSON files go
    main()
    print("Writing releases.json")
    with open(join(jsondir, "releases.json"), "w", encoding='utf-8') as f:
        json.dump(releases, f, sort_keys=True, indent=0)
    print("Writing releases-files.json")
    with open(join(jsondir, "releases-files.json"), "w", encoding='utf-8') as f:
        json.dump(files, f, sort_keys=True, indent=0)
    print("All done!")
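
# Illustrative sketch of how a single listing entry ends up in the two maps. The committee id
# 'foo', the dates and the file paths are hypothetical assumptions, not real download paths,
# and this helper is not called anywhere in the script; it is only for ad-hoc experimentation.
def _demoParseFile():
    # a .tar.gz source archive is accepted and keyed by its cleaned release id
    parseFile('foo', 'apache-foo-1.2.3-src.tar.gz', '2023-01-15', 'foo/1.2.3/apache-foo-1.2.3-src.tar.gz')
    # a binary artifact for the same release is rejected by NON_SOURCE_ENDS ('-bin')
    parseFile('foo', 'apache-foo-1.2.3-bin.tar.gz', '2023-01-15', 'foo/1.2.3/apache-foo-1.2.3-bin.tar.gz')
    assert releases['foo'] == {'apache-foo-1.2.3': '2023-01-15'}
    assert files['foo']['apache-foo-1.2.3'] == ['foo/1.2.3/apache-foo-1.2.3-src.tar.gz']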