scripts/cronjobs/parsereleases.py
#!/usr/bin/env python3
import errtee # pylint: disable=unused-import
from collections import defaultdict
import gzip
import json
from os.path import dirname, join, realpath
from urlutils import UrlCache
"""
Reads the list of files in https://downloads.apache.org/zzz/find-ls2.txt.gz
Creates:
../../site/json/foundation/releases.json
Format:
{ top-level dir: { release-id: date}, ... }
The release id is derived from the filename by removing common suffixes etc.; see cleanFilename()
The date comes from the first file found for that release id
../../site/json/foundation/releases-files.json
Format:
{ top-level dir: { release-id: [list of files for that release-id]}, ... }
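Example entries (illustrative values only):
releases.json:       { "foo": { "foo-1.2.3": "2023-01-01" }, ... }
releases-files.json: { "foo": { "foo-1.2.3": ["foo/foo-1.2.3-src.tar.gz"] }, ... }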
"""
# Listing generated by find(1) -ls
FIND_LS = 'https://downloads.apache.org/zzz/find-ls2.txt.gz'
# key: committee-id, value: dict(key: release version, value: date)
releases = defaultdict(dict)
# key: committee-id, value: dict(key: release version, value: list of file names for the release)
files = defaultdict(lambda: defaultdict(list))
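# Illustrative example (hypothetical values), after parsing:
#   releases['foo']['foo-1.2.3'] == '2023-01-01'
#   files['foo']['foo-1.2.3'] == ['foo/foo-1.2.3-src.tar.gz']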
def cleanFilename(filename):
"""
Attempts to determine the release id to which a file belongs
Strips extensions such as .tgz etc, then suffixes such as -sources
Replaces qualifiers such as -assembly-, -parent- by '-'
Returns the simplified filename .
N.B. the ordering is significant, as the list is only scanned once
"""
for suffix in ['-all', '-src', '_src', '.src', '-sources', '_sources', '-source', '-bin', '-dist',
'-source-release', '-source-relase', '-tests', '-test', '-debug', '-uber',
'-macosx', '-distribution', '-example', '-native', '-win', '-win32', '-linux', '-pack', '-packaged', '-current', '-embedded',
'-py', '-py2', '-py2.6', '-py2.7', '-no', 'unix-distro', 'windows-distro', 'with', '-dep', '-standalone', '-webapp', '-dom', '-om',
'-32bit', '-i386', '_i386', '.i386', '-minimal', '-jettyconfig', '-py2.py3-none-any', 'newkey', 'oldkey', 'jars', '-jre13', '-hadoop1', '-hadoop2', '-project',
'-with-dependencies', '-client', '-server', 'server-webapps', '-full', '-all', '-standard', '-for-javaee', '-for-tomcat',
'hadoop1-scala2', '-deployer', '-fulldocs', '-embed', '-apps', '-app', '-ref', '-installer', '-bundle', '-java']:
# The above list could be simplified further
if filename.endswith(suffix):
filename = filename[:-len(suffix)]
for repl in ['-assembly-', '-minimal-', '-doc-', '-src-', '-webapp-', '-standalone-', '-parent-', '-project-', '-win32-']:
filename = filename.replace(repl, '-')
return filename
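# Illustrative examples (hypothetical filenames; the archive extension is stripped by the caller):
#   cleanFilename('foo-2.4.57-src')                  -> 'foo-2.4.57'
#   cleanFilename('apache-foo-1.2.3-source-release') -> 'apache-foo-1.2.3'
#   cleanFilename('foo-parent-1.0-sources')          -> 'foo-1.0'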
def cleanReleases(committeeId):
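"""Drop a committee that ended up with no recognised releases (and its associated file list)"""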
if len(releases[committeeId]) == 0:
del releases[committeeId]
del files[committeeId]
# all source releases must be one of these
VALID_TYPES = ['tgz', 'gz', 'zip', 'xz', 'bz2']
# for gz, xz and bz2, the next extension must be tar
TAR_TYPES = ['gz', 'xz', 'bz2']
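# e.g. (hypothetical names) foo-1.0-src.tar.gz and foo-1.0-src.zip qualify, but foo-1.0-src.gz (no .tar) does not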
# file name stems that finish with these strings are not source archives:
NON_SOURCE_ENDS = ['-amd64', '-aarch64', '-arm64', '.bin', '-bin', '-binary', '-deps', '-docs', '-javadoc', '-doc',
'-lib', '-lib-debug', '-manual', '-site', '-x64', '-x86', 'x86_64', '-ia32', '-i64',
'-war', '-64bit', '-arm64bit', '-doc', '-apidocs', '-bundle']
# stems that match these strings are not source archives:
NON_SOURCE_MATCH = ['-bin-', '-binary-', '-docs-', 'x86-windows', 'x64-windows']
# Warning: beware of accidentally matching Maven plugins!
# filters for dirs, matches and ends that may only apply to certain PMCs
CTTEE_FILTERS = {
"solr": {
"ENDS": ['-slim'],
"MATCH": [],
"DIRS": ['helm-charts']
}
}
# Don't visit these directories
SKIP_DIRS = ['META', 'aarch64current', 'bin', 'binaries', 'binary', 'changes', 'cpp', 'css', 'doc', 'docs',
'eclipse', 'features', 'hidden', 'images', 'issuesfixed', 'notes', 'patches', 'php', 'py', 'py3',
'repos', 'ruby', 'stable', 'stable1', 'stable2', 'styles', 'tmp', 'updatesite', 'website', 'wikipages']
def parseFile(committeeId, file, date, path):
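"""
Record the file under the given committee id if it appears to be a source archive.
Non-archive extensions and binary/doc artifacts are ignored; the release id is
derived via cleanFilename(), and its date is taken from the first matching file seen.
"""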
parts = file.split('.')
ext = parts.pop() # final extension
if ext not in VALID_TYPES or (ext in TAR_TYPES and parts.pop() != 'tar'):
return
stem = ".".join(parts) # the filename stem without the archive suffix(es)
if (any(stem.endswith(end) for end in NON_SOURCE_ENDS + CTTEE_FILTERS.get(committeeId,{}).get('ENDS',[])) or
any(mat in stem for mat in NON_SOURCE_MATCH + CTTEE_FILTERS.get(committeeId,{}).get('MATCH',[]))):
return
filename = cleanFilename(stem)
if len(filename) > 1:
if filename not in releases[committeeId]:
releases[committeeId][filename] = date
files[committeeId][filename] = []
print(f" - {filename}\t\t\t{file}")
files[committeeId][filename].append(path)
def main():
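"""Parse the find-ls listing and populate the releases and files dicts"""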
uc = UrlCache(silent=True)
find_ls = uc.get(FIND_LS, name='find-ls2.txt.gz')
# -rw-rw-r-- 1 svnwc svnwc 479 2022-06-17 12:55 UTC ./.htaccess
# 0 1 2 3 4 5 6 7 8
with gzip.open(find_ls, mode='rt') as r:
for l in r:
fields = l.split() # split the find line (the split drops the final LF)
if not fields[0].startswith('-'): # only want plain files
continue
path = fields[8][2:] # last entry on line is the path; also drop the ./ prefix
segs = path.split('/')
if len(segs) == 1: # ignore top level files
continue
file = segs.pop() # basename
# Ignore invisible files
if file.startswith('.') or file in ['favicon.ico', 'META']:
continue
committeeId = segs[0]
if any( seg in SKIP_DIRS + CTTEE_FILTERS.get(committeeId,{}).get('DIRS',[]) for seg in segs):
# print('SKIP', segs)
continue
if committeeId in ['zzz']:
continue
if committeeId == 'incubator':
podling = segs[1]
committeeId = f'incubator-{podling}'
# Now store the info
stamp = fields[5]
parseFile(committeeId, file, stamp, path)
if __name__ == '__main__':
mypath = realpath(__file__)
assert '/scripts/cronjobs/' in mypath, "Expected this source file to be under scripts/cronjobs !"
myhome = dirname(dirname(dirname(mypath))) # home dir is ../..
jsondir = join(myhome, 'site', 'json', 'foundation') # where the JSON files go
main()
print("Writing releases.json")
with open(join(jsondir, "releases.json"), "w", encoding='utf-8') as f:
json.dump(releases, f, sort_keys=True, indent=0)
print("Writing releases-files.json")
with open(join(jsondir, "releases-files.json"), "w", encoding='utf-8') as f:
json.dump(files, f, sort_keys=True, indent=0)
print("All done!")