generate-attribution/generate-attribution-file.js (471 lines of code) (raw):

const path = require('path');
const fs = require('fs');
const fsPromises = fs.promises;
const { parse } = require('csv-parse/sync');
const HTMLParser = require('node-html-parser');
const { https } = require('follow-redirects');
const { promisify } = require('util');
const retry = require('async-retry');
const glob = require("glob-promise");
const homedir = require('os').homedir();
const { setImmediate } = require("timers/promises");

// License types for which the attribution also links to the package source.
// https://github.com/google/licenseclassifier/blob/842c0d70d7027215932deb13801890992c9ba364/license_type.go#L323
const RECIPROCAL_LICENSE_TYPES = ["APSL-1.0", "APSL-1.1", "APSL-1.2", "APSL-2.0", "CDDL-1.0", "CDDL-1.1", "CPL-1.0", "EPL-1.0", "FreeImage", "IPL-1.0", "MPL-1.0", "MPL-1.1", "MPL-2.0", "Ruby"];

// License types whose texts are grouped together once copyright lines are stripped.
const typesCanBeMergedWithoutCopyRight = ['MIT'];

// Timeout applied to every outgoing HTTP request.
const REQUEST_TIMEOUT_MS = 15 * 1000;

const sortByModule = (a, b) => a.module.localeCompare(b.module);

// On-disk location of the url -> result cache shared between runs.
const cacheFile = path.join(homedir, ".generate-attribution", '.cache');
let httpCache = {};

/**
 * Promise flavour of `https.get`, installed through `util.promisify.custom`.
 *
 * All arguments are forwarded, so both `get(url)` and `get(url, options)` work.
 * The resolved response carries an extra `end` promise that settles once the
 * body has been fully received (or rejects if the response stream errors).
 *
 * @param {...*} args - arguments accepted by `https.get`, minus the callback
 * @returns {Promise<object>} the response object
 */
https.get[promisify.custom] = function getAsync(...args) {
    return new Promise((resolve, reject) => {
        const request = https.get(...args, (response) => {
            response.end = new Promise((resolveEnd, rejectEnd) => {
                response.on('end', resolveEnd);
                response.on('error', rejectEnd);
            });
            resolve(response);
        });
        request.on('error', reject);
        // The `timeout` option only emits an event; abort explicitly so callers can retry.
        request.on('timeout', () => {
            reject(new Error('request timed out'));
            request.abort();
        });
    });
};
const get = promisify(https.get);

/**
 * @param {string} licenseType - license identifier, optionally with a suffix
 * @returns {boolean} true when the type starts with a reciprocal license type
 */
const doesRequireSourceLink = (licenseType) => {
    return RECIPROCAL_LICENSE_TYPES.some((type) => licenseType.startsWith(type));
};

/**
 * Compares two license texts while ignoring every whitespace character.
 *
 * @param {string} licenseA
 * @param {string} licenseB
 * @returns {boolean}
 */
const isLicenseFuzzyMatch = (licenseA, licenseB) => {
    return licenseA.replace(/\s/g, '') === licenseB.replace(/\s/g, '');
};

// NOTE(review): the line breaks inside the three templates below were
// reconstructed from a whitespace-collapsed copy of this file - confirm them
// against a previously generated ATTRIBUTION.txt.

/** Header lines for one dependency: name, version and repository. */
const moduleTemplate = (mod) => {
    return `
** ${mod.moduleOverride ?? mod.module}; version ${mod.version} --
${mod.repository}
`;
};

/** NOTICE section for one dependency, or '' when it has no notice content. */
const noticeTemplate = (mod) => {
    if (!mod.noticeContent) return '';
    return `
* For ${mod.module} see also this required NOTICE:
${mod.noticeContent}
`;
};

/** One line per copyright statement collected for the dependency. */
const copyrightsTemplate = (mod) => {
    let value = '';
    mod.copyrights.forEach((copyright) => {
        value += `${copyright}\n`;
    });
    return value;
};

/** Source-code pointer emitted for reciprocal licenses. */
const sourceCodeTemplate = (mod) => {
    return `
* Package ${mod.module}'s source code may be found at:
${mod.repository}/tree/${mod.version}
`;
};

/**
 * Splits a license text into its copyright lines and the remaining body.
 *
 * @param {string} license - full license text
 * @returns {{licenseContent: string, copyrights: (string[]|null)}} the text
 *   without the "MIT License" title and copyright lines, plus the removed
 *   copyright lines (null when there were none)
 */
const extractCopyRights = (license) => {
    license = license.replace(/(The |^)MIT License.*$/m, '');
    const matches = license.match(/(Copyright \([cC]\).*)/g);
    if (matches) {
        matches.forEach((match) => {
            license = license.replace(match, '');
        });
    }
    license = license.replace(/^ +/gm, '').trim();
    return { licenseContent: license, copyrights: matches };
};

/**
 * Maps `f` over `arr` with at most `n` calls in flight at once.
 *
 * @param {Array} arr - input values
 * @param {function(*): Promise<*>} f - async mapper
 * @param {number} n - maximum concurrency
 * @returns {Promise<Array>} results in input order
 */
const throttledConcurrencyPromiseAll = async (arr, f, n) => {
    const results = Array(arr.length);
    // All workers pull from the same iterator, so each entry is handled exactly once.
    const entries = arr.entries();
    const worker = async () => {
        for (const [key, val] of entries) {
            results[key] = await f(val);
        }
    };
    await Promise.all(Array.from({ length: Math.min(arr.length, n) }, worker));
    return results;
};

/**
 * Normalises a repository reference.
 *
 * @param {string} repo - repository string; only the last space-separated token is used
 * @param {boolean} [stripHttps=false] - also drop a leading "https://"
 * @returns {string} the URL without a trailing ".git"
 */
const parseRepoURL = (repo, stripHttps = false) => {
    const parts = repo.split(' ');
    let repoURL = parts[parts.length - 1];

    // remove https prefix
    if (stripHttps && repoURL.startsWith('https://')) {
        repoURL = repoURL.substring(8);
    }

    // remove trailing .git
    if (repoURL.endsWith('.git')) {
        repoURL = repoURL.substring(0, repoURL.length - 4);
    }
    return repoURL;
};

/**
 * Renders the per-dependency part of the attribution file.
 *
 * @param {object} dep - dependency record
 * @returns {string} the rendered text, or '' (after logging a notice) when the
 *   module, repository or license content is missing
 */
const generateDependencyAttribution = (dep) => {
    let attributionOutput = '';
    if (!dep.module) {
        console.log("NOTICE: Missing module", dep);
    } else if (!dep.repository) {
        console.log("NOTICE: Missing repository", dep);
    } else if (!dep.licenseContent) {
        console.log("NOTICE: Missing licenseContent", dep);
    } else {
        if (!dep.version) {
            console.log("NOTICE: Missing version, check it out", dep.module);
            dep.version = 'v0.0.0';
        }

        attributionOutput += moduleTemplate(dep);

        if (doesRequireSourceLink(dep.licenseType)) {
            attributionOutput += sourceCodeTemplate(dep);
        }

        if (dep.copyrights) {
            attributionOutput += copyrightsTemplate(dep);
        }
    }
    return attributionOutput;
};

/**
 * Trims a BSD-3-Clause license that was found inside a README.txt down to the
 * license text itself. Every other file is returned untouched.
 *
 * @param {string} fileName - base name of the file the content came from
 * @param {string} licenseType - detected license type
 * @param {string} content - file content
 * @returns {string} the license text; the unmodified content when the expected
 *   pattern is not found
 */
const cleanLicense = (fileName, licenseType, content) => {
    if (fileName !== 'README.txt' || licenseType !== 'BSD-3-Clause') {
        return content;
    }

    // Some packages do not have license files. go-license correctly finds the license
    // in the readme, but there is additional content that does not need to be in the attribution file
    // strip that out here
    const match = content.match(/Copyright.*DAMAGE\./s);
    if (match) {
        return match[0];
    }

    console.log("NOTICE: readme.txt did not match expect license regex. Check it out");
    // Keep the whole file rather than dropping the license from the attribution.
    return content;
};

/**
 * Replaces empty module names with the root module name.
 *
 * @param {object[]} dependencies - mutated in place
 * @returns {object[]} the same array
 */
const fixEmptyModule = (dependencies) => {
    // some root packages come in as empty string from go-licenses
    dependencies.forEach((dep) => {
        if (dep.module === '') {
            dep.modulePath = rootModuleName;
            dep.module = rootModuleName;
        }
    });
    return dependencies;
};

/**
 * Appends an entry for the Go standard library license.
 *
 * @param {object[]} dependencies - mutated in place
 * @returns {Promise<object[]>} the same array
 */
async function addGoLicense(dependencies) {
    // go-licenses excludes the stdlib when pulling deps, adding the golang license to all to cover for this
    const goLicensePath = `https://github.com/golang/go/blob/${goLangVersion}/LICENSE`;
    dependencies.push({
        "module": "golang.org/go",
        "licensePath": goLicensePath,
        "licenseType": "BSD-3-Clause",
        "version": `${goLangVersion}`,
        "modulePath": "golang.org/go",
        "repository": "https://github.com/golang/go",
        "licenseContent": await readLicenseFromUpstream(`${goLicensePath}?raw=true`)
    });
    return dependencies;
}

/**
 * Returns the cached value for `url`, computing it through `fn` (with up to
 * five retries) on a cache miss.
 *
 * @param {string} url - cache key
 * @param {function} fn - async-retry callback producing the value
 * @returns {Promise<*>}
 */
async function cacheHttp(url, fn) {
    if (!Object.hasOwn(httpCache, url)) {
        const res = await retry(fn, {
            retries: 5,
            onRetry: (err, num) => {
                console.log(`NOTICE: retry attempt ${num} for ${url} due to ${err}`);
            }
        });
        httpCache[url] = res;
    }
    // Yield to the event loop even on a cache hit.
    await setImmediate();
    return httpCache[url];
}

/**
 * @param {string} url
 * @returns {boolean} true when the URL points at a GitHub-owned host
 */
const isGitHubUrl = (url) => {
    try {
        const { hostname } = new URL(url);
        return hostname === 'github.com'
            || hostname.endsWith('.github.com')
            || hostname.endsWith('.githubusercontent.com');
    } catch {
        return false;
    }
};

/**
 * Builds the request options for `url`. The GitHub token is attached only for
 * GitHub hosts so it is never sent to third-party servers.
 *
 * @param {string} url
 * @returns {Promise<object>} options for `get`
 */
async function buildRequestOptions(url) {
    const options = isGitHubUrl(url) ? await generateAuthorizationHeader() : {};
    options.timeout = REQUEST_TIMEOUT_MS;
    return options;
}

/**
 * Downloads a license text, using the http cache.
 *
 * @param {string} upstreamUrl
 * @returns {Promise<string>} the response body
 */
async function readLicenseFromUpstream(upstreamUrl) {
    const options = await buildRequestOptions(upstreamUrl);
    return cacheHttp(upstreamUrl, async () => {
        // Fresh buffer per attempt so a retry never appends to a partial body.
        let finalDoc = '';
        const res = await get(upstreamUrl, options);
        res.on('data', (d) => {
            finalDoc += d;
        });
        await res.end;
        if (res.statusCode !== 200) {
            // Never cache an error page as license text.
            throw new Error(`invalid response ${res.statusCode} for ${upstreamUrl}`);
        }
        return finalDoc;
    });
}

/**
 * Resolves the repository URL of a Go package through its `go-import` meta
 * tag, using the http cache.
 *
 * @param {string} pkg - Go module/package path
 * @returns {Promise<string>} repository URL
 */
async function getPackageRepo(pkg) {
    const url = `https://${pkg}?go-get=1`;
    const options = await buildRequestOptions(url);
    return cacheHttp(url, async (bail, num) => {
        try {
            // Github doesnt seem to return the go-import for sub packages, only request for root packages
            if (pkg.startsWith('github.com') && pkg.split("/").length > 3) {
                return parseRepoURL(`https://${pkg}`);
            }

            // Fresh buffer per attempt so a retry never appends to a partial body.
            let finalDoc = '';
            const res = await get(url, options);
            res.on('data', (d) => {
                finalDoc += d;
            });
            await res.end;

            if (res.statusCode === 429) {
                throw new Error("Rate limited");
            }
            if (res.statusCode === 404) {
                console.log('NOTICE: request to get package url return invalid response', res.statusCode, url);
                return `https://${pkg}`;
            }
            if (res.statusCode !== 200) {
                throw new Error(`invalid response ${res.statusCode}`);
            }

            const htmlDoc = HTMLParser.parse(finalDoc);
            const metaTag = htmlDoc.querySelector('head meta[name=go-import]');
            if (metaTag) {
                return parseRepoURL(metaTag.getAttribute('content'));
            }
            return `https://${pkg}`;
        } catch (err) {
            // if the domain is not resolved, only retry 3 times
            if (err.code === 'ENOTFOUND' && num > 2) {
                return bail(err);
            }
            throw err;
        }
    });
}

/**
 * Reads the license text of a dependency from its directory of collected
 * license files. Exits the process when no candidate file can be read.
 *
 * @param {object} dep - dependency record (`licensePath`, `licenseType`)
 * @param {string} depLicensesDirPath - directory holding the dependency's license files
 * @returns {Promise<string>}
 */
async function readLicenseContent(dep, depLicensesDirPath) {
    const possiblePaths = [];
    if (dep.licensePath !== 'Unknown') {
        possiblePaths.push(path.join(depLicensesDirPath, path.basename(dep.licensePath)));
    }

    const files = await glob('LICEN+(S|C)E?(.md|.txt)', { cwd: depLicensesDirPath, nocase: true });
    files.forEach((file) => {
        possiblePaths.push(path.join(depLicensesDirPath, file));
    });

    for (const possiblePath of possiblePaths) {
        let licenseText;
        try {
            licenseText = await fsPromises.readFile(possiblePath, 'utf8');
        } catch {
            // Candidate is missing or unreadable; try the next one.
            continue;
        }
        return cleanLicense(path.basename(possiblePath), dep.licenseType, licenseText);
    }

    console.log('No license file for', dep);
    process.exit(1);
}

/**
 * @param {string} depLicensesDirPath - directory holding the dependency's license files
 * @returns {Promise<string|undefined>} content of the NOTICE file, or
 *   undefined when it cannot be read
 */
async function readNoticeFile(depLicensesDirPath) {
    const noticePath = path.join(depLicensesDirPath, 'NOTICE');
    try {
        return await fsPromises.readFile(noticePath, 'utf8');
    } catch {
        // A NOTICE file is optional.
        return undefined;
    }
}

/**
 * Parses `go-license.csv` from the attribution directory.
 *
 * @returns {Promise<object[]>} records with `module`, `licensePath`, `licenseType`
 */
async function parseCSV() {
    const csvFilePath = path.join(projectAttributionDirectory, 'go-license.csv');
    const csvContent = await fsPromises.readFile(csvFilePath, 'utf8');
    return parse(csvContent, { columns: ['module', 'licensePath', 'licenseType'] });
}

/**
 * Fills in `version`, `modulePath` and `moduleOverride` for every dependency by
 * matching it against the entries of `go-deps.json`. Exits the process when a
 * dependency cannot be matched.
 *
 * @param {object[]} dependencies - records from `parseCSV`, mutated in place
 * @returns {Promise<object[]>} the dependencies, in input order
 */
async function populateVersionAndModuleFromDep(dependencies) {
    const goListDepFilePath = path.join(projectAttributionDirectory, 'go-deps.json');
    const goListDepFileContent = await fsPromises.readFile(goListDepFilePath, 'utf8');
    const goListDeps = JSON.parse(goListDepFileContent);

    const isModuleMatch = (dep, goListDep, allowPrefixMatch = false) => {
        if (!goListDep.Module) return false;
        return dep.module === goListDep.Module.Path
            || dep.module === goListDep.ImportPath
            || (allowPrefixMatch && dep.module.startsWith(goListDep.Module.Path));
    };

    const getDepVersion = (goListDep) => {
        return goListDep.Module.Replace?.Version ?? goListDep.Module.Version;
    };

    const isVersionMismatch = (depVersion, goDepVersion) => {
        if (!depVersion || !goDepVersion) {
            return false;
        }
        return depVersion !== goDepVersion;
    };

    const isPathMismatch = (dep, goListDep) => {
        return dep.modulePath !== goListDep.Module.Path
            && dep.modulePath !== goListDep.Module.Replace?.Path;
    };

    const isRelativePath = (candidate) => {
        if (!candidate) return false;
        return candidate.startsWith('./') || candidate.startsWith('../');
    };

    const useReplacePath = (goListDep) => {
        // some replace paths end up being local to the repo
        // and start with ./ in that case leave the module alone
        // otherwise the replace module path is more accurate
        return goListDep.Module.Replace?.Path
            && !isRelativePath(goListDep.Module.Replace?.Path)
            && goListDep.Module.Replace.Path !== goListDep.Module.Path;
    };

    const handleFound = (dep, goListDep, found) => {
        const goDepVersion = getDepVersion(goListDep);
        if (found && (isVersionMismatch(dep.version, goDepVersion) || isPathMismatch(dep, goListDep))) {
            console.log("NOTICE: Dep matched go list more than once.  Check it out", dep, goListDep);
        }

        dep.version ??= goDepVersion;
        dep.modulePath = useReplacePath(goListDep) ? goListDep.Module.Replace.Path : goListDep.Module.Path;
        // NOTE(review): every other access uses `goListDep.Module`; the lowercase
        // `module` is presumably undefined, which makes moduleTemplate fall back
        // to dep.module. Left as is because the output depends on it - confirm.
        dep.moduleOverride = useReplacePath(goListDep) ? goListDep.Module.Replace.Path : goListDep.module;
    };

    const finalDeps = [];
    for (const dep of dependencies) {
        if (dep.version) {
            // the package itself which was added using the GIT_TAG
            finalDeps.push(dep);
            continue;
        }

        let found = false;
        for (const goListDep of goListDeps) {
            if (isModuleMatch(dep, goListDep)) {
                handleFound(dep, goListDep, found);
                found = true;
            }
        }

        if (!found) {
            // No exact match: fall back to prefix matching and
            // use the longest matching module path as our module
            let match;
            for (const goListDep of goListDeps) {
                if (isModuleMatch(dep, goListDep, true)
                    && (!match || goListDep.Module.Path.length > match.Module.Path.length)) {
                    match = goListDep;
                }
            }
            if (match) {
                handleFound(dep, match, found);
                found = true;
            }
        }

        if (!found) {
            console.log("ERROR: Dep from go-license.csv was not found.  Check it out", dep);
            process.exit(1);
        }
        finalDeps.push(dep);
    }

    return finalDeps;
}

/**
 * Builds request options carrying a GitHub token, taken from the
 * GITHUB_TOKEN environment variable or from the mounted secret file.
 *
 * @returns {Promise<object>} `{ headers: { Authorization } }`, or `{}` when no
 *   token is available
 */
async function generateAuthorizationHeader() {
    if (process.env.GITHUB_TOKEN) {
        return { headers: { 'Authorization': 'token ' + process.env.GITHUB_TOKEN } };
    }

    const githubTokenFile = "/secrets/github-secrets/token";
    try {
        const githubToken = await fsPromises.readFile(githubTokenFile, 'utf8');
        // A trailing newline in the file would make the header value invalid.
        return { headers: { 'Authorization': 'token ' + githubToken.trim() } };
    } catch {
        // No token file: continue unauthenticated.
        return {};
    }
}

/**
 * Sets the version read from the GIT_TAG file on every dependency that
 * belongs to the root module and has no version yet.
 *
 * @param {object[]} dependencies - mutated in place
 * @returns {Promise<object[]>} the same array
 */
async function populateRootComponentVersion(dependencies) {
    const version = await fsPromises.readFile(gitTagPath, 'utf8');
    dependencies.forEach((dep) => {
        if (dep.modulePath.startsWith(rootModuleName) && !dep.version) {
            dep.version = version.trim();
        }
    });
    return dependencies;
}

/**
 * Loads `licenseContent` and `noticeContent` for every dependency. Exits the
 * process when a dependency has no module path.
 *
 * @param {object[]} dependencies - mutated in place
 * @returns {Promise<object[]>} the same array
 */
async function populateLicenseAndNoticeContent(dependencies) {
    // For the apache license we can hardcode this since it is supposed to be unedited
    const officialApacheLicensePath = path.join(__dirname, 'LICENSE-2.0.txt');
    const officialApacheLicense = await fsPromises.readFile(officialApacheLicensePath, 'utf-8');

    for (const dep of dependencies) {
        const depLicensesDirPath = path.join(projectLicensesDirectory, dep.module);

        if (!dep.modulePath) {
            console.log("Dep has no module path", dep);
            process.exit(1);
        }

        if (dep.licenseType === 'Apache-2.0') {
            dep.licenseContent = officialApacheLicense;
        } else {
            dep.licenseContent = await readLicenseContent(dep, depLicensesDirPath);
        }

        dep.noticeContent = await readNoticeFile(depLicensesDirPath);
    }

    return dependencies;
}

/**
 * Resolves `repository` for every dependency, ten lookups at a time. A failed
 * lookup falls back to `https://<modulePath>`.
 *
 * @param {object[]} dependencies - mutated in place
 * @returns {Promise<object[]>} the same array
 */
async function populateRepoURLs(dependencies) {
    await throttledConcurrencyPromiseAll(dependencies, async (dep) => {
        try {
            dep.repository = await getPackageRepo(dep.modulePath);
        } catch (e) {
            console.log('NOTICE: error pulling package repo double check result for', dep.modulePath, e);
            dep.repository = `https://${dep.modulePath}`;
        }
    }, 10);
    return dependencies;
}

/**
 * Groups dependencies that share the same license text.
 *
 * @param {object[]} dependencies - sorted and mutated in place
 * @returns {Promise<Object<string, {licenseContent: string, deps: object[]}>>}
 *   groups keyed by license type, or by `<type>+<module>` when one type has
 *   several distinct texts
 */
async function groupByLicense(dependencies) {
    const uniqLicenses = {};
    const sortedDeps = dependencies.sort(sortByModule);

    sortedDeps.forEach((dep) => {
        const canBeMerged = typesCanBeMergedWithoutCopyRight.includes(dep.licenseType);
        if (canBeMerged) {
            // If the difference in content is only the copyright we can merge them into the same group
            const { licenseContent, copyrights } = extractCopyRights(dep.licenseContent);
            dep.copyrights = copyrights;
            dep.licenseContent = licenseContent;
        }

        const uniqueLicense = Object.entries(uniqLicenses)
            .find(([, { licenseContent }]) => {
                return (canBeMerged && isLicenseFuzzyMatch(dep.licenseContent, licenseContent))
                    || (dep.licenseContent === licenseContent);
            });

        let type = uniqueLicense ? uniqueLicense[0] : dep.licenseType;
        if (!uniqueLicense) {
            if (uniqLicenses[type]) {
                // Same license type but different content, just add module name to type to factor in for sorting later
                type = `${dep.licenseType}+${dep.module}`;
            }
            uniqLicenses[type] = { licenseContent: dep.licenseContent, deps: [] };
        }

        uniqLicenses[type].deps.push(dep);
    });

    return uniqLicenses;
}

/**
 * Writes `ATTRIBUTION.txt` and `summary.txt` into the attribution directory.
 *
 * @param {Object<string, {licenseContent: string, deps: object[]}>} dependenciesByLicenseType
 * @returns {Promise<void>}
 */
async function generateAttribution(dependenciesByLicenseType) {
    let attributionOutput = '';
    let summaryOutput = '';

    const sortedLicenseTypes = Object.keys(dependenciesByLicenseType).sort((a, b) => a.localeCompare(b));
    sortedLicenseTypes.forEach((licenseType) => {
        const { deps, licenseContent } = dependenciesByLicenseType[licenseType];
        const sortedDeps = deps.sort(sortByModule);

        sortedDeps.forEach((dep) => {
            attributionOutput += generateDependencyAttribution(dep);
        });

        attributionOutput += "\n" + licenseContent + "\n";

        sortedDeps.forEach((dep) => {
            attributionOutput += noticeTemplate(dep);
        });

        attributionOutput += "------\n";
        summaryOutput += `${licenseType} => ${sortedDeps.length}\n`;
    });

    await fsPromises.writeFile(path.join(projectAttributionDirectory, "summary.txt"), summaryOutput);
    return fsPromises.writeFile(path.join(projectAttributionDirectory, "ATTRIBUTION.txt"), attributionOutput);
}

/**
 * Loads the http cache from disk. A missing, unreadable or corrupt cache file
 * leaves the cache empty instead of failing the run.
 *
 * @returns {Promise<void>}
 */
async function loadHttpCache() {
    let data;
    try {
        data = await fsPromises.readFile(cacheFile, 'utf8');
    } catch {
        // No cache yet.
        return;
    }

    try {
        const parsed = JSON.parse(data);
        if (parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed)) {
            httpCache = parsed;
        }
    } catch (err) {
        console.log(`NOTICE: ignoring unreadable http cache ${cacheFile} due to ${err}`);
    }
}

/**
 * Persists the http cache, creating its directory when needed.
 *
 * @returns {Promise<void>}
 */
async function saveHttpCache() {
    await fsPromises.mkdir(path.dirname(cacheFile), { recursive: true });
    return fsPromises.writeFile(cacheFile, JSON.stringify(httpCache));
}

/**
 * Runs the whole pipeline: parse inputs, enrich the dependency records, group
 * them by license and write the output files.
 *
 * @returns {Promise<void>}
 */
async function execute() {
    return parseCSV()
        .then(fixEmptyModule)
        .then(populateVersionAndModuleFromDep)
        .then(populateRootComponentVersion)
        .then(populateLicenseAndNoticeContent)
        .then(populateRepoURLs)
        .then(addGoLicense)
        .then(groupByLicense)
        .then(generateAttribution);
}

// Every positional argument is required: the path.join calls below reject undefined.
if (process.argv.length < 6) {
    console.error('Usage: generate-attribution-file.js <root-module> <project-directory> <golang-version> <output-directory>');
    process.exit(1);
}

const rootModuleName = parseRepoURL(process.argv[2], true);
const projectDirectory = process.argv[3];
const goLangVersion = process.argv[4];
const projectOutputDirectory = process.argv[5];
const gitTagPath = path.join(projectDirectory, 'GIT_TAG');
const projectLicensesDirectory = path.join(projectOutputDirectory, "LICENSES");
const projectAttributionDirectory = path.join(projectOutputDirectory, "attribution");

loadHttpCache()
    .then(execute)
    .then(saveHttpCache)
    .catch((err) => {
        console.error(err);
        process.exitCode = 1;
    });