/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/**
 * The data that is being downloaded in this script is provided by
 * GeoNames (https://www.geonames.org/). We are using the data to compile a set
 * of US cities and states that match the needs of this project. Their work is
 * licensed under a Creative Commons Attribution 4.0 License:
 * https://creativecommons.org/licenses/by/4.0/.
 *
 * All database dumps and table definitions can be found here:
 * https://download.geonames.org/export/dump/.
 */
import https from "https";
import {
  createWriteStream,
  existsSync,
  mkdirSync,
  readFileSync,
  rmSync,
  writeFileSync,
} from "fs";
import Sentry from "@sentry/nextjs";
import os from "os";
import path from "path";
import AdmZip from "adm-zip";
import { uploadToS3 } from "../../utils/s3.js";

const REMOTE_DATA_URL = "https://download.geonames.org/export/dump";
const DATA_COUNTRY_CODE = "US";
const LOCATIONS_DATA_FILE = "locationAutocompleteData.json";
const FETCH_REMOTE_DATASETS = true;
const CLEANUP_TMP_DATA_AFTER_FINISHED = true;
const SENTRY_SLUG = "cron-create-location-autocomplete";

Sentry.init({
  environment: process.env.APP_ENV,
  dsn: process.env.SENTRY_DSN,
  tracesSampleRate: 1.0,
});

const checkInId = Sentry.captureCheckIn({
  monitorSlug: SENTRY_SLUG,
  status: "in_progress",
});

// Only include populated places that are a city, town, village, or another
// agglomeration of buildings where people live and work.
// Feature classes and codes: http://www.geonames.org/export/codes.html
const allowedFeatureClass = "P";
const allowedFeatureCodes = [
  "PPL",
  "PPLA",
  "PPLA2",
  "PPLA3",
  "PPLA4",
  "PPLC",
  "PPLL",
];
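
// For reference (per the GeoNames feature-code page linked above): "PPL" is a
// generic populated place, "PPLA"–"PPLA4" are seats of first- to fourth-order
// administrative divisions, "PPLC" is a capital of a political entity, and
// "PPLL" is a populated locality.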

/**
 * Logs the progress of a task.
 *
 * @param {number} currentCount - The current count.
 * @param {number} totalCount - The total count.
 */
function logProgress(currentCount, totalCount) {
  const progress = Math.round(((currentCount + 1) / totalCount) * 100);
  process.stdout.write(
    `-> ${currentCount + 1} / ${totalCount} (${progress}%) \r`,
  );
}

/**
 * Writes the content of a remote file to a local write stream.
 *
 * @param {Object} param - The parameters for the function.
 * @param {string} param.url - The URL of the remote file.
 * @param {import("fs").WriteStream} param.writeStream - The write stream the file content is written to.
 * @returns {Promise<unknown>} Resolves when the file has been written.
 */
function writeFromRemoteFile({ url, writeStream }) {
  return new Promise((resolve, reject) => {
    const request = https.get(url, (res) => {
      res.on("end", () => {
        writeStream.close(() => resolve(res));
      });
      res.on("error", (error) => {
        reject(error);
      });
      res.pipe(writeStream);
    });
    // Also reject on request-level errors (e.g. DNS or connection failures),
    // which are emitted on the request rather than on the response.
    request.on("error", (error) => {
      reject(error);
    });
  });
}

/**
 * Fetches the remote archive.
 *
 * @param {Object} param - The parameters for the function.
 * @param {string} param.remoteArchiveUrl - The URL of the remote archive.
 * @param {string} param.localDownloadPath - The local path where the file will be downloaded.
 * @param {string} param.localExtractionPath - The local path where the archive will be extracted.
 * @returns {Promise<void>} Resolves when the extraction is complete.
 */
async function fetchRemoteArchive({
  remoteArchiveUrl,
  localDownloadPath,
  localExtractionPath,
}) {
  console.info(
    `Downloading remote file: ${remoteArchiveUrl} -> ${localDownloadPath}`,
  );
  await writeFromRemoteFile({
    url: remoteArchiveUrl,
    writeStream: createWriteStream(localDownloadPath),
  });

  console.info(`Extracting: ${localDownloadPath} -> ${localExtractionPath}`);
  const zip = new AdmZip(localDownloadPath);
  await new Promise((resolve, reject) => {
    zip.extractAllToAsync(localExtractionPath, true, false, (error) =>
      error ? reject(error) : resolve(localExtractionPath),
    );
  });
}
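
// Script flow: download the GeoNames dumps, parse the alternate names and
// hierarchy tables, reduce the country file to the relevant populated places,
// drop places that are nested under another populated place, then write the
// resulting JSON and upload it to S3 (unless `--skip-upload` is passed).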

// Track the overall outcome so the Sentry check-in at the end reports failures.
/** @type {"ok" | "error"} */
let checkInStatus = "ok";

try {
  console.info("Create autocomplete location data");
  const startTime = Date.now();

  const tmpDirPath = path.join(os.tmpdir(), "fx-monitor");
  console.info(`Creating data directory: ${tmpDirPath}`);
  if (!existsSync(tmpDirPath)) {
    mkdirSync(tmpDirPath);
  }

  const localDestinationPath = {
    locations: `${tmpDirPath}/locations-${DATA_COUNTRY_CODE}-extracted`,
    alternateNames: `${tmpDirPath}/alternatenames-${DATA_COUNTRY_CODE}-extracted`,
    hierarchy: `${tmpDirPath}/hierarchy-extracted`,
  };

  if (FETCH_REMOTE_DATASETS) {
    console.info("Downloading all locations");
    await fetchRemoteArchive({
      remoteArchiveUrl: `${REMOTE_DATA_URL}/${DATA_COUNTRY_CODE}.zip`,
      localDownloadPath: `${tmpDirPath}/locations-${DATA_COUNTRY_CODE}.zip`,
      localExtractionPath: localDestinationPath.locations,
    });

    console.info("Downloading alternate names");
    await fetchRemoteArchive({
      remoteArchiveUrl: `${REMOTE_DATA_URL}/alternatenames/${DATA_COUNTRY_CODE}.zip`,
      localDownloadPath: `${tmpDirPath}/alternatenames-${DATA_COUNTRY_CODE}.zip`,
      localExtractionPath: localDestinationPath.alternateNames,
    });

    console.info("Downloading hierarchy data");
    await fetchRemoteArchive({
      remoteArchiveUrl: `${REMOTE_DATA_URL}/hierarchy.zip`,
      localDownloadPath: `${tmpDirPath}/hierarchy.zip`,
      localExtractionPath: localDestinationPath.hierarchy,
    });
  } else {
    console.info("Skipping downloading remote data");
  }
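
  // The alternate names table is tab-delimited; the columns destructured below
  // follow the table definition on the GeoNames dump page (alternateNameId,
  // geonameid, isolanguage, alternate name, isPreferredName, isShortName,
  // isColloquial, isHistoric, from, to).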
console.info("Reading file: Alternate location names");
const alternateNamesData = readFileSync(
`${localDestinationPath.alternateNames}/${DATA_COUNTRY_CODE}.txt`,
"utf8",
);
console.info("Parsing data: Alternate location names");
const alternateNameRows = alternateNamesData.split("\n");
const parsedAlternateNames = alternateNameRows
.map((alternateNamesLine) => {
const [
alternateNameId,
geonameId,
isolanguage,
alternateName,
isPreferredName,
_isShortName,
_isColloquial,
isHistoric,
_from,
_to,
] = alternateNamesLine.split("\t"); // lines are tab delimited
const isAbbreviation = isolanguage === "abbr";
const isRelevantAlternateName =
(isolanguage === "en" || isAbbreviation) && Number(isHistoric) !== 1;
if (isRelevantAlternateName) {
return {
id: alternateNameId,
alternateOf: geonameId,
name: alternateName,
isPreferredName,
};
}
return null;
})
.filter((alternateName) => alternateName !== null);
console.info("Reading file: Hierarchy");
const hierachyData = readFileSync(
`${localDestinationPath.hierarchy}/hierarchy.txt`,
"utf8",
);
console.info("Parsing data: Location hierarchy");
const hierachyDataRows = hierachyData.split("\n");
const hierarchyIds = hierachyDataRows.map((hierachyRow) => {
const [locationParentId, locationChildId, _hierachyType] =
hierachyRow.split("\t");
return [locationParentId, locationChildId];
});
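
  // The main "geoname" table is also tab-delimited; the destructured columns
  // below mirror the table definition on the GeoNames dump page.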
console.info("Reading file: All locations");
const locationData = readFileSync(
`${localDestinationPath.locations}/${DATA_COUNTRY_CODE}.txt`,
"utf8",
);
console.info("Parsing data: All locations");
const locationDataRows = locationData.split("\n");
const locationRowCount = locationDataRows.length;
const locationDataPopulated = locationDataRows.reduce(
(
/** @type {Array<import("../../app/api/v1/location-autocomplete/types.ts").RelevantLocation>} */
relevantLocations,
location,
rowIndex,
) => {
logProgress(rowIndex, locationRowCount);
const [
geonameId,
name,
_asciiname,
_alternatenames,
_latitude,
_longitude,
featureClass,
featureCode,
_countryCode,
_cc2,
admin1Code,
_admin2Code,
_admin3Code,
_admin4Code,
population,
_elevation,
_dem,
_timezone,
_modificationDate,
] = location.split("\t"); // lines are tab delimited
const isPopulatedPlaceOfInterest =
featureClass === allowedFeatureClass &&
allowedFeatureCodes.includes(featureCode);
if (isPopulatedPlaceOfInterest) {
const alternateNames = parsedAlternateNames.filter(
({ alternateOf, name: alternateName }) =>
alternateOf === geonameId && alternateName !== name,
);
const preferredName = alternateNames.find(
({ isPreferredName }) => isPreferredName === "1",
);
const alternateNamesFinal = alternateNames.map((alternateName) => {
// Include the original name as an alternative name if we’ll use an
// alternate name that is the preferred name.
if (preferredName && preferredName.name === alternateName.name) {
return name;
}
return alternateName.name;
});
// NOTE: Using short keys and only including entries when available
// keeps the resulting JSON significantly smaller.
relevantLocations.push({
id: geonameId,
// switch names if an alternate name is the preferred location name
n: preferredName ? preferredName.name : name,
s: admin1Code,
...(Number(population) > 0 && {
p: population,
}),
...(alternateNames &&
alternateNames.length > 0 && {
a: alternateNamesFinal,
}),
});
}
return relevantLocations;
},
[],
);
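
  // Each entry in `hierarchyIds` is a [parentId, childId] pair taken from
  // hierarchy.txt, so the filter below checks whether any pair lists a
  // location as the child of another place in the populated set.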
  // Filter out locations that have another populated place as a parent.
  console.info("Filtering by hierarchy");
  const locationDataPopulatedCount = locationDataPopulated.length;
  const locationDataPopulatedTopLevel = locationDataPopulated.filter(
    (locationPopulated, rowIndex) => {
      logProgress(rowIndex, locationDataPopulatedCount);

      const hasPopulatedParentLocation = hierarchyIds.some(
        ([parentId, childId]) => {
          if (locationPopulated.id !== childId) {
            return false;
          }

          return locationDataPopulated.some((location) => {
            return (
              location.id === parentId &&
              // @ts-ignore FIXME: `featureClass` does not exist in `location`.
              // The result of the top-level filter still returns the expected
              // results for now.
              location.featureClass === allowedFeatureClass
            );
          });
        },
      );

      return !hasPopulatedParentLocation;
    },
  );

  console.info(
    `Number of relevant locations found: ${locationDataPopulatedTopLevel.length}`,
  );
  console.info(`Writing location data to file: ${LOCATIONS_DATA_FILE}`);
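
  // Each record in `data` uses compact keys: `n` = name, `s` = state
  // (the admin1 code), `p` = population (only present when > 0), and
  // `a` = alternate names (only present when there are any).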
  const locationDataFinal = {
    name: "fx-monitor-location-autocomplete-data",
    description:
      "The data in this file is provided by GeoNames (https://www.geonames.org/). We are using the data to compile a set of US cities and states that match the needs of this project. Their work is licensed under a Creative Commons Attribution 4.0 License: https://creativecommons.org/licenses/by/4.0/. All database dumps and table definitions can be found here: https://download.geonames.org/export/dump/.",
    created_at: startTime,
    license: {
      type: "CC BY 4.0",
      url: "https://creativecommons.org/licenses/by/4.0/",
    },
    data: locationDataPopulatedTopLevel,
  };
  writeFileSync(LOCATIONS_DATA_FILE, JSON.stringify(locationDataFinal));

  const fileBuffer = readFileSync(LOCATIONS_DATA_FILE);
  if (process.argv.includes("--skip-upload")) {
    console.debug("Skipping S3 upload");
  } else {
    await uploadToS3(`autocomplete/${LOCATIONS_DATA_FILE}`, fileBuffer);
  }

  if (CLEANUP_TMP_DATA_AFTER_FINISHED) {
    console.info("Cleaning up data directory");
    rmSync(tmpDirPath, {
      recursive: true,
      force: true,
    });
  }

  const endTime = Date.now();
  console.info(
    `Created location data file successfully: Executed in ${
      (endTime - startTime) / 1000
    }s`,
  );
} catch (error) {
  console.error("Creating location file failed with:", error);
  // Mark the run as failed so the monitor check-in below does not report "ok".
  checkInStatus = "error";
}

Sentry.captureCheckIn({
  checkInId,
  monitorSlug: SENTRY_SLUG,
  status: checkInStatus,
});

setTimeout(process.exit, 1000);