in src/server/helpers/user_reports_transform.ts [85:176]
/**
 * Turns raw user-report rows and known URL patterns into the JSON payload
 * used for triage: reports grouped by root domain, ordered by how many
 * reports each domain received.
 *
 * Processing steps:
 * 1. Strip every `*` wildcard from each URL pattern so it can be used as a
 *    plain substring match.
 * 2. Drop reports without a URL, unwrap `reported_at`, and attach the bugs
 *    whose (stripped) pattern is contained in the report URL as
 *    `related_bugs` (`{ number, title }`).
 * 3. Group reports by root domain. IP hosts are used as-is; URLs that cannot
 *    be parsed, or hosts for which `psl` yields no domain, fall into the
 *    `"[unknown]"` group.
 * 4. Per group, keep reports that have a comment and are either predicted
 *    `"valid"`, or predicted `"invalid"` with `prob` below
 *    `probabilityThreshold`; take the first 10 of those and then remove the
 *    ones that already have actions.
 * 5. Sort groups by their total report count, descending.
 *
 * @param rawReports - Report rows; treated as `UserReport` objects whose
 *   `reported_at` is usually wrapped as `{ value: ... }`.
 * @param rawUrlPatterns - Pattern rows; treated as `UrlPattern` objects
 *   (`url_pattern`, `bug`, `title` are read).
 * @param probabilityThreshold - Upper bound (exclusive) on `prob` for a report
 *   predicted `"invalid"` to still be included.
 * @param logger - Receives verbose progress messages.
 * @returns A JSON string of an array of
 *   `{ root_domain, reports_count, reports }` objects. `reports_count` counts
 *   all reports of the domain, including those filtered out of `reports`.
 */
export function transformUserReports(
  rawReports: any[],
  rawUrlPatterns: any[],
  probabilityThreshold: number,
  logger: Logger,
): string {
  logger.verbose("Pre-processing URL patterns...");
  const preprocessedUrlPatterns = rawUrlPatterns.map((pattern: UrlPattern) => {
    const newPattern = Object.assign({}, pattern);
    // [ToDo] We probably should build actual RegExp matching, so this should
    // generate a matchable RegExp
    // A global regex is required here: String.prototype.replace with a string
    // search value only removes the first "*", which would leave patterns
    // like "*.example.com/*" with a literal "*" that never matches a URL.
    newPattern.url_pattern = pattern.url_pattern.replace(/\*/g, "");
    return newPattern;
  });

  logger.verbose("Pre-processing user reports...");
  const preprocessedReports = rawReports
    .filter((report: UserReport) => {
      // [ToDo] some reports currently don't have a URL attached. This breaks
      // all kinds of assumptions here, so let's remove them for now. Tom is
      // investigating why this happens.
      return !!report.url;
    })
    .map((report: UserReport) => {
      const newReport = Object.assign({}, report);
      // For some reason, the reported_at as it comes out of the database is
      // actually an object {value: "[timestamp]"}.
      // [ToDo] figure out why, and if this is something that could change
      // Until that is settled, only unwrap when the wrapper is actually
      // present, so a missing or already-plain timestamp neither throws nor
      // gets replaced by `undefined`.
      const rawReportedAt = (report as any).reported_at;
      newReport.reported_at =
        rawReportedAt !== null && typeof rawReportedAt === "object" && "value" in rawReportedAt
          ? rawReportedAt.value
          : rawReportedAt;
      newReport.related_bugs = preprocessedUrlPatterns
        .filter((pattern) => report.url.includes(pattern.url_pattern))
        .map((pattern) => ({
          number: pattern.bug,
          title: pattern.title,
        }));
      return newReport;
    });

  logger.verbose("Grouping reports by root domain...");
  // Memoized because many reports share the same hostname.
  const normalizeHostname = _.memoize((hostname: string) => {
    if (isIP(hostname)) {
      return hostname;
    }
    const parsedDomain = psl.parse(hostname);
    return (parsedDomain as psl.ParsedDomain).domain || "[unknown]";
  });
  const groupedByDomainDict = _.groupBy(preprocessedReports, (report) => {
    try {
      const parsedUrl = new URL(report.url);
      return normalizeHostname(parsedUrl.hostname);
    } catch {
      // Anything that cannot be parsed/normalized is collected in one bucket.
      return "[unknown]";
    }
  });

  logger.verbose("Transforming grouped Dictionary into Object");
  const groupedByDomain = Object.entries(groupedByDomainDict).map(([root_domain, reports]) => {
    const reportSubset = reports
      // First, let's filter out all the reports we don't want to triage:
      // - anything labeled as invalid by our ML model
      // - reports without a comment
      .filter(
        (report) =>
          !!report.comments &&
          (report.prediction === "valid" ||
            (report.prediction === "invalid" && report.prob < probabilityThreshold)),
      )
      // Then, slice the first 10 reports out, then remove all reports that have
      // been actioned upon. We do it in this order to make sure that there
      // won't be a new set of 10 issues after all of them have been worked on.
      .slice(0, 10)
      .filter((report) => !report.has_actions);
    return {
      root_domain,
      // Note: this is the count of *all* reports, even ones we filtered out.
      reports_count: reports.length,
      reports: reportSubset,
    };
  });

  logger.verbose("Sorting by the total number of reports per domain...");
  // `groupedByDomain` is a fresh local array, so sorting in place is safe.
  const sorted = groupedByDomain.sort((a, b) => b.reports_count - a.reports_count);

  logger.verbose("Writing response...");
  return JSON.stringify(sorted);
}