export function transformUserReports()

in src/server/helpers/user_reports_transform.ts [85:176]


/**
 * Turns raw user reports and known URL patterns into the JSON payload used
 * for triage: reports grouped by root domain, most-reported domain first.
 *
 * Processing steps:
 *   1. Reports without a `url` are dropped.
 *   2. `reported_at` is unwrapped from its `{value: "..."}` database shape.
 *   3. Every report gets a `related_bugs` list built from the URL patterns
 *      that match its URL. `*` in a pattern matches any (possibly empty) run
 *      of characters; the pattern may match anywhere inside the URL.
 *   4. Reports are grouped by the domain `psl` derives from the hostname.
 *      IP hostnames are used verbatim, and anything that cannot be parsed
 *      ends up in the "[unknown]" group.
 *   5. Per group, only triage-worthy reports are kept (see inline comments),
 *      while `reports_count` still reflects every report in the group.
 *
 * @param rawReports Report rows; each is expected to carry `url`,
 *   `reported_at`, `comments`, `prediction`, `prob` and `has_actions`.
 * @param rawUrlPatterns Pattern rows carrying `url_pattern`, `bug`, `title`.
 * @param probabilityThreshold Reports predicted "invalid" are only kept when
 *   their `prob` is strictly below this value.
 * @param logger Receives a `verbose` message for every processing step.
 * @returns A JSON string of `{root_domain, reports_count, reports}` objects
 *   sorted by `reports_count` in descending order.
 */
export function transformUserReports(
  rawReports: any[],
  rawUrlPatterns: any[],
  probabilityThreshold: number,
  logger: Logger,
): string {
  // Wildcard match with substring semantics: every literal segment has to
  // show up in the URL, in order, each one after the end of the previous one.
  // For a pattern without any "*" this is exactly `url.includes(pattern)`.
  const matchesWildcardSegments = (url: string, segments: string[]): boolean => {
    let searchFrom = 0;
    for (const segment of segments) {
      const foundAt = url.indexOf(segment, searchFrom);
      if (foundAt === -1) {
        return false;
      }
      searchFrom = foundAt + segment.length;
    }
    return true;
  };

  logger.verbose("Pre-processing URL patterns...");
  const preprocessedUrlPatterns = rawUrlPatterns.map((pattern: UrlPattern) => ({
    bug: pattern.bug,
    title: pattern.title,
    // Split on *every* wildcard. `replace("*", "")` would only remove the
    // first one and leave literal "*" characters that never match a URL.
    segments: pattern.url_pattern.split("*"),
  }));

  logger.verbose("Pre-processing user reports...");
  const preprocessedReports = rawReports
    .filter((report: UserReport) => {
      // [ToDo] some reports currently don't have a URL attached. This breaks
      // all kinds of assumptions here, so let's remove them for now. Tom is
      // investigating why this happens.
      return !!report.url;
    })
    .map((report: UserReport) => {
      const newReport = Object.assign({}, report);

      // For some reason, the reported_at as it comes out of the database is
      // actually an object {value: "[timestamp]"}. Unwrap that shape, but
      // pass anything else (null, a plain timestamp) through untouched so a
      // single odd row cannot crash the whole transformation.
      // [ToDo] figure out why, and if this is something that could change
      const rawReportedAt = (report as any).reported_at;
      newReport.reported_at =
        rawReportedAt !== null && typeof rawReportedAt === "object" && "value" in rawReportedAt
          ? rawReportedAt.value
          : rawReportedAt;

      newReport.related_bugs = preprocessedUrlPatterns
        .filter((pattern) => matchesWildcardSegments(report.url, pattern.segments))
        .map((pattern) => ({
          number: pattern.bug,
          title: pattern.title,
        }));

      return newReport;
    });

  logger.verbose("Grouping reports by root domain...");
  // Memoized because many reports share the same hostname.
  const normalizeHostname = _.memoize((hostname: string) => {
    if (isIP(hostname)) {
      return hostname;
    }

    const parsedDomain = psl.parse(hostname);
    return (parsedDomain as psl.ParsedDomain).domain || "[unknown]";
  });
  const groupedByDomainDict = _.groupBy(preprocessedReports, (report) => {
    try {
      const parsedUrl = new URL(report.url);
      return normalizeHostname(parsedUrl.hostname);
    } catch {
      // `new URL` throws on strings it cannot parse; bucket those together.
      return "[unknown]";
    }
  });

  logger.verbose("Transforming grouped Dictionary into Object");
  const groupedByDomain = Object.entries(groupedByDomainDict).map(([root_domain, reports]) => {
    const reportSubset = reports
      // First, let's filter out all the reports we don't want to triage:
      //   - reports without a comment
      //   - anything labeled as invalid by our ML model with a probability
      //     at or above the threshold
      .filter(
        (report) =>
          !!report.comments &&
          (report.prediction === "valid" || (report.prediction === "invalid" && report.prob < probabilityThreshold)),
      )
      // Then, slice the first 10 reports out, then remove all reports that
      // have been actioned upon. We do it in this order to make sure that
      // there won't be a new set of 10 issues after all of them have been
      // worked on.
      .slice(0, 10)
      .filter((report) => !report.has_actions);

    return {
      root_domain,
      // Note: this is the count of *all* reports, even ones we filtered out.
      reports_count: reports.length,
      reports: reportSubset,
    };
  });

  logger.verbose("Sorting by the total number of reports per domain...");
  const sorted = groupedByDomain.sort((a, b) => b.reports_count - a.reports_count);

  logger.verbose("Writing response...");
  return JSON.stringify(sorted);
}