task_processor.py

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at https://mozilla.org/MPL/2.0/.
"""Locate and load artifacts stored on disk for a task group.

The functions here expect a layout of the form
``<path>/<task_group_id>/<run_number>/<suite>/<data directory>/...``.
"""
import os
import re
import glob
import json

TESTING = 0
SILENT = False

# Splits a string into alternating non-digit / digit runs for natural sorting.
_DIGIT_RUNS = re.compile(r"([0-9]+)")


def log(msg):
    """Print *msg* unless the module-level SILENT flag is set."""
    if not SILENT:
        print(msg)


def pattern_match(name, artifacts_to_get):
    """Match an artifact that was requested with the name we have.

    Returns the first entry of *artifacts_to_get* that is a substring of
    *name*, or None when nothing matches or nothing was requested.
    """
    if not artifacts_to_get:
        return None
    return next((aname for aname in artifacts_to_get if aname in name), None)


def _natural_key(text):
    """Return a sort key in which digit runs compare as integers."""
    return [
        int(chunk) if chunk.isdigit() else chunk for chunk in _DIGIT_RUNS.split(text)
    ]


def sorted_nicely(data):
    """Sort the given iterable of strings in the way that humans expect.

    Digit runs are compared numerically, so "run2" sorts before "run10".
    """
    return sorted(data, key=_natural_key)


def match_vismets_with_videos(task_group_id, path, vismet_task_ids):
    """Return a mapping from vismet task IDs to the tasks holding their videos.

    Reads ``task-group-information.json`` from the task group's directory.
    For every task whose ID is in *vismet_task_ids*, the ``MOZ_FETCHES``
    environment entry is parsed and the ``task`` of the first fetch whose
    artifact name contains "browsertime-results" is recorded. IDs with no
    such fetch (or that are absent from the file) map to None.
    """
    task_dir = os.path.join(path, task_group_id)
    taskgraph_json = os.path.join(task_dir, "task-group-information.json")

    with open(taskgraph_json) as f:
        taskgraph = json.load(f)

    # First filter down to only browsertime tasks
    mapping = {task_id: None for task_id in vismet_task_ids}
    for task in taskgraph:
        task_id = task.get("status", {}).get("taskId", "")
        if task_id not in mapping:
            continue

        vismet_fetches = json.loads(task["task"]["payload"]["env"]["MOZ_FETCHES"])
        for fetch in vismet_fetches:
            if "browsertime-results" in fetch["artifact"]:
                mapping[task_id] = fetch["task"]
                break

        # Stop early only once every requested task has been resolved. The
        # check must look at the values: the keys are always truthy, so
        # testing them would end the scan after the first matched task.
        if None not in mapping.values():
            break

    return mapping


def _subdirectories(parent):
    """Return the names of the directories directly inside *parent*."""
    return [
        entry
        for entry in os.listdir(parent)
        if os.path.isdir(os.path.join(parent, entry))
    ]


def _latest_run_number(task_dir):
    """Return the highest numeric sub-directory name in *task_dir* (0 if none).

    Non-numeric directory names are ignored instead of aborting the lookup.
    """
    run_numbers = [
        int(subdir) for subdir in _subdirectories(task_dir) if subdir.isdecimal()
    ]
    return max(run_numbers, default=0)


def get_task_data_paths(
    task_group_id,
    path,
    run_number=None,
    artifact=None,
    artifact_dir="",
    suite_matcher="",
    silent=False,
):
    """Open a folder for a task group and return the files contained within it.

    :param task_group_id: name of the task group directory inside *path*.
    :param path: directory that contains the task group directories.
    :param run_number: run sub-directory to read; when None, the highest
        numeric sub-directory is used.
    :param artifact: artifact name, or list/tuple of names, to look for.
        None is treated as "no artifacts requested".
    :param artifact_dir: see the review note in the body; it currently only
        influences the ``_data`` directory fallback.
    :param suite_matcher: when set, only suites whose directory name contains
        this substring are searched.
    :param silent: stored in the module-level SILENT flag to mute logging.
    :return: dict mapping suite name to a naturally sorted list of matching
        paths, or None when the task group directory does not exist.
    """
    global SILENT
    SILENT = silent

    if artifact is None:
        artifact = []
    elif not isinstance(artifact, (list, tuple)):
        artifact = [artifact]

    # Get the directory to search
    task_dir = os.path.join(path, task_group_id)
    if not os.path.exists(task_dir):
        log("Cannot open task directory: %s" % task_dir)
        return None

    if run_number is None:
        # Inspect the directory directly rather than chdir-ing into it, so a
        # failure here can never leave the process in another working dir.
        run_number = _latest_run_number(task_dir)
        log("No run number supplied. Using the latest one, run number %s" % run_number)

    run_dir = os.path.join(task_dir, str(run_number))

    data = {}

    # Find all the data for this task group
    for suite in _subdirectories(run_dir):
        if suite_matcher and suite_matcher not in suite:
            continue
        suite_dir = os.path.join(run_dir, suite)

        for aname in artifact:
            # Get the suite's data directory.
            # NOTE(review): artifact_dir is never matched against directory
            # names; it only gates the "_data" fallback below, which can
            # apply only while both artifact_dir and the artifact name are
            # empty. Kept as in the original -- confirm the intended use.
            if not artifact_dir:
                artifact_dir = aname

            suite_data_dir = None
            for d in _subdirectories(suite_dir):
                if pattern_match(d, [aname]) or (
                    not artifact_dir and d.endswith("_data")
                ):
                    suite_data_dir = os.path.join(suite_dir, d)
                    break

            if not suite_data_dir:
                log("Cannot find data directory in %s, skipping" % suite_dir)
                continue

            # Now find all data files and order them
            candidates = glob.glob(os.path.join(suite_data_dir, "**/*"), recursive=True)
            matching = [
                found
                for found in candidates
                if pattern_match(os.path.split(found)[-1], [aname])
            ]

            data[suite] = sorted_nicely(data.get(suite, []) + matching)

    return data


def get_task_data(
    task_group_id, path, run_number=None, artifact="", suite_matcher="", silent=False
):
    """Get the task data paths and open the data in a detected file format.

    Files ending in ".json" are parsed as JSON. By default, when an unknown
    file format is encountered, the lines will be read and returned.

    NOTE(review): an empty-string artifact makes pattern_match return "",
    which is falsy, so no file names pass the filter in
    get_task_data_paths -- callers presumably always pass an artifact name.

    :return: dict mapping suite name to a list of
        ``{"data": <contents>, "file": <path>}`` entries; an empty dict when
        the task group directory cannot be found.
    """
    global SILENT
    SILENT = silent

    data_paths = get_task_data_paths(
        task_group_id,
        path,
        run_number=run_number,
        artifact=artifact,
        suite_matcher=suite_matcher,
        silent=silent,
    )
    if not data_paths:
        # The task directory was missing (None) or nothing was found.
        return {}

    data = {}
    for suite, paths in data_paths.items():
        data[suite] = []
        for file_path in paths:
            if os.path.isdir(file_path):
                # The recursive glob can also yield directories whose name
                # matches the artifact; they cannot be opened as files.
                log("Skipping directory %s" % file_path)
                continue

            log("Opening %s..." % file_path)
            with open(file_path, "r") as f:
                if file_path.endswith(".json"):
                    tmpdata = json.load(f)
                else:
                    tmpdata = f.readlines()

            data[suite].append({"data": tmpdata, "file": file_path})

    return data


if __name__ == "__main__":
    if TESTING:
        data = get_task_data_paths(
            "SssyewAFQiKm40PIouxo_g",
            "/home/sparky/mozilla-source/analysis-scripts/perfunct-testing-data",
            artifact="perfherder-data",
            run_number="4",
        )
        print(json.dumps(data, indent=4))

        data = get_task_data(
            "SssyewAFQiKm40PIouxo_g",
            "/home/sparky/mozilla-source/analysis-scripts/perfunct-testing-data",
            artifact="perfherder-data",
            run_number="4",
        )

task_processor.py (148 lines of code) (raw):