in src/datatrove/tools/failed_logs.py [0:0]
def main():
    """
    Display the log files of the tasks that did not complete.

    Takes a `logging_dir` as input (`args.path`), reads the total number of tasks
    (`world_size`) from `executor.json`, and determines which ranks are incomplete
    by checking for a `completions/{rank:05d}` marker for every rank in
    `range(world_size)`. The log files under `logs` whose rank is incomplete are
    then shown one at a time in a pager, asking for confirmation before every log
    after the first.

    Returns early, after printing an error to the console, when `executor.json`
    is missing or does not hold a usable `world_size`.
    """
    args = parser.parse_args()
    console = Console()
    # NOTE(review): presumably drops the logger's default handlers so that only the
    # console output below is shown -- confirm against the logger implementation.
    logger.remove()

    logging_dir = get_datafolder(args.path)
    if not logging_dir.isfile("executor.json"):
        console.log(
            'Could not find "executor.json" in the given directory. Are you sure it is a logging folder?',
            style="red",
        )
        return

    with logging_dir.open("executor.json", "rt") as f:
        world_size = json.load(f).get("world_size", None)
    # A missing, null or zero world size leaves nothing to enumerate.
    if not world_size:
        console.log("Could not get the total number of tasks, please try relaunching the run.", style="red")
        return
    console.log(f"Found executor config: {world_size} tasks")

    with console.status("Fetching list of incomplete tasks"):
        completed = set(logging_dir.list_files("completions"))
        # A rank is incomplete when its zero-padded completion marker is absent.
        incomplete = {rank for rank in range(world_size) if f"completions/{rank:05d}" not in completed}
    console.log(f"Found {len(incomplete)}/{world_size} incomplete tasks.")

    with console.status("Looking for log files"):
        incomplete_logs = []
        for log_file in logging_dir.list_files("logs"):
            rank_match = RANK_FROM_LOG_FILENAME_REGEX.search(log_file)
            # Entries whose name does not match the rank pattern cannot be attributed to
            # a task; skip them instead of crashing on `None.group(...)`.
            if rank_match is None:
                continue
            if int(rank_match.group(1)) in incomplete:
                incomplete_logs.append(log_file)
    console.log(f"Found {len(incomplete_logs)} log files for incomplete tasks.")

    first = True
    for incomplete_log in incomplete_logs:
        # The first log is shown unconditionally; each following one needs confirmation.
        if not first and not Confirm.ask(f"Show next log ([i]{incomplete_log}[/i])?", default=True):
            break
        with console.pager(), logging_dir.open(incomplete_log, "rt") as f:
            console.print(f.read())
        first = False