def process_submission()

in extras/plagiarism_detector/analyze.py [0:0]


def process_submission(submission, dolos_dir, timeout=500) -> tuple[str, list]:
    """
    Process a single submission for plagiarism detection.

    The submission's Python files are compared against the downloaded kernels of
    its competition by running `node dolos_wrapper.mjs --files ...` inside
    `dolos_dir`, and the JSON printed on stdout is handed to `process_dolos_report`.

    Args:
        submission (dict): A dictionary containing submission details, including 'competition_id' and 'code_path'.
            'code_path' may be a single `.py` file or a directory that is searched recursively for `.py` files.
        dolos_dir (str): The directory containing the dolos script (used as the working directory of the subprocess).
        timeout (int): Timeout for the plagiarism detection process in seconds.

    Returns:
        tuple: `(code_path, plagiarism_report)` on success, where `code_path` is the resolved
        `pathlib.Path` of the submission and `plagiarism_report` is whatever `process_dolos_report`
        returns. `(None, None)` is returned when the submission has no 'code_path', the path does
        not exist, the detector exits with a non-zero status, its output is not valid JSON, or it
        times out.

    Raises:
        ValueError: If the competition's kernels directory is missing or contains no `.py` files.
        AssertionError: If the submission path exists but yields no Python files.
    """
    competition_id = submission["competition_id"]
    code_path = submission.get("code_path")

    if code_path is None:
        logger.warning(f"No code_path found for submission in competition {competition_id}")
        return None, None

    kernels_dir = registry.get_competitions_dir() / competition_id / "kernels"

    if not kernels_dir.exists() or not any(f.suffix == ".py" for f in kernels_dir.iterdir()):
        raise ValueError(
            f"Kernels for competition {competition_id} not found. Run `mlebench download-kernels -c {competition_id}` or `mlebench download-kernels --all` to download."
        )

    kernel_files = [str(f.resolve()) for f in kernels_dir.glob("*.py")]
    assert kernel_files, f"No kernel files found for competition {competition_id}"

    code_path = Path(code_path).resolve()

    # Check existence before collecting files: a missing path must take the
    # graceful "(None, None)" route rather than tripping the assertion below,
    # and must honour the same tuple contract as every other failure path.
    if not code_path.exists():
        logger.warning(f"Submission file not found: {code_path}")
        return None, None

    if code_path.is_dir():
        agent_files = list(code_path.rglob("*.py"))
        logger.info(f"Found {len(agent_files)} Python files in {code_path}")
    else:
        agent_files = [code_path] if code_path.suffix == ".py" else []

    assert agent_files, f"No Python files found for submission {code_path}"

    agent_files = [str(f) for f in agent_files]
    logger.info(f"Total agent files to process: {len(agent_files)}")

    # Combine all files for comparison
    all_files = kernel_files + agent_files

    logger.info(f"Comparing {len(all_files)} files")
    logger.info(f"Agent files: {agent_files}")
    logger.info(f"Kernel files: {kernel_files}")

    # Run the plagiarism detector with a timeout
    cmd = ["node", "dolos_wrapper.mjs", "--files"] + all_files
    start_time = time.time()
    try:
        result = subprocess.run(cmd, cwd=dolos_dir, capture_output=True, text=True, timeout=timeout)

        if result.returncode == 0:
            logger.info(f"Plagiarism detection completed successfully for {code_path}")
            try:
                dolos_report = json.loads(result.stdout)
                plagiarism_report = process_dolos_report(dolos_report, code_path.parent)
                return code_path, plagiarism_report
            except json.JSONDecodeError:
                logger.error(f"Error parsing plagiarism detection output for {code_path}")
                logger.error(result.stdout)
        else:
            logger.error(f"Error running plagiarism detection for {code_path}:")
            logger.error(result.stdout)
            logger.error(result.stderr)

    # Qualified name: `subprocess` is already required by the call above, so this
    # does not depend on a separate `from subprocess import TimeoutExpired`.
    except subprocess.TimeoutExpired:
        elapsed_time = time.time() - start_time
        logger.warning(
            f"Plagiarism detection timed out after {elapsed_time:.2f} seconds for {code_path}"
        )

    # Every failure path that was logged above ends here.
    return None, None