def check_doc_files()

in scripts/in_container/run_provider_yaml_files_check.py [0:0]


def check_doc_files(yaml_files: dict[str, dict]) -> tuple[int, int]:
    """Cross-check doc and logo urls configured in provider.yaml files against files on disk.

    Configured urls are collected from every provider in ``yaml_files``:
    ``how-to-guide`` entries of ``integrations`` (a list per integration) and of
    ``transfers`` (a single value per transfer), plus ``logo`` entries of
    ``integrations``. Expected urls are derived from files found under
    ``AIRFLOW_DOCS_PATH``; paths starting with an entry of ``suspended_providers``
    (docs) or ``suspended_logos`` (logos) are left out of the expected sets.

    The doc and logo comparisons run independently, so a mismatch in the doc urls
    does not hide a mismatch in the logo urls. Each failing comparison appends one
    message to the ``errors`` list defined outside this function.

    :param yaml_files: mapping whose values are parsed provider.yaml contents
        (the keys are not used here).
    :return: ``(num_docs, num_errors)`` - the number of configured doc urls that
        were checked and the number of comparisons that failed.
    """
    num_errors = 0
    current_doc_urls: list[str] = []
    current_logo_urls: list[str] = []
    for provider in yaml_files.values():
        if "integrations" in provider:
            current_doc_urls.extend(
                guide
                for guides in provider["integrations"]
                if "how-to-guide" in guides
                for guide in guides["how-to-guide"]
            )
            current_logo_urls.extend(
                integration["logo"] for integration in provider["integrations"] if "logo" in integration
            )
        if "transfers" in provider:
            current_doc_urls.extend(
                op["how-to-guide"] for op in provider["transfers"] if "how-to-guide" in op
            )
    # Report how many configured doc urls are checked (this counter used to stay at 0).
    num_docs = len(current_doc_urls)

    if suspended_providers:
        console.print("[yellow]Suspended/Removed providers:[/]")
        console.print(suspended_providers)

    # str.startswith() needs a tuple of prefixes; build it once instead of per file.
    suspended_provider_prefixes = tuple(suspended_providers)

    guide_files = itertools.chain(
        AIRFLOW_DOCS_PATH.glob("apache-airflow-providers-*/operators/**/*.rst"),
        AIRFLOW_DOCS_PATH.glob("apache-airflow-providers-*/transfer/**/*.rst"),
    )
    expected_doc_urls: set[str] = set()
    for doc_file in guide_files:
        # Index pages and anything below a "_partials" directory are not expected as guides.
        if doc_file.name == "index.rst" or "_partials" in doc_file.parts:
            continue
        relative_doc = doc_file.relative_to(AIRFLOW_DOCS_PATH).as_posix()
        if not relative_doc.startswith(suspended_provider_prefixes):
            expected_doc_urls.add(f"/docs/{relative_doc}")
    # A provider may also keep a single top-level operators.rst instead of a directory.
    for doc_file in AIRFLOW_DOCS_PATH.glob("apache-airflow-providers-*/operators.rst"):
        relative_doc = doc_file.relative_to(AIRFLOW_DOCS_PATH).as_posix()
        if not relative_doc.startswith(suspended_provider_prefixes):
            expected_doc_urls.add(f"/docs/{relative_doc}")

    if suspended_logos:
        console.print("[yellow]Suspended logos:[/]")
        console.print(suspended_logos)
        console.print()
    suspended_logo_prefixes = tuple(suspended_logos)

    expected_logo_urls: set[str] = set()
    for logo_file in (AIRFLOW_DOCS_PATH / "integration-logos").rglob("*"):
        if not logo_file.is_file():
            continue
        logo_url = f"/{logo_file.relative_to(AIRFLOW_DOCS_PATH).as_posix()}"
        if not logo_url.startswith(suspended_logo_prefixes):
            expected_logo_urls.add(logo_url)

    # (heading, noun, expected set, expected description, configured urls, configured description)
    comparisons = (
        (
            "Checking document urls",
            "doc",
            expected_doc_urls,
            "Document urls found in airflow/docs",
            current_doc_urls,
            "Document urls configured in provider.yaml files",
        ),
        (
            "Checking logo urls",
            "logo",
            expected_logo_urls,
            "Logo urls found in airflow/docs/integration-logos",
            current_logo_urls,
            "Logo urls configured in provider.yaml files",
        ),
    )
    for heading, noun, expected_urls, expected_desc, configured_urls, configured_desc in comparisons:
        console.print(heading)
        # Each comparison has its own try so that one failure does not skip the other.
        try:
            assert_sets_equal(expected_urls, expected_desc, set(configured_urls), configured_desc)
        except AssertionError as ex:
            nested_error = textwrap.indent(str(ex), "  ")
            errors.append(
                "Discrepancies between documentation/logos for providers and provider.yaml files "
                f"[yellow]How to fix it[/]: Please synchronize the docs/logos.\n{nested_error}"
            )
            num_errors += 1
        else:
            console.print(f"Checked {len(configured_urls)} {noun} urls")
            console.print()
    return num_docs, num_errors