#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from functools import total_ordering
import itertools
import os
import re

all_modules = []


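# Modules are ordered and hashed by name; @total_ordering derives the remaining
# rich-comparison methods from the __lt__ and __eq__ defined below.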
@total_ordering
class Module(object):
    """
    A module is the basic abstraction in our test runner script. Each module consists of a set
    of source files, a set of test commands, and a set of dependencies on other modules. We use
    modules to define a dependency graph that lets us determine which tests to run based on which
    files have changed.
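
    For example, a hypothetical module (the name and regexes here are illustrative
    only) could be declared as::

        example = Module(
            name="example",
            dependencies=[],
            source_file_regexes=["example/"],
            sbt_test_goals=["example/test"],
        )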
    """

    def __init__(
        self,
        name,
        dependencies,
        source_file_regexes,
        build_profile_flags=(),
        environ=None,
        sbt_test_goals=(),
        python_test_goals=(),
        excluded_python_implementations=(),
        test_tags=(),
        should_run_r_tests=False,
        should_run_build_tests=False,
    ):
        """
        Define a new module.

        :param name: A short module name, for display in logging and error messages.
        :param dependencies: A set of dependencies for this module. This should only include direct
            dependencies; transitive dependencies are resolved automatically.
        :param source_file_regexes: A set of regexes that match source files belonging to this
            module. Each regex is matched against the beginning of the filename string (via
            ``re.match``), so plain path prefixes work as expected.
        :param build_profile_flags: A set of profile flags that should be passed to Maven or SBT in
            order to build and test this module (e.g. '-PprofileName').
        :param environ: A dict of environment variables that should be set when files in this
            module are changed.
        :param sbt_test_goals: A set of SBT test goals for testing this module.
        :param python_test_goals: A set of Python test goals for testing this module.
        :param excluded_python_implementations: A set of Python implementations that are not
            supported by this module's Python components. The values in this set should match
            strings returned by Python's `platform.python_implementation()`.
        :param test_tags: A set of tags that will be excluded when running unit tests if the
            module is not explicitly changed.
        :param should_run_r_tests: If True, changes in this module will trigger all R tests.
        :param should_run_build_tests: If True, changes in this module will trigger build tests.
        """
        self.name = name
        self.dependencies = dependencies
        self.source_file_prefixes = source_file_regexes
        self.sbt_test_goals = sbt_test_goals
        self.build_profile_flags = build_profile_flags
        self.environ = environ or {}
        self.python_test_goals = python_test_goals
        self.excluded_python_implementations = excluded_python_implementations
        self.test_tags = test_tags
        self.should_run_r_tests = should_run_r_tests
        self.should_run_build_tests = should_run_build_tests

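        # Wire up the reverse edges of the dependency graph: each direct dependency
        # records this module as a dependent, and the module is added to the global
        # registry used by the test runner.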
        self.dependent_modules = set()
        for dep in dependencies:
            dep.dependent_modules.add(self)
        all_modules.append(self)

    def contains_file(self, filename):
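        """
        Return True if ``filename`` is matched by one of this module's source file regexes.
        ``re.match`` anchors at the start of the string, so each regex effectively acts as a
        path prefix (hence the ``source_file_prefixes`` attribute name).
        """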
        return any(re.match(p, filename) for p in self.source_file_prefixes)

    def __repr__(self):
        return "Module<%s>" % self.name

    def __lt__(self, other):
        return self.name < other.name

    def __eq__(self, other):
        return self.name == other.name

    def __ne__(self, other):
        return not (self.name == other.name)

    def __hash__(self):
        return hash(self.name)


tags = Module(
    name="tags",
    dependencies=[],
    source_file_regexes=[
        "common/tags/",
    ],
)

utils = Module(
    name="utils",
    dependencies=[tags],
    source_file_regexes=[
        "common/utils/",
    ],
    sbt_test_goals=[
        "common-utils/test",
    ],
)

kvstore = Module(
    name="kvstore",
    dependencies=[tags],
    source_file_regexes=[
        "common/kvstore/",
    ],
    sbt_test_goals=[
        "kvstore/test",
    ],
)

network_common = Module(
    name="network-common",
    dependencies=[tags, utils],
    source_file_regexes=[
        "common/network-common/",
    ],
    sbt_test_goals=[
        "network-common/test",
    ],
)

network_shuffle = Module(
    name="network-shuffle",
    dependencies=[tags],
    source_file_regexes=[
        "common/network-shuffle/",
    ],
    sbt_test_goals=[
        "network-shuffle/test",
    ],
)

unsafe = Module(
    name="unsafe",
    dependencies=[tags, utils],
    source_file_regexes=[
        "common/unsafe",
    ],
    sbt_test_goals=[
        "unsafe/test",
    ],
)

launcher = Module(
    name="launcher",
    dependencies=[tags],
    source_file_regexes=[
        "launcher/",
    ],
    sbt_test_goals=[
        "launcher/test",
    ],
)

sketch = Module(
    name="sketch",
    dependencies=[tags],
    source_file_regexes=[
        "common/sketch/",
    ],
    sbt_test_goals=["sketch/test"],
)

variant = Module(
    name="variant",
    dependencies=[tags],
    source_file_regexes=[
        "common/variant/",
    ],
    sbt_test_goals=["variant/test"],
)

core = Module(
    name="core",
    dependencies=[kvstore, network_common, network_shuffle, unsafe, launcher, utils],
    source_file_regexes=[
        "core/",
    ],
    sbt_test_goals=[
        "core/test",
    ],
)

api = Module(
    name="api",
    dependencies=[utils, unsafe],
    source_file_regexes=[
        "sql/api/",
    ],
)

catalyst = Module(
    name="catalyst",
    dependencies=[tags, sketch, variant, core, api],
    source_file_regexes=[
        "sql/catalyst/",
    ],
    sbt_test_goals=[
        "catalyst/test",
    ],
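    # Docker-based integration tests are enabled only when running under GitHub Actions.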
    environ=None
    if "GITHUB_ACTIONS" not in os.environ
    else {"ENABLE_DOCKER_INTEGRATION_TESTS": "1"},
)

sql = Module(
    name="sql",
    dependencies=[catalyst],
    source_file_regexes=[
        "sql/core/",
        "python/pyspark/sql/worker/",  # analyze_udtf is invoked and tested in JVM
    ],
    sbt_test_goals=[
        "sql/test",
    ],
    environ=None
    if "GITHUB_ACTIONS" not in os.environ
    else {"ENABLE_DOCKER_INTEGRATION_TESTS": "1"},
)

hive = Module(
    name="hive",
    dependencies=[sql],
    source_file_regexes=[
        "sql/hive/",
        "bin/spark-sql",
    ],
    build_profile_flags=[
        "-Phive",
    ],
    sbt_test_goals=[
        "hive/test",
    ],
    test_tags=["org.apache.spark.tags.ExtendedHiveTest"],
)

repl = Module(
    name="repl",
    dependencies=[hive],
    source_file_regexes=[
        "repl/",
    ],
    sbt_test_goals=[
        "repl/test",
    ],
)

hive_thriftserver = Module(
    name="hive-thriftserver",
    dependencies=[hive],
    source_file_regexes=[
        "sql/hive-thriftserver",
        "sbin/start-thriftserver.sh",
    ],
    build_profile_flags=[
        "-Phive-thriftserver",
    ],
    sbt_test_goals=[
        "hive-thriftserver/test",
    ],
)

avro = Module(
    name="avro",
    dependencies=[sql],
    source_file_regexes=[
        "connector/avro",
    ],
    sbt_test_goals=[
        "avro/test",
    ],
)

sql_kafka = Module(
    name="sql-kafka-0-10",
    dependencies=[sql],
    source_file_regexes=[
        "connector/kafka-0-10-sql",
    ],
    sbt_test_goals=[
        "sql-kafka-0-10/test",
    ],
)

profiler = Module(
    name="profiler",
    dependencies=[],
    build_profile_flags=["-Pjvm-profiler"],
    source_file_regexes=[
        "connector/profiler",
    ],
)

protobuf = Module(
    name="protobuf",
    dependencies=[sql],
    source_file_regexes=[
        "connector/protobuf",
    ],
    sbt_test_goals=[
        "protobuf/test",
    ],
)

graphx = Module(
    name="graphx",
    dependencies=[tags, core],
    source_file_regexes=[
        "graphx/",
    ],
    sbt_test_goals=["graphx/test"],
)

streaming = Module(
    name="streaming",
    dependencies=[tags, core],
    source_file_regexes=[
        "streaming",
    ],
    sbt_test_goals=[
        "streaming/test",
    ],
)


# Keep the dependency list minimal, because changes in other modules should not trigger
# Kinesis tests. Kinesis tests depend on the external Amazon Kinesis service, and we should
# run them only when files in streaming_kinesis_asl are changed, so that if Kinesis
# experiences an outage, we don't fail other PRs.
streaming_kinesis_asl = Module(
    name="streaming-kinesis-asl",
    dependencies=[tags, core],
    source_file_regexes=[
        "connector/kinesis-asl/",
        "connector/kinesis-asl-assembly/",
    ],
    build_profile_flags=[
        "-Pkinesis-asl",
    ],
    environ={"ENABLE_KINESIS_TESTS": "0"},
    sbt_test_goals=[
        "streaming-kinesis-asl/test",
    ],
)


streaming_kafka_0_10 = Module(
    name="streaming-kafka-0-10",
    dependencies=[streaming, core],
    source_file_regexes=[
        # The trailing "/" is necessary; otherwise this regex would also match the
        # "sql-kafka" sources
        "connector/kafka-0-10/",
        "connector/kafka-0-10-assembly",
        "connector/kafka-0-10-token-provider",
    ],
    sbt_test_goals=["streaming-kafka-0-10/test", "token-provider-kafka-0-10/test"],
)


mllib_local = Module(
    name="mllib-local",
    dependencies=[tags, core],
    source_file_regexes=[
        "mllib-local",
    ],
    sbt_test_goals=[
        "mllib-local/test",
    ],
)


mllib = Module(
    name="mllib",
    dependencies=[mllib_local, streaming, sql],
    source_file_regexes=[
        "data/mllib/",
        "mllib/",
    ],
    sbt_test_goals=[
        "mllib/test",
    ],
)

connect = Module(
    name="connect",
    dependencies=[hive, avro, protobuf, mllib],
    source_file_regexes=[
        "sql/connect",
    ],
    sbt_test_goals=[
        "connect/test",
        "connect-client-jvm/test",
    ],
)

examples = Module(
    name="examples",
    dependencies=[graphx, mllib, streaming, hive],
    source_file_regexes=[
        "examples/",
    ],
    sbt_test_goals=[
        "examples/test",
    ],
)

pyspark_core = Module(
    name="pyspark-core",
    dependencies=[core],
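    # Everything under python/ except the subpackages owned by the other pyspark-* modules
    # (expressed with a negative lookahead).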
    source_file_regexes=["python/(?!pyspark/(ml|mllib|sql|streaming|pandas|resource|testing))"],
    python_test_goals=[
        # doctests
        "pyspark.conf",
        "pyspark.core.rdd",
        "pyspark.core.context",
        "pyspark.core.broadcast",
        "pyspark.accumulators",
        "pyspark.core.files",
        "pyspark.serializers",
        "pyspark.profiler",
        "pyspark.shuffle",
        "pyspark.taskcontext",
        "pyspark.util",
        # unittests
        "pyspark.tests.test_appsubmit",
        "pyspark.tests.test_broadcast",
        "pyspark.tests.test_conf",
        "pyspark.tests.test_context",
        "pyspark.tests.test_daemon",
        "pyspark.tests.test_install_spark",
        "pyspark.tests.test_join",
        "pyspark.tests.test_memory_profiler",
        "pyspark.tests.test_profiler",
        "pyspark.tests.test_rdd",
        "pyspark.tests.test_rddbarrier",
        "pyspark.tests.test_rddsampler",
        "pyspark.tests.test_readwrite",
        "pyspark.tests.test_serializers",
        "pyspark.tests.test_shuffle",
        "pyspark.tests.test_statcounter",
        "pyspark.tests.test_taskcontext",
        "pyspark.tests.test_util",
        "pyspark.tests.test_worker",
        "pyspark.tests.test_stage_sched",
    ],
)

pyspark_sql = Module(
    name="pyspark-sql",
    dependencies=[pyspark_core, hive, avro, protobuf],
    source_file_regexes=["python/pyspark/sql"],
    python_test_goals=[
        # doctests
        "pyspark.sql.types",
        "pyspark.sql.context",
        "pyspark.sql.session",
        "pyspark.sql.conf",
        "pyspark.sql.catalog",
        "pyspark.sql.classic.column",
        "pyspark.sql.classic.dataframe",
        "pyspark.sql.classic.window",
        "pyspark.sql.datasource",
        "pyspark.sql.group",
        "pyspark.sql.functions.builtin",
        "pyspark.sql.functions.partitioning",
        "pyspark.sql.merge",
        "pyspark.sql.readwriter",
        "pyspark.sql.streaming.query",
        "pyspark.sql.streaming.readwriter",
        "pyspark.sql.streaming.listener",
        "pyspark.sql.udf",
        "pyspark.sql.udtf",
        "pyspark.sql.avro.functions",
        "pyspark.sql.protobuf.functions",
        "pyspark.sql.pandas.conversion",
        "pyspark.sql.pandas.map_ops",
        "pyspark.sql.pandas.group_ops",
        "pyspark.sql.pandas.types",
        "pyspark.sql.pandas.serializers",
        "pyspark.sql.pandas.typehints",
        "pyspark.sql.pandas.utils",
        "pyspark.sql.observation",
        "pyspark.sql.tvf",
        # unittests
        "pyspark.sql.tests.test_artifact",
        "pyspark.sql.tests.test_catalog",
        "pyspark.sql.tests.test_column",
        "pyspark.sql.tests.test_conf",
        "pyspark.sql.tests.test_context",
        "pyspark.sql.tests.test_dataframe",
        "pyspark.sql.tests.test_collection",
        "pyspark.sql.tests.test_creation",
        "pyspark.sql.tests.test_listener",
        "pyspark.sql.tests.test_observation",
        "pyspark.sql.tests.test_repartition",
        "pyspark.sql.tests.test_stat",
        "pyspark.sql.tests.test_datasources",
        "pyspark.sql.tests.test_errors",
        "pyspark.sql.tests.test_functions",
        "pyspark.sql.tests.test_group",
        "pyspark.sql.tests.test_sql",
        "pyspark.sql.tests.arrow.test_arrow",
        "pyspark.sql.tests.arrow.test_arrow_map",
        "pyspark.sql.tests.arrow.test_arrow_cogrouped_map",
        "pyspark.sql.tests.arrow.test_arrow_grouped_map",
        "pyspark.sql.tests.arrow.test_arrow_python_udf",
        "pyspark.sql.tests.pandas.test_pandas_cogrouped_map",
        "pyspark.sql.tests.pandas.test_pandas_grouped_map",
        "pyspark.sql.tests.pandas.test_pandas_grouped_map_with_state",
        "pyspark.sql.tests.pandas.test_pandas_map",
        "pyspark.sql.tests.pandas.test_pandas_transform_with_state",
        "pyspark.sql.tests.pandas.test_pandas_udf",
        "pyspark.sql.tests.pandas.test_pandas_udf_grouped_agg",
        "pyspark.sql.tests.pandas.test_pandas_udf_scalar",
        "pyspark.sql.tests.pandas.test_pandas_udf_typehints",
        "pyspark.sql.tests.pandas.test_pandas_udf_typehints_with_future_annotations",
        "pyspark.sql.tests.pandas.test_pandas_udf_window",
        "pyspark.sql.tests.pandas.test_pandas_sqlmetrics",
        "pyspark.sql.tests.pandas.test_converter",
        "pyspark.sql.tests.test_python_datasource",
        "pyspark.sql.tests.test_python_streaming_datasource",
        "pyspark.sql.tests.test_readwriter",
        "pyspark.sql.tests.test_serde",
        "pyspark.sql.tests.test_session",
        "pyspark.sql.tests.streaming.test_streaming",
        "pyspark.sql.tests.streaming.test_streaming_foreach",
        "pyspark.sql.tests.streaming.test_streaming_foreach_batch",
        "pyspark.sql.tests.streaming.test_streaming_listener",
        "pyspark.sql.tests.test_subquery",
        "pyspark.sql.tests.test_types",
        "pyspark.sql.tests.test_udf",
        "pyspark.sql.tests.test_udf_profiler",
        "pyspark.sql.tests.test_udtf",
        "pyspark.sql.tests.test_tvf",
        "pyspark.sql.tests.test_utils",
        "pyspark.sql.tests.test_resources",
        "pyspark.sql.tests.plot.test_frame_plot",
        "pyspark.sql.tests.plot.test_frame_plot_plotly",
        "pyspark.sql.tests.test_connect_compatibility",
    ],
)

pyspark_testing = Module(
    name="pyspark-testing",
    dependencies=[pyspark_core, pyspark_sql],
    source_file_regexes=["python/pyspark/testing"],
    python_test_goals=[
        # doctests
        "pyspark.testing.utils",
        "pyspark.testing.pandasutils",
    ],
)

pyspark_resource = Module(
    name="pyspark-resource",
    dependencies=[pyspark_core],
    source_file_regexes=["python/pyspark/resource"],
    python_test_goals=[
        # doctests
        "pyspark.resource.profile",
        # unittests
        "pyspark.resource.tests.test_resources",
        "pyspark.resource.tests.test_connect_resources",
    ],
)


pyspark_streaming = Module(
    name="pyspark-streaming",
    dependencies=[pyspark_core, streaming, streaming_kinesis_asl],
    source_file_regexes=["python/pyspark/streaming"],
    python_test_goals=[
        # doctests
        "pyspark.streaming.util",
        # unittests
        "pyspark.streaming.tests.test_context",
        "pyspark.streaming.tests.test_dstream",
        "pyspark.streaming.tests.test_kinesis",
        "pyspark.streaming.tests.test_listener",
    ],
)


pyspark_mllib = Module(
    name="pyspark-mllib",
    dependencies=[pyspark_core, pyspark_streaming, pyspark_sql, mllib],
    source_file_regexes=["python/pyspark/mllib"],
    python_test_goals=[
        # doctests
        "pyspark.mllib.classification",
        "pyspark.mllib.clustering",
        "pyspark.mllib.evaluation",
        "pyspark.mllib.feature",
        "pyspark.mllib.fpm",
        "pyspark.mllib.linalg.__init__",
        "pyspark.mllib.linalg.distributed",
        "pyspark.mllib.random",
        "pyspark.mllib.recommendation",
        "pyspark.mllib.regression",
        "pyspark.mllib.stat._statistics",
        "pyspark.mllib.stat.KernelDensity",
        "pyspark.mllib.tree",
        "pyspark.mllib.util",
        # unittests
        "pyspark.mllib.tests.test_algorithms",
        "pyspark.mllib.tests.test_feature",
        "pyspark.mllib.tests.test_linalg",
        "pyspark.mllib.tests.test_stat",
        "pyspark.mllib.tests.test_streaming_algorithms",
        "pyspark.mllib.tests.test_util",
    ],
    excluded_python_implementations=[
        "PyPy"  # Skip these tests under PyPy since they require numpy and it isn't available there
    ],
)


pyspark_ml = Module(
    name="pyspark-ml",
    dependencies=[pyspark_core, pyspark_mllib],
    source_file_regexes=["python/pyspark/ml/"],
    python_test_goals=[
        # doctests
        "pyspark.ml.classification",
        "pyspark.ml.clustering",
        "pyspark.ml.evaluation",
        "pyspark.ml.feature",
        "pyspark.ml.fpm",
        "pyspark.ml.functions",
        "pyspark.ml.image",
        "pyspark.ml.linalg.__init__",
        "pyspark.ml.recommendation",
        "pyspark.ml.regression",
        "pyspark.ml.stat",
        "pyspark.ml.tuning",
        # unittests
        "pyspark.ml.tests.test_algorithms",
        "pyspark.ml.tests.test_als",
        "pyspark.ml.tests.test_fpm",
        "pyspark.ml.tests.test_base",
        "pyspark.ml.tests.test_evaluation",
        "pyspark.ml.tests.test_feature",
        "pyspark.ml.tests.test_functions",
        "pyspark.ml.tests.test_image",
        "pyspark.ml.tests.test_linalg",
        "pyspark.ml.tests.test_model_cache",
        "pyspark.ml.tests.test_param",
        "pyspark.ml.tests.test_persistence",
        "pyspark.ml.tests.test_pipeline",
        "pyspark.ml.tests.test_tuning",
        "pyspark.ml.tests.test_ovr",
        "pyspark.ml.tests.test_stat",
        "pyspark.ml.tests.test_training_summary",
        "pyspark.ml.tests.tuning.test_tuning",
        "pyspark.ml.tests.tuning.test_cv_io_basic",
        "pyspark.ml.tests.tuning.test_cv_io_nested",
        "pyspark.ml.tests.tuning.test_cv_io_pipeline",
        "pyspark.ml.tests.tuning.test_tvs_io_basic",
        "pyspark.ml.tests.tuning.test_tvs_io_nested",
        "pyspark.ml.tests.tuning.test_tvs_io_pipeline",
        "pyspark.ml.tests.test_util",
        "pyspark.ml.tests.test_wrapper",
        "pyspark.ml.torch.tests.test_distributor",
        "pyspark.ml.torch.tests.test_log_communication",
        "pyspark.ml.torch.tests.test_data_loader",
        "pyspark.ml.deepspeed.tests.test_deepspeed_distributor",
        "pyspark.ml.tests.connect.test_legacy_mode_summarizer",
        "pyspark.ml.tests.connect.test_legacy_mode_evaluation",
        "pyspark.ml.tests.connect.test_legacy_mode_feature",
        "pyspark.ml.tests.connect.test_legacy_mode_classification",
        "pyspark.ml.tests.connect.test_legacy_mode_pipeline",
        "pyspark.ml.tests.connect.test_legacy_mode_tuning",
        "pyspark.ml.tests.test_classification",
        "pyspark.ml.tests.test_regression",
        "pyspark.ml.tests.test_clustering",
    ],
    excluded_python_implementations=[
        "PyPy"  # Skip these tests under PyPy since they require numpy and it isn't available there
    ],
)

pyspark_pandas = Module(
    name="pyspark-pandas",
    dependencies=[pyspark_core, pyspark_sql],
    source_file_regexes=["python/pyspark/pandas/"],
    python_test_goals=[
        # doctests
        "pyspark.pandas.accessors",
        "pyspark.pandas.base",
        "pyspark.pandas.categorical",
        "pyspark.pandas.config",
        "pyspark.pandas.datetimes",
        "pyspark.pandas.exceptions",
        "pyspark.pandas.extensions",
        "pyspark.pandas.groupby",
        "pyspark.pandas.indexing",
        "pyspark.pandas.internal",
        "pyspark.pandas.mlflow",
        "pyspark.pandas.namespace",
        "pyspark.pandas.numpy_compat",
        "pyspark.pandas.sql_processor",
        "pyspark.pandas.sql_formatter",
        "pyspark.pandas.strings",
        "pyspark.pandas.supported_api_gen",
        "pyspark.pandas.utils",
        "pyspark.pandas.window",
        "pyspark.pandas.indexes.base",
        "pyspark.pandas.indexes.category",
        "pyspark.pandas.indexes.datetimes",
        "pyspark.pandas.indexes.timedelta",
        "pyspark.pandas.indexes.multi",
        "pyspark.pandas.spark.accessors",
        "pyspark.pandas.spark.utils",
        "pyspark.pandas.typedef.typehints",
        # unittests
        "pyspark.pandas.tests.test_categorical",
        "pyspark.pandas.tests.test_config",
        "pyspark.pandas.tests.test_extension",
        "pyspark.pandas.tests.test_frame_spark",
        "pyspark.pandas.tests.test_generic_functions",
        "pyspark.pandas.tests.test_indexops_spark",
        "pyspark.pandas.tests.test_internal",
        "pyspark.pandas.tests.test_namespace",
        "pyspark.pandas.tests.test_numpy_compat",
        "pyspark.pandas.tests.test_repr",
        "pyspark.pandas.tests.test_spark_functions",
        "pyspark.pandas.tests.test_scalars",
        "pyspark.pandas.tests.test_sql",
        "pyspark.pandas.tests.test_typedef",
        "pyspark.pandas.tests.test_utils",
        "pyspark.pandas.tests.computation.test_any_all",
        "pyspark.pandas.tests.computation.test_apply_func",
        "pyspark.pandas.tests.computation.test_binary_ops",
        "pyspark.pandas.tests.computation.test_combine",
        "pyspark.pandas.tests.computation.test_compute",
        "pyspark.pandas.tests.computation.test_corr",
        "pyspark.pandas.tests.computation.test_corrwith",
        "pyspark.pandas.tests.computation.test_cov",
        "pyspark.pandas.tests.computation.test_cumulative",
        "pyspark.pandas.tests.computation.test_describe",
        "pyspark.pandas.tests.computation.test_eval",
        "pyspark.pandas.tests.computation.test_melt",
        "pyspark.pandas.tests.computation.test_missing_data",
        "pyspark.pandas.tests.computation.test_pivot",
        "pyspark.pandas.tests.computation.test_pivot_table",
        "pyspark.pandas.tests.computation.test_pivot_table_adv",
        "pyspark.pandas.tests.computation.test_pivot_table_multi_idx",
        "pyspark.pandas.tests.computation.test_pivot_table_multi_idx_adv",
        "pyspark.pandas.tests.computation.test_stats",
        "pyspark.pandas.tests.data_type_ops.test_as_type",
        "pyspark.pandas.tests.data_type_ops.test_base",
        "pyspark.pandas.tests.data_type_ops.test_binary_ops",
        "pyspark.pandas.tests.data_type_ops.test_boolean_ops",
        "pyspark.pandas.tests.data_type_ops.test_categorical_ops",
        "pyspark.pandas.tests.data_type_ops.test_complex_ops",
        "pyspark.pandas.tests.data_type_ops.test_date_ops",
        "pyspark.pandas.tests.data_type_ops.test_datetime_ops",
        "pyspark.pandas.tests.data_type_ops.test_null_ops",
        "pyspark.pandas.tests.data_type_ops.test_num_ops",
        "pyspark.pandas.tests.data_type_ops.test_num_arithmetic",
        "pyspark.pandas.tests.data_type_ops.test_num_mod",
        "pyspark.pandas.tests.data_type_ops.test_num_mul_div",
        "pyspark.pandas.tests.data_type_ops.test_num_pow",
        "pyspark.pandas.tests.data_type_ops.test_num_reverse",
        "pyspark.pandas.tests.data_type_ops.test_string_ops",
        "pyspark.pandas.tests.data_type_ops.test_udt_ops",
        "pyspark.pandas.tests.data_type_ops.test_timedelta_ops",
        "pyspark.pandas.tests.plot.test_frame_plot",
        "pyspark.pandas.tests.plot.test_frame_plot_matplotlib",
        "pyspark.pandas.tests.plot.test_frame_plot_plotly",
        "pyspark.pandas.tests.plot.test_series_plot",
        "pyspark.pandas.tests.plot.test_series_plot_matplotlib",
        "pyspark.pandas.tests.plot.test_series_plot_plotly",
        "pyspark.pandas.tests.frame.test_interpolate",
        "pyspark.pandas.tests.frame.test_interpolate_error",
        "pyspark.pandas.tests.frame.test_attrs",
        "pyspark.pandas.tests.frame.test_axis",
        "pyspark.pandas.tests.frame.test_constructor",
        "pyspark.pandas.tests.frame.test_conversion",
        "pyspark.pandas.tests.frame.test_reindexing",
        "pyspark.pandas.tests.frame.test_reshaping",
        "pyspark.pandas.tests.frame.test_spark",
        "pyspark.pandas.tests.frame.test_take",
        "pyspark.pandas.tests.frame.test_take_adv",
        "pyspark.pandas.tests.frame.test_time_series",
        "pyspark.pandas.tests.frame.test_truncate",
        "pyspark.pandas.tests.series.test_interpolate",
        "pyspark.pandas.tests.resample.test_on",
        "pyspark.pandas.tests.resample.test_error",
        "pyspark.pandas.tests.resample.test_frame",
        "pyspark.pandas.tests.resample.test_missing",
        "pyspark.pandas.tests.resample.test_series",
        "pyspark.pandas.tests.resample.test_timezone",
        "pyspark.pandas.tests.reshape.test_get_dummies",
        "pyspark.pandas.tests.reshape.test_get_dummies_kwargs",
        "pyspark.pandas.tests.reshape.test_get_dummies_multiindex",
        "pyspark.pandas.tests.reshape.test_get_dummies_object",
        "pyspark.pandas.tests.reshape.test_get_dummies_prefix",
        "pyspark.pandas.tests.reshape.test_merge_asof",
        "pyspark.pandas.tests.window.test_expanding",
        "pyspark.pandas.tests.window.test_expanding_adv",
        "pyspark.pandas.tests.window.test_expanding_error",
        "pyspark.pandas.tests.window.test_groupby_expanding",
        "pyspark.pandas.tests.window.test_groupby_expanding_adv",
        "pyspark.pandas.tests.window.test_ewm_error",
        "pyspark.pandas.tests.window.test_ewm_mean",
        "pyspark.pandas.tests.window.test_groupby_ewm_mean",
        "pyspark.pandas.tests.window.test_missing",
        "pyspark.pandas.tests.window.test_rolling",
        "pyspark.pandas.tests.window.test_rolling_adv",
        "pyspark.pandas.tests.window.test_rolling_count",
        "pyspark.pandas.tests.window.test_rolling_error",
        "pyspark.pandas.tests.window.test_groupby_rolling",
        "pyspark.pandas.tests.window.test_groupby_rolling_adv",
        "pyspark.pandas.tests.window.test_groupby_rolling_count",
        "pyspark.pandas.tests.series.test_datetime",
        "pyspark.pandas.tests.series.test_string_ops_adv",
        "pyspark.pandas.tests.series.test_string_ops_basic",
        "pyspark.pandas.tests.series.test_all_any",
        "pyspark.pandas.tests.series.test_arg_ops",
        "pyspark.pandas.tests.series.test_as_of",
        "pyspark.pandas.tests.series.test_as_type",
        "pyspark.pandas.tests.series.test_compute",
        "pyspark.pandas.tests.series.test_conversion",
        "pyspark.pandas.tests.series.test_cumulative",
        "pyspark.pandas.tests.series.test_index",
        "pyspark.pandas.tests.series.test_missing_data",
        "pyspark.pandas.tests.series.test_series",
        "pyspark.pandas.tests.series.test_sort",
        "pyspark.pandas.tests.series.test_stat",
        "pyspark.pandas.tests.io.test_io",
        "pyspark.pandas.tests.io.test_csv",
        "pyspark.pandas.tests.io.test_feather",
        "pyspark.pandas.tests.io.test_stata",
        "pyspark.pandas.tests.io.test_dataframe_conversion",
        "pyspark.pandas.tests.io.test_dataframe_spark_io",
        "pyspark.pandas.tests.io.test_series_conversion",
        # fallback
        "pyspark.pandas.tests.frame.test_asfreq",
        "pyspark.pandas.tests.frame.test_asof",
    ],
    excluded_python_implementations=[
        "PyPy"  # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and
        # they aren't available there
    ],
)

pyspark_pandas_slow = Module(
    name="pyspark-pandas-slow",
    dependencies=[pyspark_core, pyspark_sql],
    source_file_regexes=["python/pyspark/pandas/"],
    python_test_goals=[
        # doctests
        "pyspark.pandas.frame",
        "pyspark.pandas.generic",
        "pyspark.pandas.series",
        # unittests
        "pyspark.pandas.tests.indexes.test_default",
        "pyspark.pandas.tests.indexes.test_category",
        "pyspark.pandas.tests.indexes.test_timedelta",
        "pyspark.pandas.tests.indexes.test_basic",
        "pyspark.pandas.tests.indexes.test_getattr",
        "pyspark.pandas.tests.indexes.test_name",
        "pyspark.pandas.tests.indexes.test_conversion",
        "pyspark.pandas.tests.indexes.test_drop",
        "pyspark.pandas.tests.indexes.test_level",
        "pyspark.pandas.tests.indexes.test_missing",
        "pyspark.pandas.tests.indexes.test_repeat",
        "pyspark.pandas.tests.indexes.test_sort",
        "pyspark.pandas.tests.indexes.test_stat",
        "pyspark.pandas.tests.indexes.test_symmetric_diff",
        "pyspark.pandas.tests.indexes.test_take",
        "pyspark.pandas.tests.indexes.test_unique",
        "pyspark.pandas.tests.indexes.test_asof",
        "pyspark.pandas.tests.indexes.test_astype",
        "pyspark.pandas.tests.indexes.test_delete",
        "pyspark.pandas.tests.indexes.test_diff",
        "pyspark.pandas.tests.indexes.test_insert",
        "pyspark.pandas.tests.indexes.test_map",
        "pyspark.pandas.tests.indexes.test_append",
        "pyspark.pandas.tests.indexes.test_intersection",
        "pyspark.pandas.tests.indexes.test_monotonic",
        "pyspark.pandas.tests.indexes.test_union",
        "pyspark.pandas.tests.indexes.test_datetime",
        "pyspark.pandas.tests.indexes.test_datetime_at",
        "pyspark.pandas.tests.indexes.test_datetime_between",
        "pyspark.pandas.tests.indexes.test_datetime_ceil",
        "pyspark.pandas.tests.indexes.test_datetime_floor",
        "pyspark.pandas.tests.indexes.test_datetime_iso",
        "pyspark.pandas.tests.indexes.test_datetime_map",
        "pyspark.pandas.tests.indexes.test_datetime_property",
        "pyspark.pandas.tests.indexes.test_datetime_round",
        "pyspark.pandas.tests.indexes.test_align",
        "pyspark.pandas.tests.indexes.test_indexing",
        "pyspark.pandas.tests.indexes.test_indexing_adv",
        "pyspark.pandas.tests.indexes.test_indexing_basic",
        "pyspark.pandas.tests.indexes.test_indexing_iloc",
        "pyspark.pandas.tests.indexes.test_indexing_loc",
        "pyspark.pandas.tests.indexes.test_indexing_loc_2d",
        "pyspark.pandas.tests.indexes.test_indexing_loc_multi_idx",
        "pyspark.pandas.tests.indexes.test_reindex",
        "pyspark.pandas.tests.indexes.test_rename",
        "pyspark.pandas.tests.indexes.test_reset_index",
        "pyspark.pandas.tests.groupby.test_aggregate",
        "pyspark.pandas.tests.groupby.test_apply_func",
        "pyspark.pandas.tests.groupby.test_corr",
        "pyspark.pandas.tests.groupby.test_cumulative",
        "pyspark.pandas.tests.groupby.test_describe",
        "pyspark.pandas.tests.groupby.test_groupby",
        "pyspark.pandas.tests.groupby.test_grouping",
        "pyspark.pandas.tests.groupby.test_head_tail",
        "pyspark.pandas.tests.groupby.test_index",
        "pyspark.pandas.tests.groupby.test_missing",
        "pyspark.pandas.tests.groupby.test_missing_data",
        "pyspark.pandas.tests.groupby.test_nlargest_nsmallest",
        "pyspark.pandas.tests.groupby.test_raises",
        "pyspark.pandas.tests.groupby.test_rank",
        "pyspark.pandas.tests.groupby.test_size",
        "pyspark.pandas.tests.groupby.test_split_apply",
        "pyspark.pandas.tests.groupby.test_split_apply_count",
        "pyspark.pandas.tests.groupby.test_split_apply_first",
        "pyspark.pandas.tests.groupby.test_split_apply_last",
        "pyspark.pandas.tests.groupby.test_split_apply_min_max",
        "pyspark.pandas.tests.groupby.test_split_apply_skew",
        "pyspark.pandas.tests.groupby.test_split_apply_std",
        "pyspark.pandas.tests.groupby.test_split_apply_var",
        "pyspark.pandas.tests.groupby.test_stat",
        "pyspark.pandas.tests.groupby.test_stat_adv",
        "pyspark.pandas.tests.groupby.test_stat_ddof",
        "pyspark.pandas.tests.groupby.test_stat_func",
        "pyspark.pandas.tests.groupby.test_stat_prod",
        "pyspark.pandas.tests.groupby.test_value_counts",
        "pyspark.pandas.tests.diff_frames_ops.test_align",
        "pyspark.pandas.tests.diff_frames_ops.test_arithmetic",
        "pyspark.pandas.tests.diff_frames_ops.test_arithmetic_ext",
        "pyspark.pandas.tests.diff_frames_ops.test_arithmetic_ext_float",
        "pyspark.pandas.tests.diff_frames_ops.test_arithmetic_chain",
        "pyspark.pandas.tests.diff_frames_ops.test_arithmetic_chain_ext",
        "pyspark.pandas.tests.diff_frames_ops.test_arithmetic_chain_ext_float",
        "pyspark.pandas.tests.diff_frames_ops.test_assign_frame",
        "pyspark.pandas.tests.diff_frames_ops.test_assign_series",
        "pyspark.pandas.tests.diff_frames_ops.test_basic",
        "pyspark.pandas.tests.diff_frames_ops.test_bitwise",
        "pyspark.pandas.tests.diff_frames_ops.test_combine_first",
        "pyspark.pandas.tests.diff_frames_ops.test_compare_series",
        "pyspark.pandas.tests.diff_frames_ops.test_concat_inner",
        "pyspark.pandas.tests.diff_frames_ops.test_concat_outer",
        "pyspark.pandas.tests.diff_frames_ops.test_basic_slow",
        "pyspark.pandas.tests.diff_frames_ops.test_cov",
        "pyspark.pandas.tests.diff_frames_ops.test_corrwith",
        "pyspark.pandas.tests.diff_frames_ops.test_dot_frame",
        "pyspark.pandas.tests.diff_frames_ops.test_dot_series",
        "pyspark.pandas.tests.diff_frames_ops.test_error",
        "pyspark.pandas.tests.diff_frames_ops.test_index",
        "pyspark.pandas.tests.diff_frames_ops.test_series",
        "pyspark.pandas.tests.diff_frames_ops.test_setitem_frame",
        "pyspark.pandas.tests.diff_frames_ops.test_setitem_series",
        "pyspark.pandas.tests.diff_frames_ops.test_groupby",
        "pyspark.pandas.tests.diff_frames_ops.test_groupby_aggregate",
        "pyspark.pandas.tests.diff_frames_ops.test_groupby_apply",
        "pyspark.pandas.tests.diff_frames_ops.test_groupby_cumulative",
        "pyspark.pandas.tests.diff_frames_ops.test_groupby_diff",
        "pyspark.pandas.tests.diff_frames_ops.test_groupby_diff_len",
        "pyspark.pandas.tests.diff_frames_ops.test_groupby_fillna",
        "pyspark.pandas.tests.diff_frames_ops.test_groupby_filter",
        "pyspark.pandas.tests.diff_frames_ops.test_groupby_shift",
        "pyspark.pandas.tests.diff_frames_ops.test_groupby_split_apply_combine",
        "pyspark.pandas.tests.diff_frames_ops.test_groupby_transform",
        "pyspark.pandas.tests.diff_frames_ops.test_groupby_expanding",
        "pyspark.pandas.tests.diff_frames_ops.test_groupby_expanding_adv",
        "pyspark.pandas.tests.diff_frames_ops.test_groupby_expanding_count",
        "pyspark.pandas.tests.diff_frames_ops.test_groupby_rolling",
        "pyspark.pandas.tests.diff_frames_ops.test_groupby_rolling_adv",
        "pyspark.pandas.tests.diff_frames_ops.test_groupby_rolling_count",
    ],
    excluded_python_implementations=[
        "PyPy"  # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and
        # they aren't available there
    ],
)

pyspark_connect = Module(
    name="pyspark-connect",
    dependencies=[pyspark_sql, connect],
    source_file_regexes=[
        "python/pyspark/sql/connect",
    ],
    python_test_goals=[
        # sql doctests
        "pyspark.sql.connect.catalog",
        "pyspark.sql.connect.conf",
        "pyspark.sql.connect.group",
        "pyspark.sql.connect.session",
        "pyspark.sql.connect.window",
        "pyspark.sql.connect.column",
        "pyspark.sql.connect.merge",
        "pyspark.sql.connect.readwriter",
        "pyspark.sql.connect.dataframe",
        "pyspark.sql.connect.functions.builtin",
        "pyspark.sql.connect.functions.partitioning",
        "pyspark.sql.connect.observation",
        "pyspark.sql.connect.avro.functions",
        "pyspark.sql.connect.protobuf.functions",
        "pyspark.sql.connect.streaming.readwriter",
        "pyspark.sql.connect.streaming.query",
        "pyspark.sql.connect.tvf",
        # sql unittests
        "pyspark.sql.tests.connect.test_connect_plan",
        "pyspark.sql.tests.connect.test_connect_basic",
        "pyspark.sql.tests.connect.test_connect_dataframe_property",
        "pyspark.sql.tests.connect.test_connect_channel",
        "pyspark.sql.tests.connect.test_connect_error",
        "pyspark.sql.tests.connect.test_connect_function",
        "pyspark.sql.tests.connect.test_connect_collection",
        "pyspark.sql.tests.connect.test_connect_column",
        "pyspark.sql.tests.connect.test_connect_creation",
        "pyspark.sql.tests.connect.test_connect_readwriter",
        "pyspark.sql.tests.connect.test_connect_retry",
        "pyspark.sql.tests.connect.test_connect_session",
        "pyspark.sql.tests.connect.test_connect_stat",
        "pyspark.sql.tests.connect.test_parity_datasources",
        "pyspark.sql.tests.connect.test_parity_errors",
        "pyspark.sql.tests.connect.test_parity_catalog",
        "pyspark.sql.tests.connect.test_parity_conf",
        "pyspark.sql.tests.connect.test_parity_serde",
        "pyspark.sql.tests.connect.test_parity_functions",
        "pyspark.sql.tests.connect.test_parity_group",
        "pyspark.sql.tests.connect.test_parity_sql",
        "pyspark.sql.tests.connect.test_parity_dataframe",
        "pyspark.sql.tests.connect.test_parity_collection",
        "pyspark.sql.tests.connect.test_parity_creation",
        "pyspark.sql.tests.connect.test_parity_observation",
        "pyspark.sql.tests.connect.test_parity_repartition",
        "pyspark.sql.tests.connect.test_parity_stat",
        "pyspark.sql.tests.connect.test_parity_subquery",
        "pyspark.sql.tests.connect.test_parity_types",
        "pyspark.sql.tests.connect.test_parity_column",
        "pyspark.sql.tests.connect.test_parity_readwriter",
        "pyspark.sql.tests.connect.test_parity_udf",
        "pyspark.sql.tests.connect.test_parity_udf_profiler",
        "pyspark.sql.tests.connect.test_parity_memory_profiler",
        "pyspark.sql.tests.connect.test_parity_udtf",
        "pyspark.sql.tests.connect.test_parity_tvf",
        "pyspark.sql.tests.connect.test_parity_python_datasource",
        "pyspark.sql.tests.connect.test_parity_python_streaming_datasource",
        "pyspark.sql.tests.connect.test_parity_frame_plot",
        "pyspark.sql.tests.connect.test_parity_frame_plot_plotly",
        "pyspark.sql.tests.connect.test_utils",
        "pyspark.sql.tests.connect.client.test_artifact",
        "pyspark.sql.tests.connect.client.test_artifact_localcluster",
        "pyspark.sql.tests.connect.client.test_client",
        "pyspark.sql.tests.connect.client.test_reattach",
        "pyspark.sql.tests.connect.streaming.test_parity_streaming",
        "pyspark.sql.tests.connect.streaming.test_parity_listener",
        "pyspark.sql.tests.connect.streaming.test_parity_foreach",
        "pyspark.sql.tests.connect.streaming.test_parity_foreach_batch",
        "pyspark.sql.tests.connect.test_resources",
        "pyspark.sql.tests.connect.shell.test_progress",
        "pyspark.sql.tests.connect.test_df_debug",
        "pyspark.sql.tests.connect.arrow.test_parity_arrow",
        "pyspark.sql.tests.connect.arrow.test_parity_arrow_map",
        "pyspark.sql.tests.connect.arrow.test_parity_arrow_grouped_map",
        "pyspark.sql.tests.connect.arrow.test_parity_arrow_cogrouped_map",
        "pyspark.sql.tests.connect.arrow.test_parity_arrow_python_udf",
        "pyspark.sql.tests.connect.pandas.test_parity_pandas_map",
        "pyspark.sql.tests.connect.pandas.test_parity_pandas_grouped_map",
        "pyspark.sql.tests.connect.pandas.test_parity_pandas_grouped_map_with_state",
        "pyspark.sql.tests.connect.pandas.test_parity_pandas_cogrouped_map",
        "pyspark.sql.tests.connect.pandas.test_parity_pandas_udf",
        "pyspark.sql.tests.connect.pandas.test_parity_pandas_udf_scalar",
        "pyspark.sql.tests.connect.pandas.test_parity_pandas_udf_grouped_agg",
        "pyspark.sql.tests.connect.pandas.test_parity_pandas_udf_window",
        "pyspark.sql.tests.connect.pandas.test_parity_pandas_transform_with_state",
    ],
    excluded_python_implementations=[
        "PyPy"  # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and
        # they aren't available there
    ],
)


pyspark_ml_connect = Module(
    name="pyspark-ml-connect",
    dependencies=[pyspark_connect, pyspark_ml],
    source_file_regexes=[
        "python/pyspark/ml/connect",
    ],
    python_test_goals=[
        # ml doctests
        "pyspark.ml.connect.functions",
        # ml unittests
        "pyspark.ml.tests.connect.test_connect_cache",
        "pyspark.ml.tests.connect.test_connect_function",
        "pyspark.ml.tests.connect.test_parity_torch_distributor",
        "pyspark.ml.tests.connect.test_parity_torch_data_loader",
        "pyspark.ml.tests.connect.test_connect_summarizer",
        "pyspark.ml.tests.connect.test_connect_evaluation",
        "pyspark.ml.tests.connect.test_connect_feature",
        "pyspark.ml.tests.connect.test_connect_classification",
        "pyspark.ml.tests.connect.test_connect_pipeline",
        "pyspark.ml.tests.connect.test_connect_tuning",
        "pyspark.ml.tests.connect.test_parity_als",
        "pyspark.ml.tests.connect.test_parity_fpm",
        "pyspark.ml.tests.connect.test_parity_classification",
        "pyspark.ml.tests.connect.test_parity_regression",
        "pyspark.ml.tests.connect.test_parity_clustering",
        "pyspark.ml.tests.connect.test_parity_evaluation",
        "pyspark.ml.tests.connect.test_parity_feature",
        "pyspark.ml.tests.connect.test_parity_functions",
        "pyspark.ml.tests.connect.test_parity_pipeline",
        "pyspark.ml.tests.connect.test_parity_tuning",
        "pyspark.ml.tests.connect.test_parity_ovr",
        "pyspark.ml.tests.connect.test_parity_stat",
    ],
    excluded_python_implementations=[
        "PyPy"  # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and
        # they aren't available there
    ],
)


pyspark_pandas_connect_part0 = Module(
    name="pyspark-pandas-connect-part0",
    dependencies=[pyspark_connect, pyspark_pandas, pyspark_pandas_slow],
    source_file_regexes=[
        "python/pyspark/pandas",
    ],
    python_test_goals=[
        # unittests dedicated for Spark Connect
        "pyspark.pandas.tests.connect.test_connect_plotting",
        # pandas-on-Spark unittests
        "pyspark.pandas.tests.connect.test_parity_categorical",
        "pyspark.pandas.tests.connect.test_parity_config",
        "pyspark.pandas.tests.connect.test_parity_extension",
        "pyspark.pandas.tests.connect.test_parity_frame_spark",
        "pyspark.pandas.tests.connect.test_parity_generic_functions",
        "pyspark.pandas.tests.connect.test_parity_indexops_spark",
        "pyspark.pandas.tests.connect.test_parity_internal",
        "pyspark.pandas.tests.connect.test_parity_namespace",
        "pyspark.pandas.tests.connect.test_parity_numpy_compat",
        "pyspark.pandas.tests.connect.test_parity_repr",
        "pyspark.pandas.tests.connect.test_parity_scalars",
        "pyspark.pandas.tests.connect.test_parity_spark_functions",
        "pyspark.pandas.tests.connect.test_parity_sql",
        "pyspark.pandas.tests.connect.test_parity_typedef",
        "pyspark.pandas.tests.connect.test_parity_utils",
        "pyspark.pandas.tests.connect.data_type_ops.test_parity_as_type",
        "pyspark.pandas.tests.connect.data_type_ops.test_parity_base",
        "pyspark.pandas.tests.connect.data_type_ops.test_parity_binary_ops",
        "pyspark.pandas.tests.connect.data_type_ops.test_parity_boolean_ops",
        "pyspark.pandas.tests.connect.data_type_ops.test_parity_categorical_ops",
        "pyspark.pandas.tests.connect.data_type_ops.test_parity_complex_ops",
        "pyspark.pandas.tests.connect.data_type_ops.test_parity_date_ops",
        "pyspark.pandas.tests.connect.data_type_ops.test_parity_datetime_ops",
        "pyspark.pandas.tests.connect.data_type_ops.test_parity_null_ops",
        "pyspark.pandas.tests.connect.data_type_ops.test_parity_num_ops",
        "pyspark.pandas.tests.connect.data_type_ops.test_parity_num_reverse",
        "pyspark.pandas.tests.connect.data_type_ops.test_parity_string_ops",
        "pyspark.pandas.tests.connect.data_type_ops.test_parity_udt_ops",
        "pyspark.pandas.tests.connect.data_type_ops.test_parity_timedelta_ops",
        "pyspark.pandas.tests.connect.plot.test_parity_frame_plot",
        "pyspark.pandas.tests.connect.plot.test_parity_frame_plot_matplotlib",
        "pyspark.pandas.tests.connect.plot.test_parity_frame_plot_plotly",
        "pyspark.pandas.tests.connect.plot.test_parity_series_plot",
        "pyspark.pandas.tests.connect.plot.test_parity_series_plot_matplotlib",
        "pyspark.pandas.tests.connect.plot.test_parity_series_plot_plotly",
        "pyspark.pandas.tests.connect.indexes.test_parity_default",
        "pyspark.pandas.tests.connect.indexes.test_parity_category",
        "pyspark.pandas.tests.connect.indexes.test_parity_timedelta",
        "pyspark.pandas.tests.connect.indexes.test_parity_basic",
        "pyspark.pandas.tests.connect.indexes.test_parity_getattr",
        "pyspark.pandas.tests.connect.indexes.test_parity_name",
        "pyspark.pandas.tests.connect.indexes.test_parity_conversion",
        "pyspark.pandas.tests.connect.indexes.test_parity_drop",
        "pyspark.pandas.tests.connect.indexes.test_parity_level",
        "pyspark.pandas.tests.connect.indexes.test_parity_missing",
        "pyspark.pandas.tests.connect.indexes.test_parity_repeat",
        "pyspark.pandas.tests.connect.indexes.test_parity_sort",
        "pyspark.pandas.tests.connect.indexes.test_parity_stat",
        "pyspark.pandas.tests.connect.indexes.test_parity_symmetric_diff",
        "pyspark.pandas.tests.connect.indexes.test_parity_take",
        "pyspark.pandas.tests.connect.indexes.test_parity_unique",
        "pyspark.pandas.tests.connect.indexes.test_parity_asof",
        "pyspark.pandas.tests.connect.indexes.test_parity_astype",
        "pyspark.pandas.tests.connect.indexes.test_parity_delete",
        "pyspark.pandas.tests.connect.indexes.test_parity_diff",
        "pyspark.pandas.tests.connect.indexes.test_parity_insert",
        "pyspark.pandas.tests.connect.indexes.test_parity_map",
        "pyspark.pandas.tests.connect.indexes.test_parity_align",
        "pyspark.pandas.tests.connect.indexes.test_parity_indexing",
        "pyspark.pandas.tests.connect.indexes.test_parity_indexing_adv",
        "pyspark.pandas.tests.connect.indexes.test_parity_indexing_basic",
        "pyspark.pandas.tests.connect.indexes.test_parity_indexing_iloc",
        "pyspark.pandas.tests.connect.indexes.test_parity_indexing_loc",
        "pyspark.pandas.tests.connect.indexes.test_parity_indexing_loc_2d",
        "pyspark.pandas.tests.connect.indexes.test_parity_indexing_loc_multi_idx",
        "pyspark.pandas.tests.connect.indexes.test_parity_reindex",
        "pyspark.pandas.tests.connect.indexes.test_parity_rename",
        "pyspark.pandas.tests.connect.indexes.test_parity_reset_index",
        "pyspark.pandas.tests.connect.indexes.test_parity_datetime",
        "pyspark.pandas.tests.connect.indexes.test_parity_datetime_at",
        "pyspark.pandas.tests.connect.indexes.test_parity_datetime_between",
        "pyspark.pandas.tests.connect.computation.test_parity_any_all",
        "pyspark.pandas.tests.connect.computation.test_parity_apply_func",
        "pyspark.pandas.tests.connect.computation.test_parity_binary_ops",
        "pyspark.pandas.tests.connect.computation.test_parity_combine",
        "pyspark.pandas.tests.connect.computation.test_parity_compute",
        "pyspark.pandas.tests.connect.computation.test_parity_cov",
        "pyspark.pandas.tests.connect.computation.test_parity_corr",
        "pyspark.pandas.tests.connect.computation.test_parity_corrwith",
        "pyspark.pandas.tests.connect.computation.test_parity_cumulative",
        "pyspark.pandas.tests.connect.computation.test_parity_describe",
        "pyspark.pandas.tests.connect.computation.test_parity_eval",
        "pyspark.pandas.tests.connect.computation.test_parity_melt",
        "pyspark.pandas.tests.connect.computation.test_parity_missing_data",
        "pyspark.pandas.tests.connect.groupby.test_parity_stat",
        "pyspark.pandas.tests.connect.groupby.test_parity_stat_adv",
        "pyspark.pandas.tests.connect.groupby.test_parity_stat_ddof",
        "pyspark.pandas.tests.connect.groupby.test_parity_stat_func",
        "pyspark.pandas.tests.connect.groupby.test_parity_stat_prod",
    ],
    excluded_python_implementations=[
        "PyPy"  # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and
        # they aren't available there
    ],
)

pyspark_pandas_connect_part1 = Module(
    name="pyspark-pandas-connect-part1",
    dependencies=[pyspark_connect, pyspark_pandas, pyspark_pandas_slow],
    source_file_regexes=[
        "python/pyspark/pandas",
    ],
    python_test_goals=[
        # pandas-on-Spark unittests
        "pyspark.pandas.tests.connect.frame.test_parity_attrs",
        "pyspark.pandas.tests.connect.frame.test_parity_axis",
        "pyspark.pandas.tests.connect.frame.test_parity_constructor",
        "pyspark.pandas.tests.connect.frame.test_parity_conversion",
        "pyspark.pandas.tests.connect.frame.test_parity_reindexing",
        "pyspark.pandas.tests.connect.frame.test_parity_reshaping",
        "pyspark.pandas.tests.connect.frame.test_parity_spark",
        "pyspark.pandas.tests.connect.frame.test_parity_take",
        "pyspark.pandas.tests.connect.frame.test_parity_take_adv",
        "pyspark.pandas.tests.connect.frame.test_parity_time_series",
        "pyspark.pandas.tests.connect.frame.test_parity_truncate",
        "pyspark.pandas.tests.connect.groupby.test_parity_aggregate",
        "pyspark.pandas.tests.connect.groupby.test_parity_apply_func",
        "pyspark.pandas.tests.connect.groupby.test_parity_corr",
        "pyspark.pandas.tests.connect.groupby.test_parity_cumulative",
        "pyspark.pandas.tests.connect.groupby.test_parity_missing_data",
        "pyspark.pandas.tests.connect.groupby.test_parity_split_apply",
        "pyspark.pandas.tests.connect.groupby.test_parity_split_apply_count",
        "pyspark.pandas.tests.connect.groupby.test_parity_split_apply_first",
        "pyspark.pandas.tests.connect.groupby.test_parity_split_apply_last",
        "pyspark.pandas.tests.connect.groupby.test_parity_split_apply_min_max",
        "pyspark.pandas.tests.connect.groupby.test_parity_split_apply_skew",
        "pyspark.pandas.tests.connect.groupby.test_parity_split_apply_std",
        "pyspark.pandas.tests.connect.groupby.test_parity_split_apply_var",
        "pyspark.pandas.tests.connect.series.test_parity_datetime",
        "pyspark.pandas.tests.connect.series.test_parity_string_ops_adv",
        "pyspark.pandas.tests.connect.series.test_parity_string_ops_basic",
        "pyspark.pandas.tests.connect.series.test_parity_all_any",
        "pyspark.pandas.tests.connect.series.test_parity_arg_ops",
        "pyspark.pandas.tests.connect.series.test_parity_as_of",
        "pyspark.pandas.tests.connect.series.test_parity_as_type",
        "pyspark.pandas.tests.connect.series.test_parity_compute",
        "pyspark.pandas.tests.connect.series.test_parity_conversion",
        "pyspark.pandas.tests.connect.series.test_parity_cumulative",
        "pyspark.pandas.tests.connect.series.test_parity_index",
        "pyspark.pandas.tests.connect.series.test_parity_missing_data",
        "pyspark.pandas.tests.connect.series.test_parity_series",
        "pyspark.pandas.tests.connect.series.test_parity_sort",
        "pyspark.pandas.tests.connect.series.test_parity_stat",
        "pyspark.pandas.tests.connect.series.test_parity_interpolate",
        "pyspark.pandas.tests.connect.data_type_ops.test_parity_num_arithmetic",
        "pyspark.pandas.tests.connect.data_type_ops.test_parity_num_mod",
        "pyspark.pandas.tests.connect.data_type_ops.test_parity_num_mul_div",
        "pyspark.pandas.tests.connect.data_type_ops.test_parity_num_pow",
        "pyspark.pandas.tests.connect.reshape.test_parity_get_dummies",
        "pyspark.pandas.tests.connect.reshape.test_parity_get_dummies_kwargs",
        "pyspark.pandas.tests.connect.reshape.test_parity_get_dummies_multiindex",
        "pyspark.pandas.tests.connect.reshape.test_parity_get_dummies_object",
        "pyspark.pandas.tests.connect.reshape.test_parity_get_dummies_prefix",
        "pyspark.pandas.tests.connect.reshape.test_parity_merge_asof",
        "pyspark.pandas.tests.connect.indexes.test_parity_append",
        "pyspark.pandas.tests.connect.indexes.test_parity_intersection",
        "pyspark.pandas.tests.connect.indexes.test_parity_monotonic",
        "pyspark.pandas.tests.connect.indexes.test_parity_union",
        "pyspark.pandas.tests.connect.indexes.test_parity_datetime_ceil",
        "pyspark.pandas.tests.connect.indexes.test_parity_datetime_floor",
        "pyspark.pandas.tests.connect.indexes.test_parity_datetime_iso",
        "pyspark.pandas.tests.connect.indexes.test_parity_datetime_map",
        "pyspark.pandas.tests.connect.indexes.test_parity_datetime_property",
        "pyspark.pandas.tests.connect.indexes.test_parity_datetime_round",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_shift",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_transform",
        # fallback
        "pyspark.pandas.tests.connect.frame.test_parity_asfreq",
        "pyspark.pandas.tests.connect.frame.test_parity_asof",
    ],
    excluded_python_implementations=[
        "PyPy"  # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and
        # they aren't available there
    ],
)


pyspark_pandas_connect_part2 = Module(
    name="pyspark-pandas-connect-part2",
    dependencies=[pyspark_connect, pyspark_pandas, pyspark_pandas_slow],
    source_file_regexes=[
        "python/pyspark/pandas",
    ],
    python_test_goals=[
        # pandas-on-Spark unittests
        "pyspark.pandas.tests.connect.computation.test_parity_pivot",
        "pyspark.pandas.tests.connect.computation.test_parity_pivot_table",
        "pyspark.pandas.tests.connect.computation.test_parity_pivot_table_adv",
        "pyspark.pandas.tests.connect.computation.test_parity_pivot_table_multi_idx",
        "pyspark.pandas.tests.connect.computation.test_parity_pivot_table_multi_idx_adv",
        "pyspark.pandas.tests.connect.computation.test_parity_stats",
        "pyspark.pandas.tests.connect.frame.test_parity_interpolate",
        "pyspark.pandas.tests.connect.frame.test_parity_interpolate_error",
        "pyspark.pandas.tests.connect.resample.test_parity_frame",
        "pyspark.pandas.tests.connect.resample.test_parity_series",
        "pyspark.pandas.tests.connect.resample.test_parity_error",
        "pyspark.pandas.tests.connect.resample.test_parity_missing",
        "pyspark.pandas.tests.connect.resample.test_parity_on",
        "pyspark.pandas.tests.connect.resample.test_parity_timezone",
        "pyspark.pandas.tests.connect.window.test_parity_ewm_error",
        "pyspark.pandas.tests.connect.window.test_parity_ewm_mean",
        "pyspark.pandas.tests.connect.window.test_parity_groupby_ewm_mean",
        "pyspark.pandas.tests.connect.window.test_parity_missing",
        "pyspark.pandas.tests.connect.window.test_parity_rolling",
        "pyspark.pandas.tests.connect.window.test_parity_rolling_adv",
        "pyspark.pandas.tests.connect.window.test_parity_rolling_count",
        "pyspark.pandas.tests.connect.window.test_parity_rolling_error",
        "pyspark.pandas.tests.connect.window.test_parity_groupby_rolling",
        "pyspark.pandas.tests.connect.window.test_parity_groupby_rolling_adv",
        "pyspark.pandas.tests.connect.window.test_parity_groupby_rolling_count",
        "pyspark.pandas.tests.connect.window.test_parity_expanding",
        "pyspark.pandas.tests.connect.window.test_parity_expanding_adv",
        "pyspark.pandas.tests.connect.window.test_parity_expanding_error",
        "pyspark.pandas.tests.connect.window.test_parity_groupby_expanding",
        "pyspark.pandas.tests.connect.window.test_parity_groupby_expanding_adv",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_rolling",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_rolling_adv",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_rolling_count",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_dot_frame",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_dot_series",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_error",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_align",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_basic_slow",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_cov",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_corrwith",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_index",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_series",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_setitem_frame",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_setitem_series",
        "pyspark.pandas.tests.connect.groupby.test_parity_index",
        "pyspark.pandas.tests.connect.groupby.test_parity_describe",
        "pyspark.pandas.tests.connect.groupby.test_parity_head_tail",
        "pyspark.pandas.tests.connect.groupby.test_parity_groupby",
        "pyspark.pandas.tests.connect.groupby.test_parity_grouping",
        "pyspark.pandas.tests.connect.groupby.test_parity_missing",
        "pyspark.pandas.tests.connect.groupby.test_parity_nlargest_nsmallest",
        "pyspark.pandas.tests.connect.groupby.test_parity_raises",
        "pyspark.pandas.tests.connect.groupby.test_parity_rank",
        "pyspark.pandas.tests.connect.groupby.test_parity_size",
        "pyspark.pandas.tests.connect.groupby.test_parity_value_counts",
    ],
    excluded_python_implementations=[
        "PyPy"  # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and
        # they aren't available there
    ],
)


pyspark_pandas_connect_part3 = Module(
    name="pyspark-pandas-connect-part3",
    dependencies=[pyspark_connect, pyspark_pandas, pyspark_pandas_slow],
    source_file_regexes=[
        "python/pyspark/pandas",
    ],
    python_test_goals=[
        # pandas-on-Spark unittests
        "pyspark.pandas.tests.connect.io.test_parity_io",
        "pyspark.pandas.tests.connect.io.test_parity_csv",
        "pyspark.pandas.tests.connect.io.test_parity_feather",
        "pyspark.pandas.tests.connect.io.test_parity_stata",
        "pyspark.pandas.tests.connect.io.test_parity_dataframe_conversion",
        "pyspark.pandas.tests.connect.io.test_parity_dataframe_spark_io",
        "pyspark.pandas.tests.connect.io.test_parity_series_conversion",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_arithmetic",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_arithmetic_ext",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_arithmetic_ext_float",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_arithmetic_chain",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_arithmetic_chain_ext",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_arithmetic_chain_ext_float",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_assign_frame",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_assign_series",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_basic",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_bitwise",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_combine_first",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_compare_series",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_concat_inner",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_concat_outer",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_aggregate",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_apply",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_cumulative",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_diff",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_diff_len",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_fillna",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_filter",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_split_apply_combine",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_expanding",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_expanding_adv",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_expanding_count",
    ],
    excluded_python_implementations=[
        "PyPy"  # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and
        # they aren't available there
    ],
)


pyspark_errors = Module(
    name="pyspark-errors",
    dependencies=[],
    source_file_regexes=[
        # SPARK-44544: Force the execution of pyspark_errors whenever there are any changes
        # in PySpark, since the Python packaging tests are only enabled within this module.
        # This module is the smallest Python test module: it contains only a few small test
        # files and normally takes < 2 seconds, so the additional cost is small.
        "python/",
        "python/pyspark/errors",
    ],
    python_test_goals=[
        # unittests
        "pyspark.errors.tests.test_connect_errors_conversion",
        "pyspark.errors.tests.test_errors",
        "pyspark.errors.tests.test_traceback",
        "pyspark.errors.tests.connect.test_parity_traceback",
    ],
)
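

# As the SPARK-44544 note above relies on, each source_file_regexes entry is
# matched at the beginning of a changed file's path, so the broad "python/"
# prefix selects this module for any PySpark change. A minimal illustrative
# sketch of that matching (a hypothetical helper, not used by this script):
def _example_module_owns_file(module, filename):
    """Illustrative sketch: True if `filename` falls under `module`."""
    return any(re.match(prefix, filename) for prefix in module.source_file_prefixes)


# For example, with the definitions above,
# _example_module_owns_file(pyspark_errors, "python/pyspark/sql/utils.py")
# is True solely because of the "python/" prefix.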

pyspark_logger = Module(
    name="pyspark-logger",
    dependencies=[],
    source_file_regexes=["python/pyspark/logger"],
    python_test_goals=[
        # doctests
        "pyspark.logger.logger",
        # unittests
        "pyspark.logger.tests.test_logger",
        "pyspark.logger.tests.connect.test_parity_logger",
    ],
)


sparkr = Module(
    name="sparkr",
    dependencies=[hive, mllib],
    source_file_regexes=[
        "R/",
    ],
    should_run_r_tests=True,
)


docs = Module(
    name="docs",
    dependencies=[],
    source_file_regexes=[
        "docs/",
    ],
)

build = Module(
    name="build",
    dependencies=[],
    source_file_regexes=[
        ".*pom.xml",
        "dev/test-dependencies.sh",
    ],
    should_run_build_tests=True,
)

yarn = Module(
    name="yarn",
    dependencies=[],
    source_file_regexes=[
        "resource-managers/yarn/",
        "common/network-yarn/",
    ],
    build_profile_flags=["-Pyarn"],
    sbt_test_goals=[
        "yarn/test",
        "network-yarn/test",
    ],
    test_tags=["org.apache.spark.tags.ExtendedYarnTest"],
)

kubernetes = Module(
    name="kubernetes",
    dependencies=[],
    source_file_regexes=["resource-managers/kubernetes"],
    build_profile_flags=["-Pkubernetes", "-Pvolcano"],
    sbt_test_goals=["kubernetes/test"],
)

hadoop_cloud = Module(
    name="hadoop-cloud",
    dependencies=[],
    source_file_regexes=["hadoop-cloud"],
    build_profile_flags=["-Phadoop-cloud"],
    sbt_test_goals=["hadoop-cloud/test"],
)

spark_ganglia_lgpl = Module(
    name="spark-ganglia-lgpl",
    dependencies=[],
    build_profile_flags=["-Pspark-ganglia-lgpl"],
    source_file_regexes=[
        "connector/spark-ganglia-lgpl",
    ],
)

docker_integration_tests = Module(
    name="docker-integration-tests",
    dependencies=[sql],
    build_profile_flags=["-Pdocker-integration-tests"],
    source_file_regexes=["connector/docker-integration-tests"],
    sbt_test_goals=["docker-integration-tests/test"],
    # Enable the Docker integration tests only when running on GitHub Actions.
    environ={"ENABLE_DOCKER_INTEGRATION_TESTS": "1"}
    if "GITHUB_ACTIONS" in os.environ
    else None,
    test_tags=["org.apache.spark.tags.DockerTest"],
)

# The root module is a dummy module used to run all of the tests.
# No other modules should directly depend on this module.
root = Module(
    name="root",
    dependencies=[build, core],  # Changes to build should trigger all tests.
    source_file_regexes=[],
    # In order to run all of the tests, enable every test profile:
    build_profile_flags=list(
        set(itertools.chain.from_iterable(m.build_profile_flags for m in all_modules))
    ),
    sbt_test_goals=[
        "test",
    ],
    python_test_goals=list(itertools.chain.from_iterable(m.python_test_goals for m in all_modules)),
    should_run_r_tests=True,
    should_run_build_tests=True,
)
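

# A note on the aggregation above: assuming each Module registers itself in
# all_modules at construction time (which the aggregations above rely on),
# root's build_profile_flags end up as the deduplicated union of every
# module's flags, e.g. "-Pyarn", "-Pkubernetes", "-Pvolcano", "-Phadoop-cloud",
# and "-Pspark-ganglia-lgpl" from the modules in this file. The set() wrapper
# deduplicates, so the resulting order is unspecified.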
