# dev/sparktestsupport/modules.py (1,409 lines of code) (raw):
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from functools import total_ordering
import itertools
import os
import re
# Registry of every Module ever constructed, in definition order.
# Populated as a side effect of Module.__init__.
all_modules = []


@total_ordering
class Module(object):
    """
    A module is the basic abstraction in our test runner script. Each module consists of a set
    of source files, a set of test commands, and a set of dependencies on other modules. We use
    modules to define a dependency graph that let us determine which tests to run based on which
    files have changed.
    """

    def __init__(
        self,
        name,
        dependencies,
        source_file_regexes,
        build_profile_flags=(),
        environ=None,
        sbt_test_goals=(),
        python_test_goals=(),
        excluded_python_implementations=(),
        test_tags=(),
        should_run_r_tests=False,
        should_run_build_tests=False,
    ):
        """
        Define a new module.

        :param name: A short module name, for display in logging and error messages.
        :param dependencies: A set of dependencies for this module. This should only include direct
            dependencies; transitive dependencies are resolved automatically.
        :param source_file_regexes: a set of regexes that match source files belonging to this
            module. These regexes are applied by attempting to match at the beginning of the
            filename strings.
        :param build_profile_flags: A set of profile flags that should be passed to Maven or SBT in
            order to build and test this module (e.g. '-PprofileName').
        :param environ: A dict of environment variables that should be set when files in this
            module are changed.
        :param sbt_test_goals: A set of SBT test goals for testing this module.
        :param python_test_goals: A set of Python test goals for testing this module.
        :param excluded_python_implementations: A set of Python implementations that are not
            supported by this module's Python components. The values in this set should match
            strings returned by Python's `platform.python_implementation()`.
        :param test_tags: A set of tags that will be excluded when running unit tests if the module
            is not explicitly changed.
        :param should_run_r_tests: If true, changes in this module will trigger all R tests.
        :param should_run_build_tests: If true, changes in this module will trigger build tests.
        """
        self.name = name
        self.dependencies = dependencies
        self.source_file_prefixes = source_file_regexes
        self.sbt_test_goals = sbt_test_goals
        self.build_profile_flags = build_profile_flags
        # Normalize a missing environ to a fresh dict so every instance owns
        # its own (no shared mutable default).
        self.environ = environ or {}
        self.python_test_goals = python_test_goals
        self.excluded_python_implementations = excluded_python_implementations
        self.test_tags = test_tags
        self.should_run_r_tests = should_run_r_tests
        self.should_run_build_tests = should_run_build_tests

        # Maintain the reverse edges of the dependency graph: each direct
        # dependency records this module as a dependent.
        self.dependent_modules = set()
        for dep in dependencies:
            dep.dependent_modules.add(self)
        all_modules.append(self)

    def contains_file(self, filename):
        """Return True if ``filename`` matches one of this module's source regexes.

        ``re.match`` anchors at the start of the string, so the patterns
        effectively act as path prefixes.
        """
        return any(re.match(p, filename) for p in self.source_file_prefixes)

    def __repr__(self):
        return "Module<%s>" % self.name

    # Identity, ordering, and hashing are all based on the module name only.
    # @total_ordering fills in <=, >, >= from __lt__ and __eq__, and Python 3
    # derives __ne__ from __eq__ automatically (the explicit __ne__ that used
    # to live here was redundant).
    def __lt__(self, other):
        return self.name < other.name

    def __eq__(self, other):
        return self.name == other.name

    def __hash__(self):
        return hash(self.name)
# ---------------------------------------------------------------------------
# JVM modules under common/ and launcher/. These are the leaves of the
# dependency graph; later modules reference them, so definition order matters.
# ---------------------------------------------------------------------------

tags = Module(
    name="tags",
    dependencies=[],
    source_file_regexes=[
        "common/tags/",
    ],
)

utils = Module(
    name="utils",
    dependencies=[tags],
    source_file_regexes=[
        "common/utils/",
    ],
    sbt_test_goals=[
        "common-utils/test",
    ],
)

kvstore = Module(
    name="kvstore",
    dependencies=[tags],
    source_file_regexes=[
        "common/kvstore/",
    ],
    sbt_test_goals=[
        "kvstore/test",
    ],
)

network_common = Module(
    name="network-common",
    dependencies=[tags, utils],
    source_file_regexes=[
        "common/network-common/",
    ],
    sbt_test_goals=[
        "network-common/test",
    ],
)

network_shuffle = Module(
    name="network-shuffle",
    dependencies=[tags],
    source_file_regexes=[
        "common/network-shuffle/",
    ],
    sbt_test_goals=[
        "network-shuffle/test",
    ],
)

unsafe = Module(
    name="unsafe",
    dependencies=[tags, utils],
    source_file_regexes=[
        "common/unsafe",
    ],
    sbt_test_goals=[
        "unsafe/test",
    ],
)

launcher = Module(
    name="launcher",
    dependencies=[tags],
    source_file_regexes=[
        "launcher/",
    ],
    sbt_test_goals=[
        "launcher/test",
    ],
)

sketch = Module(
    name="sketch",
    dependencies=[tags],
    source_file_regexes=[
        "common/sketch/",
    ],
    sbt_test_goals=["sketch/test"],
)

variant = Module(
    name="variant",
    dependencies=[tags],
    source_file_regexes=[
        "common/variant/",
    ],
    sbt_test_goals=["variant/test"],
)
# ---------------------------------------------------------------------------
# Core and SQL-stack JVM modules (core -> catalyst -> sql -> hive -> ...).
# ---------------------------------------------------------------------------

core = Module(
    name="core",
    dependencies=[kvstore, network_common, network_shuffle, unsafe, launcher, utils],
    source_file_regexes=[
        "core/",
    ],
    sbt_test_goals=[
        "core/test",
    ],
)

api = Module(
    name="api",
    dependencies=[utils, unsafe],
    source_file_regexes=[
        "sql/api/",
    ],
)

catalyst = Module(
    name="catalyst",
    dependencies=[tags, sketch, variant, core, api],
    source_file_regexes=[
        "sql/catalyst/",
    ],
    sbt_test_goals=[
        "catalyst/test",
    ],
    # When running on GitHub Actions, additionally enable the Docker-based
    # integration tests via this environment variable.
    environ=None
    if "GITHUB_ACTIONS" not in os.environ
    else {"ENABLE_DOCKER_INTEGRATION_TESTS": "1"},
)

sql = Module(
    name="sql",
    dependencies=[catalyst],
    source_file_regexes=[
        "sql/core/",
        "python/pyspark/sql/worker/",  # analyze_udtf is invoked and tested in JVM
    ],
    sbt_test_goals=[
        "sql/test",
    ],
    # Same GitHub Actions Docker-integration switch as `catalyst` above.
    environ=None
    if "GITHUB_ACTIONS" not in os.environ
    else {"ENABLE_DOCKER_INTEGRATION_TESTS": "1"},
)

hive = Module(
    name="hive",
    dependencies=[sql],
    source_file_regexes=[
        "sql/hive/",
        "bin/spark-sql",
    ],
    build_profile_flags=[
        "-Phive",
    ],
    sbt_test_goals=[
        "hive/test",
    ],
    test_tags=["org.apache.spark.tags.ExtendedHiveTest"],
)

repl = Module(
    name="repl",
    dependencies=[hive],
    source_file_regexes=[
        "repl/",
    ],
    sbt_test_goals=[
        "repl/test",
    ],
)

hive_thriftserver = Module(
    name="hive-thriftserver",
    dependencies=[hive],
    source_file_regexes=[
        "sql/hive-thriftserver",
        "sbin/start-thriftserver.sh",
    ],
    build_profile_flags=[
        "-Phive-thriftserver",
    ],
    sbt_test_goals=[
        "hive-thriftserver/test",
    ],
)
# ---------------------------------------------------------------------------
# Connector and streaming JVM modules (connector/, graphx/, streaming/).
# ---------------------------------------------------------------------------

avro = Module(
    name="avro",
    dependencies=[sql],
    source_file_regexes=[
        "connector/avro",
    ],
    sbt_test_goals=[
        "avro/test",
    ],
)

sql_kafka = Module(
    name="sql-kafka-0-10",
    dependencies=[sql],
    source_file_regexes=[
        "connector/kafka-0-10-sql",
    ],
    sbt_test_goals=[
        "sql-kafka-0-10/test",
    ],
)

# Build-only module (no sbt test goals): enables the jvm-profiler profile.
profiler = Module(
    name="profiler",
    dependencies=[],
    build_profile_flags=["-Pjvm-profiler"],
    source_file_regexes=[
        "connector/profiler",
    ],
)

protobuf = Module(
    name="protobuf",
    dependencies=[sql],
    source_file_regexes=[
        "connector/protobuf",
    ],
    sbt_test_goals=[
        "protobuf/test",
    ],
)

graphx = Module(
    name="graphx",
    dependencies=[tags, core],
    source_file_regexes=[
        "graphx/",
    ],
    sbt_test_goals=["graphx/test"],
)

streaming = Module(
    name="streaming",
    dependencies=[tags, core],
    source_file_regexes=[
        "streaming",
    ],
    sbt_test_goals=[
        "streaming/test",
    ],
)

# Kinesis tests depend on the external Amazon Kinesis service, so they are
# disabled by default (ENABLE_KINESIS_TESTS=0 below); we should run these tests
# only when files in streaming_kinesis_asl are changed, so that if Kinesis
# experiences an outage, we don't fail other PRs.
# NOTE(review): an earlier comment here said not to set `dependencies` for this
# module, but `[tags, core]` is set — confirm which is intended.
streaming_kinesis_asl = Module(
    name="streaming-kinesis-asl",
    dependencies=[tags, core],
    source_file_regexes=[
        "connector/kinesis-asl/",
        "connector/kinesis-asl-assembly/",
    ],
    build_profile_flags=[
        "-Pkinesis-asl",
    ],
    environ={"ENABLE_KINESIS_TESTS": "0"},
    sbt_test_goals=[
        "streaming-kinesis-asl/test",
    ],
)

streaming_kafka_0_10 = Module(
    name="streaming-kafka-0-10",
    dependencies=[streaming, core],
    source_file_regexes=[
        # The ending "/" is necessary otherwise it will include "sql-kafka" codes
        "connector/kafka-0-10/",
        "connector/kafka-0-10-assembly",
        "connector/kafka-0-10-token-provider",
    ],
    sbt_test_goals=["streaming-kafka-0-10/test", "token-provider-kafka-0-10/test"],
)
# ---------------------------------------------------------------------------
# ML, Spark Connect, and examples JVM modules.
# ---------------------------------------------------------------------------

mllib_local = Module(
    name="mllib-local",
    dependencies=[tags, core],
    source_file_regexes=[
        "mllib-local",
    ],
    sbt_test_goals=[
        "mllib-local/test",
    ],
)

mllib = Module(
    name="mllib",
    dependencies=[mllib_local, streaming, sql],
    source_file_regexes=[
        "data/mllib/",
        "mllib/",
    ],
    sbt_test_goals=[
        "mllib/test",
    ],
)

connect = Module(
    name="connect",
    dependencies=[hive, avro, protobuf, mllib],
    source_file_regexes=[
        "sql/connect",
    ],
    sbt_test_goals=[
        "connect/test",
        "connect-client-jvm/test",
    ],
)

examples = Module(
    name="examples",
    dependencies=[graphx, mllib, streaming, hive],
    source_file_regexes=[
        "examples/",
    ],
    sbt_test_goals=[
        "examples/test",
    ],
)
# ---------------------------------------------------------------------------
# PySpark modules. `python_test_goals` lists the Python test modules that
# dev/run-tests executes for each module.
# ---------------------------------------------------------------------------

pyspark_core = Module(
    name="pyspark-core",
    dependencies=[core],
    # Negative lookahead: everything under python/ except the sub-packages
    # owned by the more specific pyspark-* modules defined below.
    source_file_regexes=["python/(?!pyspark/(ml|mllib|sql|streaming|pandas|resource|testing))"],
    python_test_goals=[
        # doctests
        "pyspark.conf",
        "pyspark.core.rdd",
        "pyspark.core.context",
        "pyspark.core.broadcast",
        "pyspark.accumulators",
        "pyspark.core.files",
        "pyspark.serializers",
        "pyspark.profiler",
        "pyspark.shuffle",
        "pyspark.taskcontext",
        "pyspark.util",
        # unittests
        "pyspark.tests.test_appsubmit",
        "pyspark.tests.test_broadcast",
        "pyspark.tests.test_conf",
        "pyspark.tests.test_context",
        "pyspark.tests.test_daemon",
        "pyspark.tests.test_install_spark",
        "pyspark.tests.test_join",
        "pyspark.tests.test_memory_profiler",
        "pyspark.tests.test_profiler",
        "pyspark.tests.test_rdd",
        "pyspark.tests.test_rddbarrier",
        "pyspark.tests.test_rddsampler",
        "pyspark.tests.test_readwrite",
        "pyspark.tests.test_serializers",
        "pyspark.tests.test_shuffle",
        "pyspark.tests.test_statcounter",
        "pyspark.tests.test_taskcontext",
        "pyspark.tests.test_util",
        "pyspark.tests.test_worker",
        "pyspark.tests.test_stage_sched",
    ],
)
# PySpark SQL: doctests for the public sql API plus the sql unittest suites.
pyspark_sql = Module(
    name="pyspark-sql",
    dependencies=[pyspark_core, hive, avro, protobuf],
    source_file_regexes=["python/pyspark/sql"],
    python_test_goals=[
        # doctests
        "pyspark.sql.types",
        "pyspark.sql.context",
        "pyspark.sql.session",
        "pyspark.sql.conf",
        "pyspark.sql.catalog",
        "pyspark.sql.classic.column",
        "pyspark.sql.classic.dataframe",
        "pyspark.sql.classic.window",
        "pyspark.sql.datasource",
        "pyspark.sql.group",
        "pyspark.sql.functions.builtin",
        "pyspark.sql.functions.partitioning",
        "pyspark.sql.merge",
        "pyspark.sql.readwriter",
        "pyspark.sql.streaming.query",
        "pyspark.sql.streaming.readwriter",
        "pyspark.sql.streaming.listener",
        "pyspark.sql.udf",
        "pyspark.sql.udtf",
        "pyspark.sql.avro.functions",
        "pyspark.sql.protobuf.functions",
        "pyspark.sql.pandas.conversion",
        "pyspark.sql.pandas.map_ops",
        "pyspark.sql.pandas.group_ops",
        "pyspark.sql.pandas.types",
        "pyspark.sql.pandas.serializers",
        "pyspark.sql.pandas.typehints",
        "pyspark.sql.pandas.utils",
        "pyspark.sql.observation",
        "pyspark.sql.tvf",
        # unittests
        "pyspark.sql.tests.test_artifact",
        "pyspark.sql.tests.test_catalog",
        "pyspark.sql.tests.test_column",
        "pyspark.sql.tests.test_conf",
        "pyspark.sql.tests.test_context",
        "pyspark.sql.tests.test_dataframe",
        "pyspark.sql.tests.test_collection",
        "pyspark.sql.tests.test_creation",
        "pyspark.sql.tests.test_listener",
        "pyspark.sql.tests.test_observation",
        "pyspark.sql.tests.test_repartition",
        "pyspark.sql.tests.test_stat",
        "pyspark.sql.tests.test_datasources",
        "pyspark.sql.tests.test_errors",
        "pyspark.sql.tests.test_functions",
        "pyspark.sql.tests.test_group",
        "pyspark.sql.tests.test_sql",
        "pyspark.sql.tests.arrow.test_arrow",
        "pyspark.sql.tests.arrow.test_arrow_map",
        "pyspark.sql.tests.arrow.test_arrow_cogrouped_map",
        "pyspark.sql.tests.arrow.test_arrow_grouped_map",
        "pyspark.sql.tests.arrow.test_arrow_python_udf",
        "pyspark.sql.tests.pandas.test_pandas_cogrouped_map",
        "pyspark.sql.tests.pandas.test_pandas_grouped_map",
        "pyspark.sql.tests.pandas.test_pandas_grouped_map_with_state",
        "pyspark.sql.tests.pandas.test_pandas_map",
        "pyspark.sql.tests.pandas.test_pandas_transform_with_state",
        "pyspark.sql.tests.pandas.test_pandas_udf",
        "pyspark.sql.tests.pandas.test_pandas_udf_grouped_agg",
        "pyspark.sql.tests.pandas.test_pandas_udf_scalar",
        "pyspark.sql.tests.pandas.test_pandas_udf_typehints",
        "pyspark.sql.tests.pandas.test_pandas_udf_typehints_with_future_annotations",
        "pyspark.sql.tests.pandas.test_pandas_udf_window",
        "pyspark.sql.tests.pandas.test_pandas_sqlmetrics",
        "pyspark.sql.tests.pandas.test_converter",
        "pyspark.sql.tests.test_python_datasource",
        "pyspark.sql.tests.test_python_streaming_datasource",
        "pyspark.sql.tests.test_readwriter",
        "pyspark.sql.tests.test_serde",
        "pyspark.sql.tests.test_session",
        "pyspark.sql.tests.streaming.test_streaming",
        "pyspark.sql.tests.streaming.test_streaming_foreach",
        "pyspark.sql.tests.streaming.test_streaming_foreach_batch",
        "pyspark.sql.tests.streaming.test_streaming_listener",
        "pyspark.sql.tests.test_subquery",
        "pyspark.sql.tests.test_types",
        "pyspark.sql.tests.test_udf",
        "pyspark.sql.tests.test_udf_profiler",
        "pyspark.sql.tests.test_udtf",
        "pyspark.sql.tests.test_tvf",
        "pyspark.sql.tests.test_utils",
        "pyspark.sql.tests.test_resources",
        "pyspark.sql.tests.plot.test_frame_plot",
        "pyspark.sql.tests.plot.test_frame_plot_plotly",
        "pyspark.sql.tests.test_connect_compatibility",
    ],
)
# PySpark testing utilities, resource profiles, and DStream streaming API.
pyspark_testing = Module(
    name="pyspark-testing",
    dependencies=[pyspark_core, pyspark_sql],
    source_file_regexes=["python/pyspark/testing"],
    python_test_goals=[
        # doctests
        "pyspark.testing.utils",
        "pyspark.testing.pandasutils",
    ],
)

pyspark_resource = Module(
    name="pyspark-resource",
    dependencies=[pyspark_core],
    source_file_regexes=["python/pyspark/resource"],
    python_test_goals=[
        # doctests
        "pyspark.resource.profile",
        # unittests
        "pyspark.resource.tests.test_resources",
        "pyspark.resource.tests.test_connect_resources",
    ],
)

pyspark_streaming = Module(
    name="pyspark-streaming",
    dependencies=[pyspark_core, streaming, streaming_kinesis_asl],
    source_file_regexes=["python/pyspark/streaming"],
    python_test_goals=[
        # doctests
        "pyspark.streaming.util",
        # unittests
        "pyspark.streaming.tests.test_context",
        "pyspark.streaming.tests.test_dstream",
        "pyspark.streaming.tests.test_kinesis",
        "pyspark.streaming.tests.test_listener",
    ],
)
# PySpark MLlib (RDD-based API) and ML (DataFrame-based API) test modules.
# Both are skipped under PyPy (see excluded_python_implementations).
pyspark_mllib = Module(
    name="pyspark-mllib",
    dependencies=[pyspark_core, pyspark_streaming, pyspark_sql, mllib],
    source_file_regexes=["python/pyspark/mllib"],
    python_test_goals=[
        # doctests
        "pyspark.mllib.classification",
        "pyspark.mllib.clustering",
        "pyspark.mllib.evaluation",
        "pyspark.mllib.feature",
        "pyspark.mllib.fpm",
        "pyspark.mllib.linalg.__init__",
        "pyspark.mllib.linalg.distributed",
        "pyspark.mllib.random",
        "pyspark.mllib.recommendation",
        "pyspark.mllib.regression",
        "pyspark.mllib.stat._statistics",
        "pyspark.mllib.stat.KernelDensity",
        "pyspark.mllib.tree",
        "pyspark.mllib.util",
        # unittests
        "pyspark.mllib.tests.test_algorithms",
        "pyspark.mllib.tests.test_feature",
        "pyspark.mllib.tests.test_linalg",
        "pyspark.mllib.tests.test_stat",
        "pyspark.mllib.tests.test_streaming_algorithms",
        "pyspark.mllib.tests.test_util",
    ],
    excluded_python_implementations=[
        "PyPy"  # Skip these tests under PyPy since they require numpy and it isn't available there
    ],
)

pyspark_ml = Module(
    name="pyspark-ml",
    dependencies=[pyspark_core, pyspark_mllib],
    source_file_regexes=["python/pyspark/ml/"],
    python_test_goals=[
        # doctests
        "pyspark.ml.classification",
        "pyspark.ml.clustering",
        "pyspark.ml.evaluation",
        "pyspark.ml.feature",
        "pyspark.ml.fpm",
        "pyspark.ml.functions",
        "pyspark.ml.image",
        "pyspark.ml.linalg.__init__",
        "pyspark.ml.recommendation",
        "pyspark.ml.regression",
        "pyspark.ml.stat",
        "pyspark.ml.tuning",
        # unittests
        "pyspark.ml.tests.test_algorithms",
        "pyspark.ml.tests.test_als",
        "pyspark.ml.tests.test_fpm",
        "pyspark.ml.tests.test_base",
        "pyspark.ml.tests.test_evaluation",
        "pyspark.ml.tests.test_feature",
        "pyspark.ml.tests.test_functions",
        "pyspark.ml.tests.test_image",
        "pyspark.ml.tests.test_linalg",
        "pyspark.ml.tests.test_model_cache",
        "pyspark.ml.tests.test_param",
        "pyspark.ml.tests.test_persistence",
        "pyspark.ml.tests.test_pipeline",
        "pyspark.ml.tests.test_tuning",
        "pyspark.ml.tests.test_ovr",
        "pyspark.ml.tests.test_stat",
        "pyspark.ml.tests.test_training_summary",
        "pyspark.ml.tests.tuning.test_tuning",
        "pyspark.ml.tests.tuning.test_cv_io_basic",
        "pyspark.ml.tests.tuning.test_cv_io_nested",
        "pyspark.ml.tests.tuning.test_cv_io_pipeline",
        "pyspark.ml.tests.tuning.test_tvs_io_basic",
        "pyspark.ml.tests.tuning.test_tvs_io_nested",
        "pyspark.ml.tests.tuning.test_tvs_io_pipeline",
        "pyspark.ml.tests.test_util",
        "pyspark.ml.tests.test_wrapper",
        "pyspark.ml.torch.tests.test_distributor",
        "pyspark.ml.torch.tests.test_log_communication",
        "pyspark.ml.torch.tests.test_data_loader",
        "pyspark.ml.deepspeed.tests.test_deepspeed_distributor",
        "pyspark.ml.tests.connect.test_legacy_mode_summarizer",
        "pyspark.ml.tests.connect.test_legacy_mode_evaluation",
        "pyspark.ml.tests.connect.test_legacy_mode_feature",
        "pyspark.ml.tests.connect.test_legacy_mode_classification",
        "pyspark.ml.tests.connect.test_legacy_mode_pipeline",
        "pyspark.ml.tests.connect.test_legacy_mode_tuning",
        "pyspark.ml.tests.test_classification",
        "pyspark.ml.tests.test_regression",
        "pyspark.ml.tests.test_clustering",
    ],
    excluded_python_implementations=[
        "PyPy"  # Skip these tests under PyPy since they require numpy and it isn't available there
    ],
)
# Pandas API on Spark. The test goals for python/pyspark/pandas/ are split
# across two modules; this one holds the faster suites, while the heavier
# ones live in pyspark-pandas-slow below.
pyspark_pandas = Module(
    name="pyspark-pandas",
    dependencies=[pyspark_core, pyspark_sql],
    source_file_regexes=["python/pyspark/pandas/"],
    python_test_goals=[
        # doctests
        "pyspark.pandas.accessors",
        "pyspark.pandas.base",
        "pyspark.pandas.categorical",
        "pyspark.pandas.config",
        "pyspark.pandas.datetimes",
        "pyspark.pandas.exceptions",
        "pyspark.pandas.extensions",
        "pyspark.pandas.groupby",
        "pyspark.pandas.indexing",
        "pyspark.pandas.internal",
        "pyspark.pandas.mlflow",
        "pyspark.pandas.namespace",
        "pyspark.pandas.numpy_compat",
        "pyspark.pandas.sql_processor",
        "pyspark.pandas.sql_formatter",
        "pyspark.pandas.strings",
        "pyspark.pandas.supported_api_gen",
        "pyspark.pandas.utils",
        "pyspark.pandas.window",
        "pyspark.pandas.indexes.base",
        "pyspark.pandas.indexes.category",
        "pyspark.pandas.indexes.datetimes",
        "pyspark.pandas.indexes.timedelta",
        "pyspark.pandas.indexes.multi",
        "pyspark.pandas.spark.accessors",
        "pyspark.pandas.spark.utils",
        "pyspark.pandas.typedef.typehints",
        # unittests
        "pyspark.pandas.tests.test_categorical",
        "pyspark.pandas.tests.test_config",
        "pyspark.pandas.tests.test_extension",
        "pyspark.pandas.tests.test_frame_spark",
        "pyspark.pandas.tests.test_generic_functions",
        "pyspark.pandas.tests.test_indexops_spark",
        "pyspark.pandas.tests.test_internal",
        "pyspark.pandas.tests.test_namespace",
        "pyspark.pandas.tests.test_numpy_compat",
        "pyspark.pandas.tests.test_repr",
        "pyspark.pandas.tests.test_spark_functions",
        "pyspark.pandas.tests.test_scalars",
        "pyspark.pandas.tests.test_sql",
        "pyspark.pandas.tests.test_typedef",
        "pyspark.pandas.tests.test_utils",
        "pyspark.pandas.tests.computation.test_any_all",
        "pyspark.pandas.tests.computation.test_apply_func",
        "pyspark.pandas.tests.computation.test_binary_ops",
        "pyspark.pandas.tests.computation.test_combine",
        "pyspark.pandas.tests.computation.test_compute",
        "pyspark.pandas.tests.computation.test_corr",
        "pyspark.pandas.tests.computation.test_corrwith",
        "pyspark.pandas.tests.computation.test_cov",
        "pyspark.pandas.tests.computation.test_cumulative",
        "pyspark.pandas.tests.computation.test_describe",
        "pyspark.pandas.tests.computation.test_eval",
        "pyspark.pandas.tests.computation.test_melt",
        "pyspark.pandas.tests.computation.test_missing_data",
        "pyspark.pandas.tests.computation.test_pivot",
        "pyspark.pandas.tests.computation.test_pivot_table",
        "pyspark.pandas.tests.computation.test_pivot_table_adv",
        "pyspark.pandas.tests.computation.test_pivot_table_multi_idx",
        "pyspark.pandas.tests.computation.test_pivot_table_multi_idx_adv",
        "pyspark.pandas.tests.computation.test_stats",
        "pyspark.pandas.tests.data_type_ops.test_as_type",
        "pyspark.pandas.tests.data_type_ops.test_base",
        "pyspark.pandas.tests.data_type_ops.test_binary_ops",
        "pyspark.pandas.tests.data_type_ops.test_boolean_ops",
        "pyspark.pandas.tests.data_type_ops.test_categorical_ops",
        "pyspark.pandas.tests.data_type_ops.test_complex_ops",
        "pyspark.pandas.tests.data_type_ops.test_date_ops",
        "pyspark.pandas.tests.data_type_ops.test_datetime_ops",
        "pyspark.pandas.tests.data_type_ops.test_null_ops",
        "pyspark.pandas.tests.data_type_ops.test_num_ops",
        "pyspark.pandas.tests.data_type_ops.test_num_arithmetic",
        "pyspark.pandas.tests.data_type_ops.test_num_mod",
        "pyspark.pandas.tests.data_type_ops.test_num_mul_div",
        "pyspark.pandas.tests.data_type_ops.test_num_pow",
        "pyspark.pandas.tests.data_type_ops.test_num_reverse",
        "pyspark.pandas.tests.data_type_ops.test_string_ops",
        "pyspark.pandas.tests.data_type_ops.test_udt_ops",
        "pyspark.pandas.tests.data_type_ops.test_timedelta_ops",
        "pyspark.pandas.tests.plot.test_frame_plot",
        "pyspark.pandas.tests.plot.test_frame_plot_matplotlib",
        "pyspark.pandas.tests.plot.test_frame_plot_plotly",
        "pyspark.pandas.tests.plot.test_series_plot",
        "pyspark.pandas.tests.plot.test_series_plot_matplotlib",
        "pyspark.pandas.tests.plot.test_series_plot_plotly",
        "pyspark.pandas.tests.frame.test_interpolate",
        "pyspark.pandas.tests.frame.test_interpolate_error",
        "pyspark.pandas.tests.frame.test_attrs",
        "pyspark.pandas.tests.frame.test_axis",
        "pyspark.pandas.tests.frame.test_constructor",
        "pyspark.pandas.tests.frame.test_conversion",
        "pyspark.pandas.tests.frame.test_reindexing",
        "pyspark.pandas.tests.frame.test_reshaping",
        "pyspark.pandas.tests.frame.test_spark",
        "pyspark.pandas.tests.frame.test_take",
        "pyspark.pandas.tests.frame.test_take_adv",
        "pyspark.pandas.tests.frame.test_time_series",
        "pyspark.pandas.tests.frame.test_truncate",
        "pyspark.pandas.tests.series.test_interpolate",
        "pyspark.pandas.tests.resample.test_on",
        "pyspark.pandas.tests.resample.test_error",
        "pyspark.pandas.tests.resample.test_frame",
        "pyspark.pandas.tests.resample.test_missing",
        "pyspark.pandas.tests.resample.test_series",
        "pyspark.pandas.tests.resample.test_timezone",
        "pyspark.pandas.tests.reshape.test_get_dummies",
        "pyspark.pandas.tests.reshape.test_get_dummies_kwargs",
        "pyspark.pandas.tests.reshape.test_get_dummies_multiindex",
        "pyspark.pandas.tests.reshape.test_get_dummies_object",
        "pyspark.pandas.tests.reshape.test_get_dummies_prefix",
        "pyspark.pandas.tests.reshape.test_merge_asof",
        "pyspark.pandas.tests.window.test_expanding",
        "pyspark.pandas.tests.window.test_expanding_adv",
        "pyspark.pandas.tests.window.test_expanding_error",
        "pyspark.pandas.tests.window.test_groupby_expanding",
        "pyspark.pandas.tests.window.test_groupby_expanding_adv",
        "pyspark.pandas.tests.window.test_ewm_error",
        "pyspark.pandas.tests.window.test_ewm_mean",
        "pyspark.pandas.tests.window.test_groupby_ewm_mean",
        "pyspark.pandas.tests.window.test_missing",
        "pyspark.pandas.tests.window.test_rolling",
        "pyspark.pandas.tests.window.test_rolling_adv",
        "pyspark.pandas.tests.window.test_rolling_count",
        "pyspark.pandas.tests.window.test_rolling_error",
        "pyspark.pandas.tests.window.test_groupby_rolling",
        "pyspark.pandas.tests.window.test_groupby_rolling_adv",
        "pyspark.pandas.tests.window.test_groupby_rolling_count",
        "pyspark.pandas.tests.series.test_datetime",
        "pyspark.pandas.tests.series.test_string_ops_adv",
        "pyspark.pandas.tests.series.test_string_ops_basic",
        "pyspark.pandas.tests.series.test_all_any",
        "pyspark.pandas.tests.series.test_arg_ops",
        "pyspark.pandas.tests.series.test_as_of",
        "pyspark.pandas.tests.series.test_as_type",
        "pyspark.pandas.tests.series.test_compute",
        "pyspark.pandas.tests.series.test_conversion",
        "pyspark.pandas.tests.series.test_cumulative",
        "pyspark.pandas.tests.series.test_index",
        "pyspark.pandas.tests.series.test_missing_data",
        "pyspark.pandas.tests.series.test_series",
        "pyspark.pandas.tests.series.test_sort",
        "pyspark.pandas.tests.series.test_stat",
        "pyspark.pandas.tests.io.test_io",
        "pyspark.pandas.tests.io.test_csv",
        "pyspark.pandas.tests.io.test_feather",
        "pyspark.pandas.tests.io.test_stata",
        "pyspark.pandas.tests.io.test_dataframe_conversion",
        "pyspark.pandas.tests.io.test_dataframe_spark_io",
        "pyspark.pandas.tests.io.test_series_conversion",
        # fallback
        "pyspark.pandas.tests.frame.test_asfreq",
        "pyspark.pandas.tests.frame.test_asof",
    ],
    excluded_python_implementations=[
        "PyPy"  # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and
        # they aren't available there
    ],
)
# The slower half of the Pandas-on-Spark test suites. Watches the same
# source tree ("python/pyspark/pandas/") as pyspark-pandas above, so a change
# there triggers both modules.
pyspark_pandas_slow = Module(
    name="pyspark-pandas-slow",
    dependencies=[pyspark_core, pyspark_sql],
    source_file_regexes=["python/pyspark/pandas/"],
    python_test_goals=[
        # doctests
        "pyspark.pandas.frame",
        "pyspark.pandas.generic",
        "pyspark.pandas.series",
        # unittests
        "pyspark.pandas.tests.indexes.test_default",
        "pyspark.pandas.tests.indexes.test_category",
        "pyspark.pandas.tests.indexes.test_timedelta",
        "pyspark.pandas.tests.indexes.test_basic",
        "pyspark.pandas.tests.indexes.test_getattr",
        "pyspark.pandas.tests.indexes.test_name",
        "pyspark.pandas.tests.indexes.test_conversion",
        "pyspark.pandas.tests.indexes.test_drop",
        "pyspark.pandas.tests.indexes.test_level",
        "pyspark.pandas.tests.indexes.test_missing",
        "pyspark.pandas.tests.indexes.test_repeat",
        "pyspark.pandas.tests.indexes.test_sort",
        "pyspark.pandas.tests.indexes.test_stat",
        "pyspark.pandas.tests.indexes.test_symmetric_diff",
        "pyspark.pandas.tests.indexes.test_take",
        "pyspark.pandas.tests.indexes.test_unique",
        "pyspark.pandas.tests.indexes.test_asof",
        "pyspark.pandas.tests.indexes.test_astype",
        "pyspark.pandas.tests.indexes.test_delete",
        "pyspark.pandas.tests.indexes.test_diff",
        "pyspark.pandas.tests.indexes.test_insert",
        "pyspark.pandas.tests.indexes.test_map",
        "pyspark.pandas.tests.indexes.test_append",
        "pyspark.pandas.tests.indexes.test_intersection",
        "pyspark.pandas.tests.indexes.test_monotonic",
        "pyspark.pandas.tests.indexes.test_union",
        "pyspark.pandas.tests.indexes.test_datetime",
        "pyspark.pandas.tests.indexes.test_datetime_at",
        "pyspark.pandas.tests.indexes.test_datetime_between",
        "pyspark.pandas.tests.indexes.test_datetime_ceil",
        "pyspark.pandas.tests.indexes.test_datetime_floor",
        "pyspark.pandas.tests.indexes.test_datetime_iso",
        "pyspark.pandas.tests.indexes.test_datetime_map",
        "pyspark.pandas.tests.indexes.test_datetime_property",
        "pyspark.pandas.tests.indexes.test_datetime_round",
        "pyspark.pandas.tests.indexes.test_align",
        "pyspark.pandas.tests.indexes.test_indexing",
        "pyspark.pandas.tests.indexes.test_indexing_adv",
        "pyspark.pandas.tests.indexes.test_indexing_basic",
        "pyspark.pandas.tests.indexes.test_indexing_iloc",
        "pyspark.pandas.tests.indexes.test_indexing_loc",
        "pyspark.pandas.tests.indexes.test_indexing_loc_2d",
        "pyspark.pandas.tests.indexes.test_indexing_loc_multi_idx",
        "pyspark.pandas.tests.indexes.test_reindex",
        "pyspark.pandas.tests.indexes.test_rename",
        "pyspark.pandas.tests.indexes.test_reset_index",
        "pyspark.pandas.tests.groupby.test_aggregate",
        "pyspark.pandas.tests.groupby.test_apply_func",
        "pyspark.pandas.tests.groupby.test_corr",
        "pyspark.pandas.tests.groupby.test_cumulative",
        "pyspark.pandas.tests.groupby.test_describe",
        "pyspark.pandas.tests.groupby.test_groupby",
        "pyspark.pandas.tests.groupby.test_grouping",
        "pyspark.pandas.tests.groupby.test_head_tail",
        "pyspark.pandas.tests.groupby.test_index",
        "pyspark.pandas.tests.groupby.test_missing",
        "pyspark.pandas.tests.groupby.test_missing_data",
        "pyspark.pandas.tests.groupby.test_nlargest_nsmallest",
        "pyspark.pandas.tests.groupby.test_raises",
        "pyspark.pandas.tests.groupby.test_rank",
        "pyspark.pandas.tests.groupby.test_size",
        "pyspark.pandas.tests.groupby.test_split_apply",
        "pyspark.pandas.tests.groupby.test_split_apply_count",
        "pyspark.pandas.tests.groupby.test_split_apply_first",
        "pyspark.pandas.tests.groupby.test_split_apply_last",
        "pyspark.pandas.tests.groupby.test_split_apply_min_max",
        "pyspark.pandas.tests.groupby.test_split_apply_skew",
        "pyspark.pandas.tests.groupby.test_split_apply_std",
        "pyspark.pandas.tests.groupby.test_split_apply_var",
        "pyspark.pandas.tests.groupby.test_stat",
        "pyspark.pandas.tests.groupby.test_stat_adv",
        "pyspark.pandas.tests.groupby.test_stat_ddof",
        "pyspark.pandas.tests.groupby.test_stat_func",
        "pyspark.pandas.tests.groupby.test_stat_prod",
        "pyspark.pandas.tests.groupby.test_value_counts",
        "pyspark.pandas.tests.diff_frames_ops.test_align",
        "pyspark.pandas.tests.diff_frames_ops.test_arithmetic",
        "pyspark.pandas.tests.diff_frames_ops.test_arithmetic_ext",
        "pyspark.pandas.tests.diff_frames_ops.test_arithmetic_ext_float",
        "pyspark.pandas.tests.diff_frames_ops.test_arithmetic_chain",
        "pyspark.pandas.tests.diff_frames_ops.test_arithmetic_chain_ext",
        "pyspark.pandas.tests.diff_frames_ops.test_arithmetic_chain_ext_float",
        "pyspark.pandas.tests.diff_frames_ops.test_assign_frame",
        "pyspark.pandas.tests.diff_frames_ops.test_assign_series",
        "pyspark.pandas.tests.diff_frames_ops.test_basic",
        "pyspark.pandas.tests.diff_frames_ops.test_bitwise",
        "pyspark.pandas.tests.diff_frames_ops.test_combine_first",
        "pyspark.pandas.tests.diff_frames_ops.test_compare_series",
        "pyspark.pandas.tests.diff_frames_ops.test_concat_inner",
        "pyspark.pandas.tests.diff_frames_ops.test_concat_outer",
        "pyspark.pandas.tests.diff_frames_ops.test_basic_slow",
        "pyspark.pandas.tests.diff_frames_ops.test_cov",
        "pyspark.pandas.tests.diff_frames_ops.test_corrwith",
        "pyspark.pandas.tests.diff_frames_ops.test_dot_frame",
        "pyspark.pandas.tests.diff_frames_ops.test_dot_series",
        "pyspark.pandas.tests.diff_frames_ops.test_error",
        "pyspark.pandas.tests.diff_frames_ops.test_index",
        "pyspark.pandas.tests.diff_frames_ops.test_series",
        "pyspark.pandas.tests.diff_frames_ops.test_setitem_frame",
        "pyspark.pandas.tests.diff_frames_ops.test_setitem_series",
        "pyspark.pandas.tests.diff_frames_ops.test_groupby",
        "pyspark.pandas.tests.diff_frames_ops.test_groupby_aggregate",
        "pyspark.pandas.tests.diff_frames_ops.test_groupby_apply",
        "pyspark.pandas.tests.diff_frames_ops.test_groupby_cumulative",
        "pyspark.pandas.tests.diff_frames_ops.test_groupby_diff",
        "pyspark.pandas.tests.diff_frames_ops.test_groupby_diff_len",
        "pyspark.pandas.tests.diff_frames_ops.test_groupby_fillna",
        "pyspark.pandas.tests.diff_frames_ops.test_groupby_filter",
        "pyspark.pandas.tests.diff_frames_ops.test_groupby_shift",
        "pyspark.pandas.tests.diff_frames_ops.test_groupby_split_apply_combine",
        "pyspark.pandas.tests.diff_frames_ops.test_groupby_transform",
        "pyspark.pandas.tests.diff_frames_ops.test_groupby_expanding",
        "pyspark.pandas.tests.diff_frames_ops.test_groupby_expanding_adv",
        "pyspark.pandas.tests.diff_frames_ops.test_groupby_expanding_count",
        "pyspark.pandas.tests.diff_frames_ops.test_groupby_rolling",
        "pyspark.pandas.tests.diff_frames_ops.test_groupby_rolling_adv",
        "pyspark.pandas.tests.diff_frames_ops.test_groupby_rolling_count",
    ],
    excluded_python_implementations=[
        "PyPy"  # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and
        # they aren't available there
    ],
)
# Spark Connect Python client: doctests for the pyspark.sql.connect modules
# plus the dedicated and parity unittests under pyspark.sql.tests.connect.
pyspark_connect = Module(
    name="pyspark-connect",
    dependencies=[pyspark_sql, connect],
    source_file_regexes=[
        "python/pyspark/sql/connect",
    ],
    python_test_goals=[
        # sql doctests
        "pyspark.sql.connect.catalog",
        "pyspark.sql.connect.conf",
        "pyspark.sql.connect.group",
        "pyspark.sql.connect.session",
        "pyspark.sql.connect.window",
        "pyspark.sql.connect.column",
        "pyspark.sql.connect.merge",
        "pyspark.sql.connect.readwriter",
        "pyspark.sql.connect.dataframe",
        "pyspark.sql.connect.functions.builtin",
        "pyspark.sql.connect.functions.partitioning",
        "pyspark.sql.connect.observation",
        "pyspark.sql.connect.avro.functions",
        "pyspark.sql.connect.protobuf.functions",
        "pyspark.sql.connect.streaming.readwriter",
        "pyspark.sql.connect.streaming.query",
        "pyspark.sql.connect.tvf",
        # sql unittests
        "pyspark.sql.tests.connect.test_connect_plan",
        "pyspark.sql.tests.connect.test_connect_basic",
        "pyspark.sql.tests.connect.test_connect_dataframe_property",
        "pyspark.sql.tests.connect.test_connect_channel",
        "pyspark.sql.tests.connect.test_connect_error",
        "pyspark.sql.tests.connect.test_connect_function",
        "pyspark.sql.tests.connect.test_connect_collection",
        "pyspark.sql.tests.connect.test_connect_column",
        "pyspark.sql.tests.connect.test_connect_creation",
        "pyspark.sql.tests.connect.test_connect_readwriter",
        "pyspark.sql.tests.connect.test_connect_retry",
        "pyspark.sql.tests.connect.test_connect_session",
        "pyspark.sql.tests.connect.test_connect_stat",
        "pyspark.sql.tests.connect.test_parity_datasources",
        "pyspark.sql.tests.connect.test_parity_errors",
        "pyspark.sql.tests.connect.test_parity_catalog",
        "pyspark.sql.tests.connect.test_parity_conf",
        "pyspark.sql.tests.connect.test_parity_serde",
        "pyspark.sql.tests.connect.test_parity_functions",
        "pyspark.sql.tests.connect.test_parity_group",
        "pyspark.sql.tests.connect.test_parity_sql",
        "pyspark.sql.tests.connect.test_parity_dataframe",
        "pyspark.sql.tests.connect.test_parity_collection",
        "pyspark.sql.tests.connect.test_parity_creation",
        "pyspark.sql.tests.connect.test_parity_observation",
        "pyspark.sql.tests.connect.test_parity_repartition",
        "pyspark.sql.tests.connect.test_parity_stat",
        "pyspark.sql.tests.connect.test_parity_subquery",
        "pyspark.sql.tests.connect.test_parity_types",
        "pyspark.sql.tests.connect.test_parity_column",
        "pyspark.sql.tests.connect.test_parity_readwriter",
        "pyspark.sql.tests.connect.test_parity_udf",
        "pyspark.sql.tests.connect.test_parity_udf_profiler",
        "pyspark.sql.tests.connect.test_parity_memory_profiler",
        "pyspark.sql.tests.connect.test_parity_udtf",
        "pyspark.sql.tests.connect.test_parity_tvf",
        "pyspark.sql.tests.connect.test_parity_python_datasource",
        "pyspark.sql.tests.connect.test_parity_python_streaming_datasource",
        "pyspark.sql.tests.connect.test_parity_frame_plot",
        "pyspark.sql.tests.connect.test_parity_frame_plot_plotly",
        "pyspark.sql.tests.connect.test_utils",
        "pyspark.sql.tests.connect.client.test_artifact",
        "pyspark.sql.tests.connect.client.test_artifact_localcluster",
        "pyspark.sql.tests.connect.client.test_client",
        "pyspark.sql.tests.connect.client.test_reattach",
        "pyspark.sql.tests.connect.streaming.test_parity_streaming",
        "pyspark.sql.tests.connect.streaming.test_parity_listener",
        "pyspark.sql.tests.connect.streaming.test_parity_foreach",
        "pyspark.sql.tests.connect.streaming.test_parity_foreach_batch",
        "pyspark.sql.tests.connect.test_resources",
        "pyspark.sql.tests.connect.shell.test_progress",
        "pyspark.sql.tests.connect.test_df_debug",
        "pyspark.sql.tests.connect.arrow.test_parity_arrow",
        "pyspark.sql.tests.connect.arrow.test_parity_arrow_map",
        "pyspark.sql.tests.connect.arrow.test_parity_arrow_grouped_map",
        "pyspark.sql.tests.connect.arrow.test_parity_arrow_cogrouped_map",
        "pyspark.sql.tests.connect.arrow.test_parity_arrow_python_udf",
        "pyspark.sql.tests.connect.pandas.test_parity_pandas_map",
        "pyspark.sql.tests.connect.pandas.test_parity_pandas_grouped_map",
        "pyspark.sql.tests.connect.pandas.test_parity_pandas_grouped_map_with_state",
        "pyspark.sql.tests.connect.pandas.test_parity_pandas_cogrouped_map",
        "pyspark.sql.tests.connect.pandas.test_parity_pandas_udf",
        "pyspark.sql.tests.connect.pandas.test_parity_pandas_udf_scalar",
        "pyspark.sql.tests.connect.pandas.test_parity_pandas_udf_grouped_agg",
        "pyspark.sql.tests.connect.pandas.test_parity_pandas_udf_window",
        "pyspark.sql.tests.connect.pandas.test_parity_pandas_transform_with_state",
    ],
    excluded_python_implementations=[
        "PyPy"  # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and
        # they aren't available there
    ],
)
# MLlib on Spark Connect: depends on both the Connect client module and
# pyspark-ml; runs the ml.connect doctests and the connect-specific ml tests.
pyspark_ml_connect = Module(
    name="pyspark-ml-connect",
    dependencies=[pyspark_connect, pyspark_ml],
    source_file_regexes=[
        "python/pyspark/ml/connect",
    ],
    python_test_goals=[
        # ml doctests
        "pyspark.ml.connect.functions",
        # ml unittests
        "pyspark.ml.tests.connect.test_connect_cache",
        "pyspark.ml.tests.connect.test_connect_function",
        "pyspark.ml.tests.connect.test_parity_torch_distributor",
        "pyspark.ml.tests.connect.test_parity_torch_data_loader",
        "pyspark.ml.tests.connect.test_connect_summarizer",
        "pyspark.ml.tests.connect.test_connect_evaluation",
        "pyspark.ml.tests.connect.test_connect_feature",
        "pyspark.ml.tests.connect.test_connect_classification",
        "pyspark.ml.tests.connect.test_connect_pipeline",
        "pyspark.ml.tests.connect.test_connect_tuning",
        "pyspark.ml.tests.connect.test_parity_als",
        "pyspark.ml.tests.connect.test_parity_fpm",
        "pyspark.ml.tests.connect.test_parity_classification",
        "pyspark.ml.tests.connect.test_parity_regression",
        "pyspark.ml.tests.connect.test_parity_clustering",
        "pyspark.ml.tests.connect.test_parity_evaluation",
        "pyspark.ml.tests.connect.test_parity_feature",
        "pyspark.ml.tests.connect.test_parity_functions",
        "pyspark.ml.tests.connect.test_parity_pipeline",
        "pyspark.ml.tests.connect.test_parity_tuning",
        "pyspark.ml.tests.connect.test_parity_ovr",
        "pyspark.ml.tests.connect.test_parity_stat",
    ],
    excluded_python_implementations=[
        "PyPy"  # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and
        # they aren't available there
    ],
)
# pandas-on-Spark tests run against Spark Connect, slice 0 of 4.  The four
# pyspark-pandas-connect-part* modules share the same dependencies and source
# regexes and differ only in which test goals they run — presumably to keep
# CI job durations balanced (TODO confirm split rationale).
pyspark_pandas_connect_part0 = Module(
    name="pyspark-pandas-connect-part0",
    dependencies=[pyspark_connect, pyspark_pandas, pyspark_pandas_slow],
    source_file_regexes=[
        "python/pyspark/pandas",
    ],
    python_test_goals=[
        # unittests dedicated for Spark Connect
        "pyspark.pandas.tests.connect.test_connect_plotting",
        # pandas-on-Spark unittests
        "pyspark.pandas.tests.connect.test_parity_categorical",
        "pyspark.pandas.tests.connect.test_parity_config",
        "pyspark.pandas.tests.connect.test_parity_extension",
        "pyspark.pandas.tests.connect.test_parity_frame_spark",
        "pyspark.pandas.tests.connect.test_parity_generic_functions",
        "pyspark.pandas.tests.connect.test_parity_indexops_spark",
        "pyspark.pandas.tests.connect.test_parity_internal",
        "pyspark.pandas.tests.connect.test_parity_namespace",
        "pyspark.pandas.tests.connect.test_parity_numpy_compat",
        "pyspark.pandas.tests.connect.test_parity_repr",
        "pyspark.pandas.tests.connect.test_parity_scalars",
        "pyspark.pandas.tests.connect.test_parity_spark_functions",
        "pyspark.pandas.tests.connect.test_parity_sql",
        "pyspark.pandas.tests.connect.test_parity_typedef",
        "pyspark.pandas.tests.connect.test_parity_utils",
        "pyspark.pandas.tests.connect.data_type_ops.test_parity_as_type",
        "pyspark.pandas.tests.connect.data_type_ops.test_parity_base",
        "pyspark.pandas.tests.connect.data_type_ops.test_parity_binary_ops",
        "pyspark.pandas.tests.connect.data_type_ops.test_parity_boolean_ops",
        "pyspark.pandas.tests.connect.data_type_ops.test_parity_categorical_ops",
        "pyspark.pandas.tests.connect.data_type_ops.test_parity_complex_ops",
        "pyspark.pandas.tests.connect.data_type_ops.test_parity_date_ops",
        "pyspark.pandas.tests.connect.data_type_ops.test_parity_datetime_ops",
        "pyspark.pandas.tests.connect.data_type_ops.test_parity_null_ops",
        "pyspark.pandas.tests.connect.data_type_ops.test_parity_num_ops",
        "pyspark.pandas.tests.connect.data_type_ops.test_parity_num_reverse",
        "pyspark.pandas.tests.connect.data_type_ops.test_parity_string_ops",
        "pyspark.pandas.tests.connect.data_type_ops.test_parity_udt_ops",
        "pyspark.pandas.tests.connect.data_type_ops.test_parity_timedelta_ops",
        "pyspark.pandas.tests.connect.plot.test_parity_frame_plot",
        "pyspark.pandas.tests.connect.plot.test_parity_frame_plot_matplotlib",
        "pyspark.pandas.tests.connect.plot.test_parity_frame_plot_plotly",
        "pyspark.pandas.tests.connect.plot.test_parity_series_plot",
        "pyspark.pandas.tests.connect.plot.test_parity_series_plot_matplotlib",
        "pyspark.pandas.tests.connect.plot.test_parity_series_plot_plotly",
        "pyspark.pandas.tests.connect.indexes.test_parity_default",
        "pyspark.pandas.tests.connect.indexes.test_parity_category",
        "pyspark.pandas.tests.connect.indexes.test_parity_timedelta",
        "pyspark.pandas.tests.connect.indexes.test_parity_basic",
        "pyspark.pandas.tests.connect.indexes.test_parity_getattr",
        "pyspark.pandas.tests.connect.indexes.test_parity_name",
        "pyspark.pandas.tests.connect.indexes.test_parity_conversion",
        "pyspark.pandas.tests.connect.indexes.test_parity_drop",
        "pyspark.pandas.tests.connect.indexes.test_parity_level",
        "pyspark.pandas.tests.connect.indexes.test_parity_missing",
        "pyspark.pandas.tests.connect.indexes.test_parity_repeat",
        "pyspark.pandas.tests.connect.indexes.test_parity_sort",
        "pyspark.pandas.tests.connect.indexes.test_parity_stat",
        "pyspark.pandas.tests.connect.indexes.test_parity_symmetric_diff",
        "pyspark.pandas.tests.connect.indexes.test_parity_take",
        "pyspark.pandas.tests.connect.indexes.test_parity_unique",
        "pyspark.pandas.tests.connect.indexes.test_parity_asof",
        "pyspark.pandas.tests.connect.indexes.test_parity_astype",
        "pyspark.pandas.tests.connect.indexes.test_parity_delete",
        "pyspark.pandas.tests.connect.indexes.test_parity_diff",
        "pyspark.pandas.tests.connect.indexes.test_parity_insert",
        "pyspark.pandas.tests.connect.indexes.test_parity_map",
        "pyspark.pandas.tests.connect.indexes.test_parity_align",
        "pyspark.pandas.tests.connect.indexes.test_parity_indexing",
        "pyspark.pandas.tests.connect.indexes.test_parity_indexing_adv",
        "pyspark.pandas.tests.connect.indexes.test_parity_indexing_basic",
        "pyspark.pandas.tests.connect.indexes.test_parity_indexing_iloc",
        "pyspark.pandas.tests.connect.indexes.test_parity_indexing_loc",
        "pyspark.pandas.tests.connect.indexes.test_parity_indexing_loc_2d",
        "pyspark.pandas.tests.connect.indexes.test_parity_indexing_loc_multi_idx",
        "pyspark.pandas.tests.connect.indexes.test_parity_reindex",
        "pyspark.pandas.tests.connect.indexes.test_parity_rename",
        "pyspark.pandas.tests.connect.indexes.test_parity_reset_index",
        "pyspark.pandas.tests.connect.indexes.test_parity_datetime",
        "pyspark.pandas.tests.connect.indexes.test_parity_datetime_at",
        "pyspark.pandas.tests.connect.indexes.test_parity_datetime_between",
        "pyspark.pandas.tests.connect.computation.test_parity_any_all",
        "pyspark.pandas.tests.connect.computation.test_parity_apply_func",
        "pyspark.pandas.tests.connect.computation.test_parity_binary_ops",
        "pyspark.pandas.tests.connect.computation.test_parity_combine",
        "pyspark.pandas.tests.connect.computation.test_parity_compute",
        "pyspark.pandas.tests.connect.computation.test_parity_cov",
        "pyspark.pandas.tests.connect.computation.test_parity_corr",
        "pyspark.pandas.tests.connect.computation.test_parity_corrwith",
        "pyspark.pandas.tests.connect.computation.test_parity_cumulative",
        "pyspark.pandas.tests.connect.computation.test_parity_describe",
        "pyspark.pandas.tests.connect.computation.test_parity_eval",
        "pyspark.pandas.tests.connect.computation.test_parity_melt",
        "pyspark.pandas.tests.connect.computation.test_parity_missing_data",
        "pyspark.pandas.tests.connect.groupby.test_parity_stat",
        "pyspark.pandas.tests.connect.groupby.test_parity_stat_adv",
        "pyspark.pandas.tests.connect.groupby.test_parity_stat_ddof",
        "pyspark.pandas.tests.connect.groupby.test_parity_stat_func",
        "pyspark.pandas.tests.connect.groupby.test_parity_stat_prod",
    ],
    excluded_python_implementations=[
        "PyPy"  # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and
        # they aren't available there
    ],
)
# pandas-on-Spark tests run against Spark Connect, slice 1 of 4 (see part0).
pyspark_pandas_connect_part1 = Module(
    name="pyspark-pandas-connect-part1",
    dependencies=[pyspark_connect, pyspark_pandas, pyspark_pandas_slow],
    source_file_regexes=[
        "python/pyspark/pandas",
    ],
    python_test_goals=[
        # pandas-on-Spark unittests
        "pyspark.pandas.tests.connect.frame.test_parity_attrs",
        "pyspark.pandas.tests.connect.frame.test_parity_axis",
        "pyspark.pandas.tests.connect.frame.test_parity_constructor",
        "pyspark.pandas.tests.connect.frame.test_parity_conversion",
        "pyspark.pandas.tests.connect.frame.test_parity_reindexing",
        "pyspark.pandas.tests.connect.frame.test_parity_reshaping",
        "pyspark.pandas.tests.connect.frame.test_parity_spark",
        "pyspark.pandas.tests.connect.frame.test_parity_take",
        "pyspark.pandas.tests.connect.frame.test_parity_take_adv",
        "pyspark.pandas.tests.connect.frame.test_parity_time_series",
        "pyspark.pandas.tests.connect.frame.test_parity_truncate",
        "pyspark.pandas.tests.connect.groupby.test_parity_aggregate",
        "pyspark.pandas.tests.connect.groupby.test_parity_apply_func",
        "pyspark.pandas.tests.connect.groupby.test_parity_corr",
        "pyspark.pandas.tests.connect.groupby.test_parity_cumulative",
        "pyspark.pandas.tests.connect.groupby.test_parity_missing_data",
        "pyspark.pandas.tests.connect.groupby.test_parity_split_apply",
        "pyspark.pandas.tests.connect.groupby.test_parity_split_apply_count",
        "pyspark.pandas.tests.connect.groupby.test_parity_split_apply_first",
        "pyspark.pandas.tests.connect.groupby.test_parity_split_apply_last",
        "pyspark.pandas.tests.connect.groupby.test_parity_split_apply_min_max",
        "pyspark.pandas.tests.connect.groupby.test_parity_split_apply_skew",
        "pyspark.pandas.tests.connect.groupby.test_parity_split_apply_std",
        "pyspark.pandas.tests.connect.groupby.test_parity_split_apply_var",
        "pyspark.pandas.tests.connect.series.test_parity_datetime",
        "pyspark.pandas.tests.connect.series.test_parity_string_ops_adv",
        "pyspark.pandas.tests.connect.series.test_parity_string_ops_basic",
        "pyspark.pandas.tests.connect.series.test_parity_all_any",
        "pyspark.pandas.tests.connect.series.test_parity_arg_ops",
        "pyspark.pandas.tests.connect.series.test_parity_as_of",
        "pyspark.pandas.tests.connect.series.test_parity_as_type",
        "pyspark.pandas.tests.connect.series.test_parity_compute",
        "pyspark.pandas.tests.connect.series.test_parity_conversion",
        "pyspark.pandas.tests.connect.series.test_parity_cumulative",
        "pyspark.pandas.tests.connect.series.test_parity_index",
        "pyspark.pandas.tests.connect.series.test_parity_missing_data",
        "pyspark.pandas.tests.connect.series.test_parity_series",
        "pyspark.pandas.tests.connect.series.test_parity_sort",
        "pyspark.pandas.tests.connect.series.test_parity_stat",
        "pyspark.pandas.tests.connect.series.test_parity_interpolate",
        "pyspark.pandas.tests.connect.data_type_ops.test_parity_num_arithmetic",
        "pyspark.pandas.tests.connect.data_type_ops.test_parity_num_mod",
        "pyspark.pandas.tests.connect.data_type_ops.test_parity_num_mul_div",
        "pyspark.pandas.tests.connect.data_type_ops.test_parity_num_pow",
        "pyspark.pandas.tests.connect.reshape.test_parity_get_dummies",
        "pyspark.pandas.tests.connect.reshape.test_parity_get_dummies_kwargs",
        "pyspark.pandas.tests.connect.reshape.test_parity_get_dummies_multiindex",
        "pyspark.pandas.tests.connect.reshape.test_parity_get_dummies_object",
        "pyspark.pandas.tests.connect.reshape.test_parity_get_dummies_prefix",
        "pyspark.pandas.tests.connect.reshape.test_parity_merge_asof",
        "pyspark.pandas.tests.connect.indexes.test_parity_append",
        "pyspark.pandas.tests.connect.indexes.test_parity_intersection",
        "pyspark.pandas.tests.connect.indexes.test_parity_monotonic",
        "pyspark.pandas.tests.connect.indexes.test_parity_union",
        "pyspark.pandas.tests.connect.indexes.test_parity_datetime_ceil",
        "pyspark.pandas.tests.connect.indexes.test_parity_datetime_floor",
        "pyspark.pandas.tests.connect.indexes.test_parity_datetime_iso",
        "pyspark.pandas.tests.connect.indexes.test_parity_datetime_map",
        "pyspark.pandas.tests.connect.indexes.test_parity_datetime_property",
        "pyspark.pandas.tests.connect.indexes.test_parity_datetime_round",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_shift",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_transform",
        # fallback
        "pyspark.pandas.tests.connect.frame.test_parity_asfreq",
        "pyspark.pandas.tests.connect.frame.test_parity_asof",
    ],
    excluded_python_implementations=[
        "PyPy"  # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and
        # they aren't available there
    ],
)
# pandas-on-Spark tests run against Spark Connect, slice 2 of 4 (see part0).
pyspark_pandas_connect_part2 = Module(
    name="pyspark-pandas-connect-part2",
    dependencies=[pyspark_connect, pyspark_pandas, pyspark_pandas_slow],
    source_file_regexes=[
        "python/pyspark/pandas",
    ],
    python_test_goals=[
        # pandas-on-Spark unittests
        "pyspark.pandas.tests.connect.computation.test_parity_pivot",
        "pyspark.pandas.tests.connect.computation.test_parity_pivot_table",
        "pyspark.pandas.tests.connect.computation.test_parity_pivot_table_adv",
        "pyspark.pandas.tests.connect.computation.test_parity_pivot_table_multi_idx",
        "pyspark.pandas.tests.connect.computation.test_parity_pivot_table_multi_idx_adv",
        "pyspark.pandas.tests.connect.computation.test_parity_stats",
        "pyspark.pandas.tests.connect.frame.test_parity_interpolate",
        "pyspark.pandas.tests.connect.frame.test_parity_interpolate_error",
        "pyspark.pandas.tests.connect.resample.test_parity_frame",
        "pyspark.pandas.tests.connect.resample.test_parity_series",
        "pyspark.pandas.tests.connect.resample.test_parity_error",
        "pyspark.pandas.tests.connect.resample.test_parity_missing",
        "pyspark.pandas.tests.connect.resample.test_parity_on",
        "pyspark.pandas.tests.connect.resample.test_parity_timezone",
        "pyspark.pandas.tests.connect.window.test_parity_ewm_error",
        "pyspark.pandas.tests.connect.window.test_parity_ewm_mean",
        "pyspark.pandas.tests.connect.window.test_parity_groupby_ewm_mean",
        "pyspark.pandas.tests.connect.window.test_parity_missing",
        "pyspark.pandas.tests.connect.window.test_parity_rolling",
        "pyspark.pandas.tests.connect.window.test_parity_rolling_adv",
        "pyspark.pandas.tests.connect.window.test_parity_rolling_count",
        "pyspark.pandas.tests.connect.window.test_parity_rolling_error",
        "pyspark.pandas.tests.connect.window.test_parity_groupby_rolling",
        "pyspark.pandas.tests.connect.window.test_parity_groupby_rolling_adv",
        "pyspark.pandas.tests.connect.window.test_parity_groupby_rolling_count",
        "pyspark.pandas.tests.connect.window.test_parity_expanding",
        "pyspark.pandas.tests.connect.window.test_parity_expanding_adv",
        "pyspark.pandas.tests.connect.window.test_parity_expanding_error",
        "pyspark.pandas.tests.connect.window.test_parity_groupby_expanding",
        "pyspark.pandas.tests.connect.window.test_parity_groupby_expanding_adv",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_rolling",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_rolling_adv",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_rolling_count",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_dot_frame",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_dot_series",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_error",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_align",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_basic_slow",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_cov",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_corrwith",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_index",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_series",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_setitem_frame",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_setitem_series",
        "pyspark.pandas.tests.connect.groupby.test_parity_index",
        "pyspark.pandas.tests.connect.groupby.test_parity_describe",
        "pyspark.pandas.tests.connect.groupby.test_parity_head_tail",
        "pyspark.pandas.tests.connect.groupby.test_parity_groupby",
        "pyspark.pandas.tests.connect.groupby.test_parity_grouping",
        "pyspark.pandas.tests.connect.groupby.test_parity_missing",
        "pyspark.pandas.tests.connect.groupby.test_parity_nlargest_nsmallest",
        "pyspark.pandas.tests.connect.groupby.test_parity_raises",
        "pyspark.pandas.tests.connect.groupby.test_parity_rank",
        "pyspark.pandas.tests.connect.groupby.test_parity_size",
        "pyspark.pandas.tests.connect.groupby.test_parity_value_counts",
    ],
    excluded_python_implementations=[
        "PyPy"  # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and
        # they aren't available there
    ],
)
# pandas-on-Spark tests run against Spark Connect, slice 3 of 4 (see part0).
pyspark_pandas_connect_part3 = Module(
    name="pyspark-pandas-connect-part3",
    dependencies=[pyspark_connect, pyspark_pandas, pyspark_pandas_slow],
    source_file_regexes=[
        "python/pyspark/pandas",
    ],
    python_test_goals=[
        # pandas-on-Spark unittests
        "pyspark.pandas.tests.connect.io.test_parity_io",
        "pyspark.pandas.tests.connect.io.test_parity_csv",
        "pyspark.pandas.tests.connect.io.test_parity_feather",
        "pyspark.pandas.tests.connect.io.test_parity_stata",
        "pyspark.pandas.tests.connect.io.test_parity_dataframe_conversion",
        "pyspark.pandas.tests.connect.io.test_parity_dataframe_spark_io",
        "pyspark.pandas.tests.connect.io.test_parity_series_conversion",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_arithmetic",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_arithmetic_ext",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_arithmetic_ext_float",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_arithmetic_chain",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_arithmetic_chain_ext",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_arithmetic_chain_ext_float",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_assign_frame",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_assign_series",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_basic",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_bitwise",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_combine_first",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_compare_series",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_concat_inner",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_concat_outer",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_aggregate",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_apply",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_cumulative",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_diff",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_diff_len",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_fillna",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_filter",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_split_apply_combine",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_expanding",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_expanding_adv",
        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_expanding_count",
    ],
    excluded_python_implementations=[
        "PyPy"  # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and
        # they aren't available there
    ],
)
# PySpark error-framework tests.  Deliberately triggered by ANY change under
# "python/" (see SPARK-44544 note below), not just by changes to pyspark.errors.
pyspark_errors = Module(
    name="pyspark-errors",
    dependencies=[],
    source_file_regexes=[
        # SPARK-44544: Force the execution of pyspark_errors when there are any changes
        # in PySpark, since the Python Packaging Tests is only enabled within this module.
        # This module is the smallest Python test module, it contains only 1 test file
        # and normally takes < 2 seconds, so the additional cost is small.
        "python/",
        # NOTE: since regexes match at the beginning of the filename, "python/"
        # above already covers this narrower pattern; it is kept for clarity.
        "python/pyspark/errors",
    ],
    python_test_goals=[
        # unittests
        "pyspark.errors.tests.test_connect_errors_conversion",
        "pyspark.errors.tests.test_errors",
        "pyspark.errors.tests.test_traceback",
        "pyspark.errors.tests.connect.test_parity_traceback",
    ],
)
# PySpark structured-logging tests (pyspark.logger): doctests plus the plain
# and Connect-parity unittests.
pyspark_logger = Module(
    name="pyspark-logger",
    dependencies=[],
    source_file_regexes=["python/pyspark/logger"],
    python_test_goals=[
        # doctests
        "pyspark.logger.logger",
        # unittests
        "pyspark.logger.tests.test_logger",
        "pyspark.logger.tests.connect.test_parity_logger",
    ],
)
# SparkR: no Python/SBT goals; changes under R/ trigger the R test suite
# via should_run_r_tests.
sparkr = Module(
    name="sparkr",
    dependencies=[hive, mllib],
    source_file_regexes=[
        "R/",
    ],
    should_run_r_tests=True,
)
# Documentation-only module: no test goals; exists so docs/ changes are
# recognized and do not trigger unrelated test modules.
docs = Module(
    name="docs",
    dependencies=[],
    source_file_regexes=[
        "docs/",
    ],
)
# Build-infrastructure module: changes to any pom.xml or the dependency test
# script run the build tests (see also `root`, which depends on this module).
build = Module(
    name="build",
    dependencies=[],
    source_file_regexes=[
        ".*pom.xml",
        "dev/test-dependencies.sh",
    ],
    should_run_build_tests=True,
)
# YARN resource manager support; built and tested only under the -Pyarn profile.
yarn = Module(
    name="yarn",
    dependencies=[],
    source_file_regexes=[
        "resource-managers/yarn/",
        "common/network-yarn/",
    ],
    build_profile_flags=["-Pyarn"],
    sbt_test_goals=[
        "yarn/test",
        "network-yarn/test",
    ],
    test_tags=["org.apache.spark.tags.ExtendedYarnTest"],
)
# Kubernetes resource manager support; enables the -Pkubernetes and -Pvolcano
# build profiles.
kubernetes = Module(
    name="kubernetes",
    dependencies=[],
    source_file_regexes=["resource-managers/kubernetes"],
    build_profile_flags=["-Pkubernetes", "-Pvolcano"],
    sbt_test_goals=["kubernetes/test"],
)
# Hadoop cloud-storage integration; built only under the -Phadoop-cloud profile.
hadoop_cloud = Module(
    name="hadoop-cloud",
    dependencies=[],
    source_file_regexes=["hadoop-cloud"],
    build_profile_flags=["-Phadoop-cloud"],
    sbt_test_goals=["hadoop-cloud/test"],
)
# Ganglia metrics connector (LGPL-licensed, hence profile-gated); no test goals,
# only a build profile flag.
spark_ganglia_lgpl = Module(
    name="spark-ganglia-lgpl",
    dependencies=[],
    build_profile_flags=["-Pspark-ganglia-lgpl"],
    source_file_regexes=[
        "connector/spark-ganglia-lgpl",
    ],
)
# JDBC/database integration tests that run against real services in Docker
# containers; gated behind the -Pdocker-integration-tests profile and the
# DockerTest tag.  On GitHub Actions the tests are explicitly enabled through
# the ENABLE_DOCKER_INTEGRATION_TESTS environment variable.
docker_integration_tests = Module(
    name="docker-integration-tests",
    dependencies=[sql],
    build_profile_flags=["-Pdocker-integration-tests"],
    source_file_regexes=["connector/docker-integration-tests"],
    sbt_test_goals=["docker-integration-tests/test"],
    # Only set the enabling env var when running under GitHub Actions.
    environ={"ENABLE_DOCKER_INTEGRATION_TESTS": "1"}
    if "GITHUB_ACTIONS" in os.environ
    else None,
    test_tags=["org.apache.spark.tags.DockerTest"],
)
# "root" is a catch-all dummy module used to run every test in the project.
# Nothing else should list it as a dependency.
root = Module(
    name="root",
    # Build changes must re-run everything, hence the dependency on `build`.
    dependencies=[build, core],
    source_file_regexes=[],
    # Running everything requires enabling every build profile declared by any
    # module; deduplicate the flags through a set before listing them.
    build_profile_flags=list(
        {flag for m in all_modules for flag in m.build_profile_flags}
    ),
    sbt_test_goals=[
        "test",
    ],
    # Aggregate every Python test goal declared across all modules.
    python_test_goals=[goal for m in all_modules for goal in m.python_test_goals],
    should_run_r_tests=True,
    should_run_build_tests=True,
)