core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala (17 lines):
- line 556: * TODO SPARK-24942 Improve cluster resource management with jobs containing barrier stage
- line 1208: // TODO: Probably should actually find among the active jobs that need this
- line 1625: // TODO: Maybe we can keep the taskBinary in Stage to avoid serializing it multiple times.
- line 1941: // TODO Refactor this out to a function that accepts a ResultStage
- line 1979: // TODO: Perhaps we want to mark the resultStage as failed?
- line 2069: // TODO: SPARK-35547: Clean all push-based shuffle metadata like merge enabled and
- line 2070: // TODO: finalized as we are clearing all the merge results.
- line 2119: // TODO: Cancel running tasks in the failed stage -- cf. SPARK-17064
- line 2180: // TODO: support to rollback result tasks.
- line 2213: // TODO: mark the executor as failed only if there were lots of fetch failures on it
- line 2280: // TODO SPARK-24877 leave the zombie tasks and ignore their completion events.
- line 2288: // TODO Refactor the failure handling logic to combine similar code with that of
- line 2484: // TODO: SPARK-35536: Cancel finalizeShuffleMerge if the stage is cancelled
- line 2485: // TODO: during shuffleMergeFinalizeWaitSec
- line 2571: // TODO: Lower-level scheduler should also deal with this
- line 2587: // TODO: SPARK-35549: Currently merge statuses results which come after shuffle merge
- line 2588: // TODO: is finalized is not registered.
python/pyspark/pandas/series.py (16 lines):
- line 1063: # TODO: NaN and None when ``arg`` is an empty dict
- line 1064: # TODO: Support ps.Series ``arg``
- line 1198: # TODO: Currently, changing index labels taking dictionary/Series is not supported.
- line 2216: self._column_label, scol.alias(name_like_string(self.name)) # TODO: dtype?
- line 2335: self._psdf._internal.with_new_spark_column(self._column_label, scol) # TODO: dtype?
- line 2381: # TODO: last two examples from pandas produce different results.
- line 2811: # TODO: Categorical type isn't supported (due to PySpark's limitation) and
- line 3769: # TODO: not all arguments are implemented comparing to pandas' for now.
- line 3978: # TODO: add 'interpolation' parameter.
- line 4060: # TODO: add axis, pct, na_option parameter
- line 4858: # TODO: introduce 'in_place'; fully support 'regex'
- line 5112: return self._with_new_scol(current) # TODO: dtype?
- line 5194: self._column_label, scol # TODO: dtype?
- line 5210: self._column_label, scol # TODO: dtype?
- line 5584: return self._with_new_scol(cond) # TODO: dtype?
- line 5590: sdf, index_fields=combined._internal.index_fields, data_fields=[None] # TODO: dtype?
python/pyspark/pandas/groupby.py (15 lines):
- line 156: # TODO: Series support is not implemented yet.
- line 157: # TODO: not all arguments are implemented comparing to pandas' for now.
- line 667: # TODO: 'q' accepts list like type
- line 800: # TODO: sync the doc.
- line 932: # TODO: sync the doc.
- line 1072: # TODO: 1, 'n' accepts list and slice; 2, implement 'dropna' parameter
- line 1335: # TODO: skipna should be implemented.
- line 1377: # TODO: groupby multiply columns should be implemented.
- line 2100: # TODO: implement 'dropna' parameter
- line 3265: # TODO: 'adjust', 'axis', 'method' parameter should be implemented.
- line 3854: # TODO: Implement 'percentiles', 'include', and 'exclude' arguments.
- line 3855: # TODO: Add ``DataFrame.select_dtypes`` to See Also when 'include'
- line 4231: # TODO: add keep parameter
- line 4313: # TODO: add keep parameter
- line 4398: # TODO: add bins, normalize parameter
python/pyspark/pandas/namespace.py (12 lines):
- line 1399: # TODO: add `coerce_float` and 'parse_dates' parameters
- line 1470: # TODO: add `coerce_float`, `params`, and 'parse_dates' parameters
- line 1525: # TODO: add `coerce_float`, `params`, and 'parse_dates' parameters
- line 1580: if " " not in striped: # TODO: identify the table name or not more precisely.
- line 2339: # TODO: there are many parameters to implement and support. See pandas's pd.concat.
- line 2491: ): # TODO: support dict
- line 2670: # FIXME: better ordering
- line 2683: # FIXME: better ordering
- line 2690: # TODO: NaN and None difference for missing values. pandas seems to be filling NaN.
- line 2743: data_fields=None, # TODO: dtypes?
- line 2842: # TODO: Add back:
- line 2916: # TODO: Add back:
python/pyspark/pandas/generic.py (11 lines):
- line 151: # TODO: add 'axis' parameter
- line 211: # TODO: add 'axis' parameter
- line 272: # TODO: add 'axis' parameter
- line 333: # TODO: add 'axis' parameter
- line 2393: # TODO: by argument only support the grouping name and as_index only for now. Documentation
- line 2790: # TODO: 'center', 'win_type', 'on', 'axis' parameter should be implemented.
- line 2822: # TODO: 'center' and 'axis' parameter should be implemented.
- line 2846: # TODO: 'adjust', 'axis', 'method' parameter should be implemented.
- line 3322: # TODO: add 'downcast' when value parameter exists
- line 3401: # TODO: add 'downcast' when value parameter exists
- line 3480: # TODO: add 'axis', 'inplace', 'downcast'
mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala (8 lines):
- line 113: /* TODO (once LDA can be trained with Strings or given a dictionary)
- line 132: /* TODO (once LDA can be trained with Strings or given a dictionary)
- line 241: // TODO: declare in LDAModel and override once implemented in DistributedLDAModel
- line 363: // TODO: declare in LDAModel and override once implemented in DistributedLDAModel
- line 683: // TODO: Avoid zip, which is inefficient.
- line 710: // TODO: generalize this for asymmetric (non-scalar) alpha
- line 737: // TODO: generalize this for asymmetric (non-scalar) alpha
- line 809: // TODO:
mllib/src/main/scala/org/apache/spark/ml/ann/Layer.scala (8 lines):
- line 250: // TODO: use Breeze UFunc
- line 263: // TODO: use Breeze UFunc
- line 458: // TODO: squared error is more natural but converges slower
- line 494: // TODO: allocate outputs as one big array and then create BDMs from it
- line 523: // TODO: allocate deltas as one big array and then create BDMs from it
- line 534: // TODO: explain why delta of top layer is null (because it might contain loss+layer)
- line 843: // TODO: will make a copy if vector is a subvector of BDV (see Vectors code)
- line 848: // TODO: deprecate standard optimizer because it needs Vector
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala (7 lines):
- line 375: // Please do not insert any other rules in between. See the TODO comments in rule
- line 629: // TODO: For Cube/Rollup just set nullability to be `true`.
- line 739: // TODO: mark Aggregate as resolved even if it has GROUPING SETS. We can expand it at the end
- line 911: // TODO: Don't construct the physical container until after analysis.
- line 922: // TODO: Support Pandas UDF.
- line 1135: // TODO (SPARK-27484): handle streaming write commands when we have them.
- line 3937: // TODO: since the field name is already resolved, it's more efficient if
python/pyspark/ml/connect/classification.py (7 lines):
- line 96: # TODO: add a setting seed param.
- line 99: # TODO: support training on GPU
- line 100: # TODO: support L1 / L2 regularization
- line 132: # TODO: early stopping
- line 227: # TODO: support pandas dataframe fitting
- line 254: # TODO: support GPU.
- line 343: # TODO: Use spark broadast for `model_state_dict`,
python/pyspark/pandas/indexes/base.py (6 lines):
- line 437: # TODO: avoid using default index?
- line 802: # TODO: add downcast parameter for fillna function
- line 830: internal = InternalFrame( # TODO: dtypes?
- line 1208: # TODO: add error parameter
- line 2261: # TODO: non-categorical or categorical with different categories
- line 2322: # TODO: We can't support different type of values in a single column for now.
sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala (6 lines):
- line 487: // TODO: In future we can have Spark support columns sorted in descending order
- line 684: // TODO: figure out how to drop multiple partitions in one call
- line 988: new Path(loadPath), // TODO: Use URI
- line 998: loadPath: String, // TODO URI
- line 1333: // TODO: stats should include all the other two fields (`numFiles` and `numPartitions`).
- line 1346: // TODO: still fill the rowCount even if sizeInBytes is empty. Might break anything?
python/pyspark/pandas/base.py (6 lines):
- line 262: # TODO: This is a quick hack to support NumPy type. We should revisit this.
- line 466: # TODO: support more APIs?
- line 966: # TODO: axis and many arguments should be implemented.
- line 1055: # TODO: axis, skipna, and many arguments should be implemented.
- line 1118: # TODO: add frep and axis parameter
- line 1193: # TODO: Update Documentation for Bins Parameter when its supported
python/pyspark/pandas/window.py (6 lines):
- line 161: # TODO: 'min_periods' is not equivalent in pandas because it does not count NA as
- line 209: lambda psser: psser._with_new_scol(func(psser.spark.column)), # TODO: dtype?
- line 957: applied.append(agg_column._with_new_scol(func(agg_column.spark.column))) # TODO: dtype?
- line 1444: # TODO: when add 'axis' parameter, should add to here too.
- line 2554: # TODO: when add 'adjust' parameter, should add to here too.
- line 2665: # TODO: when add 'adjust' parameter, should add to here too.
python/pyspark/core/rdd.py (5 lines):
- line 1055: # TODO: add log warning for when more than one iteration was run
- line 3677: # TODO: add option to control map-side combining
- line 3774: # TODO: add control over map-side aggregation
- line 3982: # TODO: support variant with custom partitioner
- line 4203: # TODO: add variant with custom partitioner
common/utils/src/main/scala/org/apache/spark/util/ClosureCleaner.scala (5 lines):
- line 209: // TODO: clean all inner closures first. This requires us to find the inner objects.
- line 210: // TODO: cache outerClasses / innerClasses / accessedFields
- line 621: // TODO: maybe lift this restriction to support other functional interfaces in the future
- line 949: // TODO: maybe lift this restriction and support other functional interfaces
- line 1078: // TODO: Recursively find inner closures that we indirectly reference, e.g.
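The ClosureCleaner entries above concern trimming what Spark closures drag along when they are serialized. As a rough, Spark-free illustration of the underlying capture problem (all names in the sketch are made up for this example), a lambda that reads an instance field captures the whole enclosing object, not just the field:

```scala
// Minimal sketch of the closure-capture issue ClosureCleaner exists to mitigate.
// `Multiplier` and its fields are hypothetical names used only for illustration.
class Multiplier(val factor: Int, val hugeUnrelatedState: Array[Byte]) extends Serializable {
  // Reads `this.factor`, so the returned lambda captures the whole Multiplier instance,
  // including `hugeUnrelatedState`, if it ever gets serialized with a task.
  def makeClosure: Int => Int = (x: Int) => x * factor

  // Manual workaround: copy the field into a local so only an Int is captured.
  def makeCleanClosure: Int => Int = {
    val localFactor = factor
    (x: Int) => x * localFactor
  }
}

object ClosureCaptureSketch extends App {
  val m = new Multiplier(3, new Array[Byte](1 << 20))
  println(m.makeClosure(14))      // 42
  println(m.makeCleanClosure(14)) // 42, but without dragging the instance along
}
```

ClosureCleaner automates this kind of trimming by nulling out outer references a closure does not actually use; the TODOs above are about extending that analysis (inner closures, caching the discovered classes and fields).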
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala (5 lines):
- line 284: // TODO: We don't support an array value tags in map yet.
- line 321: // TODO: This method might have to be removed. Some logics duplicate `convertObject()`
- line 430: // TODO: we don't support partial results now
- line 444: // TODO: find a more efficient way to convert ArrayBuffer to GenericArrayData
- line 521: // TODO: This function unnecessarily does type dispatch. Should merge it with `castTo`.
mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala (5 lines):
- line 182: // TODO: allow to specify label precision and feature precision.
- line 356: // TODO: This implementation has performance issues due to unnecessary serialization.
- line 357: // TODO: It is better (but trickier) if we can cast the old vector type to new type directly.
- line 409: // TODO: This implementation has performance issues due to unnecessary serialization.
- line 410: // TODO: It is better (but trickier) if we can cast the new vector type to old type directly.
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/TryEval.scala (5 lines):
- line 84: // TODO: support TRY eval mode on datetime arithmetic expressions.
- line 122: // TODO: support TRY eval mode on datetime arithmetic expressions.
- line 159: // TODO: support TRY eval mode on datetime arithmetic expressions.
- line 198: // TODO: support TRY eval mode on datetime arithmetic expressions.
- line 230: // TODO: support TRY eval mode on datetime arithmetic expressions.
sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala (4 lines):
- line 134: * UnionObjectInspector: (tag: Int, object data) (TODO: not supported by SparkSQL yet)
- line 278: // TODO we don't support the HiveVarcharObjectInspector yet.
- line 842: // TODO decimal precision?
- line 1118: // TODO precise, scale?
core/src/main/scala/org/apache/spark/MapOutputTracker.scala (4 lines):
- line 260: // TODO support updateMergeResult for similar use cases as updateMapOutput
- line 1675: // TODO: SPARK-35036: Instead of reading map blocks in case of AQE with Push based shuffle,
- line 1676: // TODO: improve push based shuffle to read partial merged blocks satisfying the start/end
- line 1677: // TODO: map indexes
sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala (4 lines):
- line 143: // TODO: move the rest of the table commands from ddl.scala to this file
- line 1184: // TODO: [SPARK-28692] unify this after we unify the
- line 1232: // TODO: some Hive fileformat + row serde might be mapped to Spark data source, e.g. CSV.
- line 1248: // TODO: should we keep Hive serde properties?
sql/core/src/main/scala/org/apache/spark/sql/scripting/SqlScriptingExecutionNode.scala (4 lines):
- line 361: // TODO: Variable cleanup (once we add SQL script execution logic).
- line 362: // TODO: Add interpreter tests as well.
- line 380: // TODO: Variable cleanup (once we add SQL script execution logic).
- line 381: // TODO: Add interpreter tests as well.
mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala (4 lines):
- line 112: * TODO: Future extensions: The following functionality is planned for the future:
- line 186: * TODO: Track which features are known to be continuous already; do not update counts for them.
- line 256: // TODO: This might be able to handle 0's more efficiently.
- line 357: // TODO: Check more carefully about whether this whole class will be included in a closure.
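For context on the VectorIndexer TODOs just above, here is a minimal, self-contained usage sketch of the public API they refer to (the toy data and the maxCategories threshold are arbitrary choices for illustration, not taken from the source):

```scala
import org.apache.spark.ml.feature.VectorIndexer
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession

object VectorIndexerSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("vector-indexer-sketch").getOrCreate()

    // Toy data: feature 0 has three distinct values, feature 1 has only two.
    val data = Seq(
      Vectors.dense(0.1, 1.0),
      Vectors.dense(0.4, 0.0),
      Vectors.dense(0.7, 1.0)
    )
    val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")

    // Features with at most maxCategories distinct values are treated as categorical;
    // the rest are left as continuous.
    val indexer = new VectorIndexer()
      .setInputCol("features")
      .setOutputCol("indexed")
      .setMaxCategories(2)

    val model = indexer.fit(df)
    println(s"categorical feature indices: ${model.categoryMaps.keys.mkString(", ")}")
    model.transform(df).show(truncate = false)

    spark.stop()
  }
}
```

With maxCategories = 2, only feature 1 is indexed as categorical; the TODO at line 186 is about letting the indexer skip counting distinct values for features already known to be continuous.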
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala (4 lines):
- line 1254: // TODO For v2 commands, we will cast the string back to its actual value,
- line 1621: // TODO we should use the visitRowFormatDelimited function here. However HiveScriptIOSchema
- line 4660: // TODO we need proper support for the NULL format.
- line 5972: // TODO a partition spec is allowed to have optional values. This is currently violated.
mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala (4 lines):
- line 106: // TODO: support unbounded pattern length when maxPatternLength = 0
- line 357: // TODO: We collect projected postfixes into memory. We should also compare the performance
- line 358: // TODO: of keeping them on shuffle files.
- line 463: // TODO: use PrimitiveKeyOpenHashMap
core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala (4 lines):
- line 914: // TODO: manage the memory used here, and spill it into disk in case of OOM.
- line 928: // TODO (SPARK-36284): Add shuffle checksum support for push-based shuffle
- line 966: // TODO: release the buf here to free memory earlier
- line 1153: // TODO SPARK-36284 Add shuffle checksum support for push-based shuffle
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala (3 lines):
- line 259: // TODO (SPARK-27484): handle streaming write commands when we have them.
- line 845: // TODO: although map type is not orderable, technically map type should be able to be
- line 854: // TODO: Remove this type check once we support Variant ordering
python/pyspark/pandas/typedef/typehints.py (3 lines):
- line 191: # TODO: considering the precision & scale for decimal type.
- line 652: # TODO: once pandas exposes a typing module like numpy.typing, we should deprecate
- line 675: # TODO: Remove this variadic-generic hack by tuple once ww drop Python up to 3.9.
sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala (3 lines):
- line 156: // TODO: Move to `DistributedPlan`
- line 163: def outputPartitioning: Partitioning = UnknownPartitioning(0) // TODO: WRONG WIDTH!
- line 511: // TODO: refactor and reuse the code from RDD's take()
sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/OperatorStateMetadata.scala (3 lines):
- line 416: // TODO: [SPARK-50845]: Currently, deleteSchemaFiles is a no-op since earliestBatchIdKept
- line 429: // TODO: [SPARK-50845]: Currently, deleteSchemaFiles is a no-op since thresholdBatchId
- line 479: // TODO: [SPARK-50845]: Return earliest schema file we need after implementing
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala (3 lines):
- line 264: // TODO: do not reorder consecutive `Add`s or `Multiply`s with different `failOnError` flags
- line 325: // TODO: `EqualTo` for structural types are not working. Until SPARK-24443 is addressed,
- line 326: // TODO: we exclude them in this rule.
core/src/main/scala/org/apache/spark/deploy/master/Master.scala (3 lines):
- line 292: // TODO Prevent repeated registrations from some driver
- line 423: // TODO: It might be good to instead have the submission client poll the master to determine
- line 451: // TODO: It would be nice for this to be a synchronous response
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala (3 lines):
- line 424: // TODO: we should follow hive to roll back if one partition path failed to create.
- line 469: // TODO: we should follow hive to roll back if one partition path failed to delete, and support
- line 502: // TODO: we should follow hive to roll back if one partition path failed to rename.
sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala (3 lines):
- line 190: // TODO: operator pushdown.
- line 258: // TODO (SPARK-27484): we should add the writing node before the plan is analyzed.
- line 934: // TODO validate baseStateStoreCkptId
core/src/main/scala/org/apache/spark/storage/BlockManager.scala (3 lines):
- line 666: // TODO: We might need to rate limit re-registering.
- line 1420: // TODO: need a better way to handle blocks with indeterminate/unordered results, replicas
- line 2032: // TODO: Avoid a linear scan by creating another mapping of RDD.id to blocks.
mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala (3 lines):
- line 76: // TODO: Implement SPARK-11543?
- line 550: // TODO: for Multinomial logistic regression, take numClasses into account
- line 1048: // TODO: get numClasses and numFeatures together from dataset
python/pyspark/pandas/accessors.py (3 lines):
- line 328: # TODO: codes here partially duplicate `DataFrame.apply`. Can we deduplicate?
- line 626: # TODO: Index will be lost in this case.
- line 913: # TODO: In this case, it avoids the shortcut for now (but only infers schema)
core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala (3 lines):
- line 100: // TODO: use the actual number of slots for standalone mode.
- line 174: // TODO: We should kill any running task attempts when the task set manager becomes a zombie.
- line 1302: // TODO: Threshold should also look at standard deviation of task durations and have a lower
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala (3 lines):
- line 206: // TODO: it's difficult to support string operators without advanced statistics.
- line 285: // TODO: It is difficult to support other binary comparisons for String/Binary
- line 670: // TODO: It is difficult to support other binary comparisons for String/Binary
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala (3 lines):
- line 546: // TODO: Implement a more accurate method for checking whether a decimal value can be cast
- line 1089: // TODO: Could be faster?
- line 1117: // TODO: Could be faster?
core/src/main/scala/org/apache/spark/api/r/BaseRRunner.scala (3 lines):
- line 67: // TODO: optimize it to use one socket
- line 184: // TODO: Pass a byte array from R to avoid this cast ?
- line 207: // TODO: We should propagate this error to the task thread
python/pyspark/sql/pandas/types.py (3 lines):
- line 566: # TODO: handle nested timestamps, such as ArrayType(TimestampType())?
- line 600: # TODO: handle nested timestamps, such as ArrayType(TimestampType())?
- line 670: # TODO: handle nested timestamps, such as ArrayType(TimestampType())?
sql/connect/common/src/main/protobuf/spark/connect/commands.proto (3 lines):
- line 269: // TODO: How do we indicate errors?
- line 270: // TODO: Consider adding status, last progress etc here.
- line 316: // TODO: Consider reusing Explain from AnalyzePlanRequest message.
mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala (3 lines):
- line 211: * TODO: See if we can deprecate `intercept` in `GeneralizedLinearModel`, and always
- line 281: // TODO: Apply feature scaling to the weight vector instead of input data.
- line 298: * TODO: For better convergence, in logistic regression, the intercepts should be computed
core/src/main/scala/org/apache/spark/rdd/RDD.scala (3 lines):
- line 167: // TODO: Handle changes of StorageLevel
- line 1947: // TODO We can collect all the RDDs that needs to be checkpointed, and then checkpoint
- line 2093: // TODO: this can be per-partition. e.g. UnionRDD can have different deterministic level for
connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufDeserializer.scala (3 lines):
- line 183: // TODO: we can avoid boxing if future version of Protobuf provide primitive accessors.
- line 407: // TODO revisit validation of protobuf-catalyst fields.
- line 470: // TODO: All of the code below this line is same between protobuf and avro, it can be shared.
python/pyspark/pandas/indexes/multi.py (3 lines):
- line 690: # TODO: We might need to handle internal state change.
- line 704: # TODO: add 'name' parameter after pd.MultiIndex.name is implemented
- line 834: # TODO: ADD error parameter
mllib/src/main/scala/org/apache/spark/ml/ann/BreezeUtil.scala (2 lines):
- line 29: // TODO: switch to MLlib BLAS interface
- line 41: // TODO: add code if matrices isTranspose!!!
core/src/main/scala/org/apache/spark/api/r/RBackendHandler.scala (2 lines):
- line 37: * TODO: This is marked as sharable to get a handle to RBackend. Is it safe to re-use
- line 265: // TODO: find best method in matching methods.
core/src/main/scala/org/apache/spark/memory/ExecutionMemoryPool.scala (2 lines):
- line 99: // TODO: clean up this clunky method signature
- line 112: // TODO: simplify this to limit each task to its own slot
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileDataSourceV2.scala (2 lines):
- line 83: // TODO: To reduce code diff of SPARK-29665, we create stub implementations for file source v2, so
- line 101: // TODO: implement a light-weight partition inference which only looks at the path of one leaf
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteWithExpression.scala (2 lines):
- line 134: // TODO: things can go wrong if the common expression is nondeterministic. We
- line 137: // TODO: we should calculate the ref count and also inline the common expression
sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala (2 lines):
- line 128: // TODO: Generalize statistics collection.
- line 129: // TODO: Why fs.getContentSummary returns wrong size on Jenkins?
sql/core/src/main/scala/org/apache/spark/sql/classic/DataStreamWriter.scala (2 lines):
- line 150: // TODO (SPARK-27484): we should add the writing node before the plan is analyzed.
- line 168: * TODO (SPARK-33638): Full support of v2 table creation
core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java (2 lines):
- line 436: // TODO: try to find space on previous pages
- line 757: * TODO: support forced spilling
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala (2 lines):
- line 1078: // TODO: although map type is not orderable, technically map type should be able to be used
- line 1123: // TODO: although map type is not orderable, technically map type should be able to be used
mllib/src/main/scala/org/apache/spark/ml/feature/MinHashLSH.scala (2 lines):
- line 70: // TODO: Output vectors of dimension numHashFunctions in SPARK-18450
- line 114: // TODO: This hashDistance function requires more discussion in SPARK-18454
core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java (2 lines):
- line 113: // TODO: we're wasting 32 bits of space here; we can probably store fewer bits of the hashcode
- line 353: // TODO: use existing ShuffleWriteMetrics
sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/HDFSBackedStateStoreProvider.scala (2 lines):
- line 321: // TODO: add support for multiple col families with HDFSBackedStateStoreProvider
- line 422: // TODO: The validation should be moved to a higher level so that it works for all state store
mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala (2 lines):
- line 238: // TODO: find a fast and stable way for sparse data.
- line 362: // TODO: The conditions below are not fully tested.
sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanner.scala (2 lines):
- line 69: // TODO: We will need to prune bad plans when we improve plan space exploration
- line 101: // TODO: Decouple final output schema from expression evaluation so this copy can be
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWrite.scala (2 lines):
- line 101: // TODO: [SPARK-36340] Unify check schema filed of DataSource V2 Insert.
- line 131: // TODO: after partitioning is supported in V2:
python/pyspark/sql/connect/protobuf/functions.py (2 lines):
- line 50: # TODO: simplify the code when _invoke_function() supports None as input.
- line 89: # TODO: simplify the code when _invoke_function() supports None as input.
mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala (2 lines):
- line 432: // TODO: Use ByteBuffer to optimize
- line 437: // TODO: Use ByteBuffer to optimize
core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala (2 lines):
- line 556: // TODO SPARK-24819 If the job requires more slots than available (both busy and free
- line 730: // TODO SPARK-24823 Cancel a job that contains barrier stage(s) if the barrier tasks don't get
sql/core/src/main/scala/org/apache/spark/sql/classic/Dataset.scala (2 lines):
- line 1259: // TODO handle the metadata?
- line 1642: // TODO: streaming could be adapted to use this interface
sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala (2 lines):
- line 91: // TODO: operator pushdown.
- line 105: // TODO (SPARK-27484): we should add the writing node before the plan is analyzed.
sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/CommitLog.scala (2 lines):
- line 63: // TODO [SPARK-49462] This validation should be relaxed for a stateless query.
- line 64: // TODO [SPARK-50653] This validation should be relaxed to support reading
python/pyspark/sql/connect/dataframe.py (2 lines):
- line 963: # TODO: reuse error handling code in sql.DataFrame.withWatermark()
- line 1732: # TODO: revisit classic Spark's Dataset.col
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowTablesExtendedExec.scala (2 lines):
- line 48: // TODO We need a new listTable overload that takes a pattern string.
- line 183: // TODO "Created Time", "Last Access", "Partition Statistics"
core/src/main/scala/org/apache/spark/executor/ExecutorExitCode.scala (2 lines):
- line 62: // TODO: replace external block store with concrete implementation name
- line 64: // TODO: replace external block store with concrete implementation name
sql/core/src/main/scala/org/apache/spark/sql/scripting/SqlScriptingLocalVariableManager.scala (2 lines):
- line 40: // TODO [SPARK-50785]: Refactor ForStatementExec to use local variables properly.
- line 67: // TODO: Update logic and comments once stored procedures are introduced.
resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnSchedulerBackend.scala (2 lines):
- line 180: // TODO (SPARK-33481) This is a naive way of calculating numMergersDesired for a stage,
- line 181: // TODO we can use better heuristics to calculate numMergersDesired for a stage.
python/pyspark/ml/torch/distributor.py (2 lines):
- line 429: "--rdzv_id=0", # TODO: setup random ID that is gleaned from env variables
- line 920: # TODO: need to do this in a safe way to avoid issues during concurrent runs
mllib/src/main/scala/org/apache/spark/ml/tree/Node.scala (2 lines):
- line 29: // TODO: Add aggregate stats (once available). This will happen after we move the DecisionTree
- line 92: // TODO: Once the implementation has been moved to this API, then include sufficient
resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/DriverKubernetesCredentialsFeatureStep.scala (2 lines):
- line 34: // TODO clean up this class, and credentials in general. See also SparkKubernetesClientFactory.
- line 63: // TODO decide whether or not to apply this step entirely in the caller, i.e. the builder.
sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala (2 lines):
- line 387: // TODO SPARK-24528 Sort order is currently ignored if buckets are coalesced.
- line 389: // TODO Currently Spark does not support writing columns sorting in descending order
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceRDD.scala (2 lines):
- line 36: // TODO: we should have 2 RDDs: an RDD[InternalRow] for row-based scan, an `RDD[ColumnarBatch]` for
- line 78: // TODO: SPARK-25083 remove the type erasure hack in data source scan
sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala (2 lines):
- line 121: // TODO: implement hive compatibility as rules.
- line 229: tmpLocation.toString, // TODO: URI
mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala (2 lines):
- line 191: // TODO: Add zero/seqOp/combOp option to aggregateMessages. (SPARK-5438)
- line 638: // TODO: Keep more values in log space, and only exponentiate when needed.
sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/thrift/ThriftCLIService.java (2 lines):
- line 258: // TODO: set real configuration map
- line 275: // TODO: We don't do anything for now, just log this for debugging.
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/JoinEstimation.scala (2 lines):
- line 324: // TODO: support nullCount updates for specific outer joins
- line 344: // TODO: It's error-prone to estimate cardinalities for LeftSemi and LeftAnti based on basic
core/src/main/scala/org/apache/spark/util/Utils.scala (2 lines):
- line 2566: // TODO: [SPARK-36744] needs to support IO encryption for push-based shuffle
- line 3253: // FIXME: We copy the stream on the level of bytes to avoid encoding problems.
sql/core/src/main/scala/org/apache/spark/sql/avro/AvroDeserializer.scala (2 lines):
- line 142: // TODO: we can avoid boxing if future version of avro provide primitive accessors.
- line 408: // TODO: move the following method in Decimal object on creating Decimal from BigDecimal?
core/src/main/scala/org/apache/spark/util/SizeEstimator.scala (2 lines):
- line 86: // TODO: Is this arch dependent ?
- line 151: // TODO: We could use reflection on the VMOption returned ?
common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/RemoteBlockPushResolver.java (2 lines):
- line 212: // TODO: [SPARK-33236] Change the message when this service is able to handle NM restart
- line 775: // TODO we may use a new exception class to include the finalizeShuffleMerge
sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala (2 lines):
- line 57: // TODO: Instead of returning a default value here, find a way to return a meaningful size
- line 152: // TODO: Instead of returning a default value here, find a way to return a meaningful size
sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala (2 lines):
- line 69: // TODO: Move the planner an optimizer into here from SessionState.
- line 579: // TODO: We use next(), i.e. take the first plan returned by the planner, here for now,
sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala (2 lines):
- line 368: // TODO: support change column name/dataType/metadata/position.
- line 801: // TODO: Validate the value
sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/LogicalQueryStage.scala (2 lines):
- line 36: // TODO we can potentially include only [[QueryStageExec]] in this class if we make the aggregation
- line 56: // TODO this is not accurate when there is other physical nodes above QueryStageExec.
core/src/main/scala/org/apache/spark/util/JsonProtocol.scala (2 lines):
- line 67: // TODO: Remove this file and put JSON serialization into each individual class.
- line 1361: // TODO: Drop the redundant "Shuffle" since it's inconsistent with related classes.
graphx/src/main/scala/org/apache/spark/graphx/impl/GraphImpl.scala (2 lines):
- line 88: // TODO: unpersist the replicated vertices in `replicatedVertexView` but leave the edges alone
- line 221: // TODO: Because we only have a clustered index on the source vertex ID, we can't filter
python/pyspark/pandas/plot/matplotlib.py (2 lines):
- line 423: # TODO: this logic is similar to KdePlot. Might have to deduplicate it.
- line 965: # TODO: check if value of y is plottable
sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/thrift/ThriftCLIServiceClient.java (2 lines):
- line 99: // FIXME extract the right info type
- line 403: // TODO: set the correct default fetch size
python/pyspark/ml/connect/tuning.py (2 lines):
- line 418: # TODO: support pandas dataframe fitting
- line 481: # TODO:
sql/core/src/main/scala/org/apache/spark/sql/classic/SparkSession.scala (2 lines):
- line 306: // TODO: use MutableProjection when rowRDD is another DataFrame and the applied
- line 405: // TODO: use MutableProjection when rowRDD is another DataFrame and the applied
mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala (2 lines):
- line 560: * TODO: Consider adding check for correct class name.
- line 811: // TODO: Revert back to the original content if save is not successful.
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala (2 lines):
- line 1044: // TODO: Pruning `UnionLoop`s needs to take into account both the outer `Project` and the inner
- line 1962: // TODO: non-deterministic predicates could be pushed through some operators that do not change
sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/CatalogManager.scala (2 lines):
- line 38: // TODO: all commands should look up table from the current catalog. The `SessionCatalog` doesn't
- line 49: // TODO: create a real SYSTEM catalog to host `TempVariableManager` under the SESSION namespace.
python/pyspark/sql/types.py (2 lines):
- line 288: # TODO: do this properly like on the scala side
- line 2463: # TODO: type cast (such as int -> long)
mllib/src/main/scala/org/apache/spark/ml/feature/BucketedRandomProjectionLSH.scala (2 lines):
- line 90: // TODO: Output vectors of dimension numHashFunctions in SPARK-18450
- line 215: // TODO: Save using the existing format of Array[Vector] once SPARK-12878 is resolved.
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala (2 lines):
- line 137: // TODO: support whole stage codegen
- line 295: // TODO: consider large decimal and interval type
mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala (2 lines):
- line 107: // TODO: Fix the MultiProbe NN Search in SPARK-18454
- line 277: // TODO: Remove recreateCol logic once SPARK-17154 is resolved.
sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/HiveSQLException.java (2 lines):
- line 104: // TODO: set correct vendorCode field
- line 116: // TODO: convert sqlState, etc.
core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala (2 lines):
- line 237: * TODO: Don't use a global map; these should be tied to a SparkContext (SPARK-13051).
- line 303: * TODO: Eventually if this spreads out to more values then using
sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedParquetRecordReader.java (2 lines):
- line 58: * TODO: decimal requiring more than 8 bytes, INT96. Schema mismatch.
- line 63: * TODO: make this always return ColumnarBatches.
sql/core/src/main/scala/org/apache/spark/sql/scripting/SqlScriptingExecutionContext.scala (2 lines):
- line 70: // TODO: After introducing stored procedures, we need to handle the case with multiple
- line 137: // TODO: Introduce a separate class for different frame types (Script, Stored Procedure,
mllib/src/main/scala/org/apache/spark/ml/regression/Regressor.scala (2 lines):
- line 36: // TODO: defaultEvaluator (follow-up PR)
- line 48: // TODO: defaultEvaluator (follow-up PR)
mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala (2 lines):
- line 313: // TODO: predicted labels are +1 or -1 for GBT. Need a better way to store this info.
- line 402: // TODO: Fix this issue for real.
connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/KafkaRDD.scala (2 lines):
- line 69: // TODO is it necessary to have separate configs for initial poll time vs ongoing poll time?
- line 164: // TODO what about hosts specified by ip vs name
mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala (2 lines):
- line 370: * TODO: Make the use of zero matrices more storage efficient.
- line 541: // TODO: Try to use aggregateByKey instead of reduceByKey to get rid of intermediate matrices
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala (2 lines):
- line 46: // TODO: We should tighten up visibility of the classes here once we clean up Hive coupling.
- line 176: // TODO: Selective case sensitivity.
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/V2ExpressionBuilder.scala (2 lines):
- line 309: // TODO supports other expressions
- line 390: // TODO supports other aggregate functions
core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala (2 lines):
- line 203: // TODO: support accumulator in multiple UDF
- line 665: // TODO: This has a race condition if interruption occurs, as completed may still become true.
mllib/src/main/scala/org/apache/spark/ml/param/params.scala (2 lines):
- line 1026: // TODO: Provide a better method name for Java users.
- line 1035: // TODO: Provide a better method name for Java users.
sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ProgressReporter.scala (2 lines):
- line 168: // TODO: Restore this from the checkpoint when possible.
- line 616: * TODO: We do not seem to clear up all values in StateOperatorProgress which are bound to the
core/src/main/scala/org/apache/spark/util/random/StratifiedSamplingUtils.scala (2 lines):
- line 97: // TODO change this to the streaming version
- line 241: // TODO implement the streaming version of sampling w/ replacement that doesn't require counts
core/src/main/resources/org/apache/spark/ui/static/dataTables.rowsGroup.js (2 lines):
- line 212: TODO: Provide function which determines the all s and s with "rowspan" html-attribute is parent (groupped) for the specified or . To use in selections, editing or hover styles.
- line 214: TODO: Feature
sql/core/src/main/scala/org/apache/spark/sql/execution/ShuffledRowRDD.scala (2 lines):
- line 152: // TODO this check is based on assumptions of callers' behavior but is sufficient for now.
- line 172: // TODO order by partition size.
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala (2 lines):
- line 364: * TODO: Validate somewhere (in debug mode?) that children are ordered correctly.
- line 1173: // TODO: currently if the class name ends with "$", we think it's a scala object, there is
sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala (2 lines):
- line 345: * TODO: Remove this. It is used because CreateTempViewUsing is not a Catalyst plan.
- line 984: // TODO: what does this message mean?
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java (2 lines):
- line 688: // TODO: Add more collation-aware string expressions.
- line 717: // TODO: Add other collation-aware expressions.
mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala (2 lines):
- line 409: // TODO: Make standardizeFeatures and standardizeLabel configurable.
- line 528: // TODO: Make standardizeFeatures and standardizeLabel configurable.
sql/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/StreamingForeachBatchHelper.scala (2 lines):
- line 71: // TODO: Add query id to the log.
- line 191: // TODO: Better handling (e.g. retries) on exceptions like EOFException to avoid
sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedValuesReader.java (1 line):
- line 28: * TODO: merge this into parquet-mr.
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/v2ResolutionPlans.scala (1 line):
- line 212: // TODO: create a generic representation for views, after we add view support to v2 catalog. For now
python/pyspark/pandas/supported_api_gen.py (1 line):
- line 395: # TODO: Take into account that this function can create links incorrectly
sql/core/src/main/scala/org/apache/spark/sql/execution/python/streaming/TransformWithStateInPySparkExec.scala (1 line):
- line 213: // TODO SPARK-50180: check if we can return true only if actual timers are registered,
mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala (1 line):
- line 103: // TODO: Sparse representation might be ineffective if (newSize ~= newValues.size)
build/sbt-launch-lib.bash (1 line):
- line 7: # TODO - Should we merge the main SBT script with this library?
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala (1 line):
- line 229: // TODO: support whole stage codegen too
sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala (1 line):
- line 361: // TODO: move these into BytesToBytesMap
mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala (1 line):
- line 439: // TODO Generate RDD[Vector] from multivariate distributions.
core/src/main/scala/org/apache/spark/util/collection/SortDataFormat.scala (1 line):
- line 36: // TODO: Making Buffer a real trait would be a better abstraction, but adds some complexity.
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/XmlInferSchema.scala (1 line):
- line 500: // TODO: Optimise this logic.
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/PushDownLeftSemiAntiJoin.scala (1 line):
- line 183: * TODO:
sql/catalyst/src/main/scala/org/apache/spark/sql/util/SchemaUtils.scala (1 line):
- line 37: * TODO: Merge this file with [[org.apache.spark.ml.util.SchemaUtils]].
python/pyspark/ml/param/_shared_params_code_gen.py (1 line):
- line 93: # TODO: How to correctly inherit instance attributes?
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/DistinctKeyVisitor.scala (1 line):
- line 33: // TODO: Expand distinctKeys for redundant aliases on the same expression
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala (1 line):
- line 153: // TODO: Make CollectSet collation aware
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NormalizeFloatingNumbers.scala (1 line):
- line 83: // TODO: ideally Aggregate should also be handled here, but its grouping expressions are
project/SparkBuild.scala (1 line):
- line 483: // TODO: move this to its upstream project.
common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilterImpl.java (1 line):
- line 131: // TODO: the strategy of `CountMinSketch` looks more advanced, should we follow it here?
connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchStream.scala (1 line):
- line 111: // TODO (SPARK-37973) Directly call super.getDefaultReadLimit when scala issue 12523 is fixed
sql/connect/common/src/main/scala/org/apache/spark/sql/connect/Dataset.scala (1 line):
- line 732: // TODO we need to have a proper way of stabilizing the input data. The current approach does
sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala (1 line):
- line 1015: // TODO: Now, always set environmentContext to null. In the future, we should avoid setting
mllib/src/main/scala/org/apache/spark/ml/classification/Classifier.scala (1 line):
- line 82: // TODO: defaultEvaluator (follow-up PR)
mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala (1 line):
- line 355: // TODO: consolidate aggregates for summary statistics
python/pyspark/ml/util.py (1 line):
- line 1109: # TODO: We need to handle `RFormulaModel.pipelineModel` here after Pyspark RFormulaModel
sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStoreChangelog.scala (1 line):
- line 482: // TODO: reuse the key buffer and value buffer across records.
core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala (1 line):
- line 268: // TODO: When visible is false(the task had failed), we should be asking the block managers to
core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcEnv.scala (1 line):
- line 96: // TODO: a non-blocking TransportClientFactory.createClient in future
sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedPlainValuesReader.java (1 line):
- line 286: // TODO: consider pushing this in ColumnVector by adding a readBytes with a stride.
core/src/main/scala/org/apache/spark/internal/io/SparkHadoopWriterUtils.scala (1 line):
- line 109: // TODO: these don't seem like the right abstractions.
sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/CreateHiveTableAsSelectCommand.scala (1 line):
- line 71: // TODO ideally, we should get the output data ready first and then
mllib/src/main/scala/org/apache/spark/ml/Predictor.scala (1 line):
- line 49: // TODO: Support casting Array[Double] and Array[Float] to Vector when FeaturesType = Vector
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCOptions.scala (1 line):
- line 174: // TODO: to reuse the existing partition parameters for those partition specific options
core/src/main/scala/org/apache/spark/ExecutorAllocationClient.scala (1 line):
- line 116: * (TODO: add a new type like `ExecutorDecommissionInfo` for the
core/src/main/resources/org/apache/spark/ui/static/utils.js (1 line):
- line 279: // TODO: Reused stacktrace-details* style for convenience, but it's not really a stacktrace
python/pyspark/ml/functions.py (1 line):
- line 774: # TODO: adjust return type hint when Iterator[Union[pd.Series, pd.DataFrame]] is supported
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/DeserializerBuildHelper.scala (1 line):
- line 358: // TODO (hvanhovell) this is can be improved.
sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java (1 line):
- line 67: * TODO: move this to the parquet-mr project. There are performance benefits of doing it
sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeShuffleWithLocalRead.scala (1 line):
- line 67: // TODO: this method assumes all shuffle blocks are the same data size. We should calculate the
mllib/src/main/scala/org/apache/spark/mllib/linalg/EigenValueDecomposition.scala (1 line):
- line 50: // TODO: remove this function and use eigs in breeze when switching breeze version
sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/statefulOperators.scala (1 line):
- line 272: // TODO: Make this work for Stream-Stream joins, where we use multiple
python/pyspark/ml/torch/log_communication.py (1 line):
- line 182: # TODO:
resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala (1 line):
- line 203: // TODO: it would be nicer to just make sure there are no null commands here
common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java (1 line):
- line 1196: return 0; // TODO: Fix this behaviour (SPARK-48284)
python/pyspark/worker.py (1 line):
- line 2381: # TODO: Remove the following two lines and use `Process.pid()` when we drop JDK 8.
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSetVariable.scala (1 line):
- line 55: // TODO: we need to group by the qualified variable name once other catalogs support it.
core/src/main/scala/org/apache/spark/deploy/ClientArguments.scala (1 line):
- line 104: // TODO: It wouldn't be too hard to allow users to submit their app and dependency jars
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/jdbc/JDBCWriteBuilder.scala (1 line):
- line 39: // TODO (SPARK-32595): do truncate and append atomically.
sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala (1 line):
- line 64: // TODO: Finish input output types.
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowConverter.scala (1 line):
- line 816: // TODO (SPARK-26174): disallow it with a config.
core/src/main/scala/org/apache/spark/SparkContext.scala (1 line):
- line 2395: // TODO: Cache.stop()?
core/src/main/scala/org/apache/spark/rdd/SequenceFileRDDFunctions.scala (1 line):
- line 54: // TODO We cannot force the return type of `anyToWritable` be same as keyWritableClass and
mllib/src/main/scala/org/apache/spark/ml/r/AFTSurvivalRegressionWrapper.scala (1 line):
- line 70: // TODO: Support dot operator.
core/src/main/java/org/apache/spark/shuffle/sort/ShuffleExternalSorter.java (1 line):
- line 407: // TODO: try to find space in previous pages
core/src/main/scala/org/apache/spark/Dependency.scala (1 line):
- line 183: // TODO: SPARK-35547: Push based shuffle is currently unsupported for Barrier stages
python/pyspark/sql/connect/udf.py (1 line):
- line 165: # TODO: PythonEvalType.SQL_BATCHED_UDF
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala (1 line):
- line 804: // TODO: Merge this and `NamedLambdaVariable`.
sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala (1 line):
- line 259: loadPath: String, // TODO URI
python/pyspark/ml/classification.py (1 line):
- line 3864: # TODO: need to set metadata
sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/thrift/ThriftHttpCLIService.java (1 line):
- line 140: // TODO: check defaults: maxTimeout, keepalive, maxBodySize, bodyReceiveDuration, etc.
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/TableCapabilityCheck.scala (1 line):
- line 47: // TODO: check STREAMING_WRITE capability. It's not doable now because we don't have a
python/pyspark/sql/connect/client/artifact.py (1 line):
- line 248: # TODO: Support directory path.
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DeduplicateRelations.scala (1 line):
- line 256: // TODO (SPARK-44754): we should handle all special cases here.
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala (1 line):
- line 211: // TODO: SPARK-48701: PandasMode (all collations)
python/pyspark/ml/tuning.py (1 line):
- line 115: # TODO: duplicate evaluator to take extra params from input
core/src/main/resources/org/apache/spark/ui/static/historypage.js (1 line):
- line 142: // TODO: Replace hasOwnProperty with prototype.hasOwnProperty after we find it's safe to do.
sql/core/src/main/scala/org/apache/spark/sql/execution/InsertSortForLimitAndOffset.scala (1 line):
- line 36: * TODO: add a order preserving mode in the shuffle reader.
python/pyspark/sql/connect/proto/commands_pb2.pyi (1 line):
- line 1199: """TODO: Consider reusing Explain from AnalyzePlanRequest message.
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala (1 line):
- line 490: * TODO: We could make nullability more precise in foldable cases (e.g., literal input).
sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala (1 line):
- line 145: // TODO: introduce a user defined type for serialized R data.
core/src/main/scala/org/apache/spark/status/AppStatusListener.scala (1 line):
- line 645: // TODO: can this really happen?
sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/RowBasedHashMapGenerator.scala (1 line):
- line 53: // TODO: consider large decimal and interval type
core/src/main/scala/org/apache/spark/util/random/SamplingUtils.scala (1 line):
- line 132: // TODO: Make it tighter.
python/pyspark/sql/connect/client/core.py (1 line):
- line 1985: # TODO: Fix the code: change thread-local `ml_caches` to global `ml_caches`.
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveLateralColumnAliasReference.scala (1 line):
- line 286: // TODO: this condition only guarantees to keep the shape after the plan has
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/EventTimeWatermark.scala (1 line):
- line 84: // TODO: Disallow updating the metadata once we remove the compatibility flag.
sql/api/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala (1 line):
- line 100: // TODO check if this works.
sql/api/src/main/java/org/apache/spark/sql/types/SQLUserDefinedType.java (1 line):
- line 29: // TODO: Should I used @Documented ?
sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java (1 line):
- line 419: // TODO: best perf?
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/datasketchesAggregates.scala (1 line):
- line 145: // TODO: implement support for decimal/datetime/interval types
sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala (1 line):
- line 347: // TODO: Shall use TIMESTAMPLOCALTZ_TYPE, keep AS-IS now for
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/jdbc/JDBCScanBuilder.scala (1 line):
- line 172: // TODO (SPARK-32593): JDBC support nested column and nested column pruning.
sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchangeExec.scala (1 line):
- line 375: // TODO: Handle BroadcastPartitioning.
core/src/main/scala/org/apache/spark/ui/scope/RDDOperationGraph.scala (1 line):
- line 165: // TODO: differentiate between the intention to cache an RDD and whether it's actually cached
sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala (1 line):
- line 658: // TODO: we should do this check earlier when we have capability API.
sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/CoalesceShufflePartitions.scala (1 line):
- line 181: // TODO: match more plan nodes here.
sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala (1 line):
- line 338: // TODO: this is not right for DecimalType with precision > 18
mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala (1 line):
- line 303: // TODO: When we add a generic Boosting class, handle transform there? SPARK-7129
core/src/main/scala/org/apache/spark/rdd/RDDBarrier.scala (1 line):
- line 91: // TODO: [SPARK-25247] add extra conf to RDDBarrier, e.g., timeout.
mllib/src/main/scala/org/apache/spark/ml/util/SchemaUtils.scala (1 line):
- line 32: // TODO: Move the utility methods to SQL.
sql/core/src/main/scala/org/apache/spark/sql/avro/AvroFileFormat.scala (1 line):
- line 97: // TODO Removes this check once `FileFormat` gets a general file filtering interface method.
sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/util/HadoopCompressionCodec.java (1 line):
- line 44: // TODO supports ZStandardCodec
python/run-tests.py (1 line):
- line 64: # TODO: revisit for Scala 2.13
core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala (1 line):
- line 116: // TODO: SPARK-48789: the naming is confusing since this does not really reflect the whole
connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala (1 line):
- line 146: // TODO (SPARK-37973) Directly call super.getDefaultReadLimit when scala issue 12523 is fixed
sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala (1 line):
- line 38: // TODO: move SchemaPruning into catalyst
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/randomExpressions.scala (1 line):
- line 35: * TODO: This can be made generic to generate any type of random distribution, or any type of
mllib/src/main/scala/org/apache/spark/ml/tree/impl/BaggedPoint.scala (1 line):
- line 66: // TODO: implement weighted bootstrapping
mllib/src/main/scala/org/apache/spark/mllib/stat/correlation/PearsonCorrelation.scala (1 line):
- line 64: // TODO remove once covariance numerical issue resolved.
mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala (1 line):
- line 438: * TODO: Change to always do bootstrapping (simpler). SPARK-7130
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala (1 line):
- line 146: // TODO: remove operators from this list as support for avro encoding is added
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/EliminateResolvedHint.scala (1 line):
- line 88: // TODO revisit this logic:
sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StateTypesEncoderUtils.scala (1 line):
- line 109: // TODO: validate places that are trying to encode the key and check if we can eliminate/
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala (1 line):
- line 1077: * TODO: remove this after we completely make hive as a data source.
mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala (1 line):
- line 52: // TODO: computation of statistics may take seconds, so save it to KMeansModel in training
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala (1 line):
- line 86: }.asInstanceOf[A] // Kind of a hack, but safe. TODO: Tighten return type when possible.
resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala (1 line):
- line 127: * TODO: If the OOM is not recoverable by rescheduling it on different node, then do
sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashJoin.scala (1 line):
- line 728: // TODO: support BooleanType, DateType and TimestampType
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Utils.scala (1 line):
- line 146: // TODO: Non-catalog paths for DSV2 are currently not well defined.
python/pyspark/pandas/sql_processor.py (1 line):
- line 296: # TODO: use a string builder
sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TypedAggregateExpression.scala (1 line):
- line 112: // TODO: merge these 2 implementations once we refactor the `AggregateFunction` interface.
python/pyspark/sql/session.py (1 line): - line 1580: # TODO: Apply the logic below when self._jconf.arrowPySparkEnabled() is True
python/pyspark/sql/connect/session.py (1 line): - line 709: # TODO: Beside the validation on number of columns, we should also check
mllib/src/main/scala/org/apache/spark/ml/tree/impl/DTStatsAggregator.scala (1 line): - line 169: // TODO: Test BLAS.axpy
resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocator.scala (1 line): - line 295: // TODO: with dynamic allocation off, handle edge cases if we end up with more running
core/src/main/scala/org/apache/spark/scheduler/dynalloc/ExecutorMonitor.scala (1 line): - line 329: // TODO: Only track used files (SPARK-31974)
core/src/main/scala/org/apache/spark/deploy/client/StandaloneAppClient.scala (1 line): - line 163: // FIXME How to handle the following cases?
sql/api/src/main/scala/org/apache/spark/sql/Row.scala (1 line): - line 79: // TODO: Improve the performance of this if used in performance critical part.
core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala (1 line): - line 186: // TODO: stop combining if we find that the reduction factor isn't high
mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala (1 line): - line 338: * TODO: SPARK-20443 - expose blockSize as a param?
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/object.scala (1 line): - line 806: // TODO (SPARK-44225): Move this into analyzer
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala (1 line): - line 70: * TODO: we should just have different traits for the different formats.
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala (1 line): - line 894: // TODO: support whole stage codegen
python/pyspark/ml/torch/data.py (1 line): - line 91: # TODO: we can optimize this further by directly extracting
python/pyspark/ml/feature.py (1 line): - line 5854: TODO: Future extensions: The following functionality is planned for the future:
python/pyspark/pandas/_typing.py (1 line): - line 44: # TODO: use the actual type parameters.  [see the sketch below]
sql/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/StreamingQueryListenerHelper.scala (1 line): - line 85: // TODO: Reuse the same method in StreamingForeachBatchHelper to avoid duplication.
sql/catalyst/src/main/scala/org/apache/spark/sql/internal/connector/SupportsStreamingUpdateAsAppend.scala (1 line): - line 30: // TODO: design an official API for streaming output mode UPDATE which can do the upsert
core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala (1 line): - line 65: // TODO: We should consider increasing the number of this parameter over time
mllib/src/main/scala/org/apache/spark/ml/fpm/FPGrowth.scala (1 line): - line 327: // TODO: Implement this method.
core/src/main/scala/org/apache/spark/rdd/ShuffledRDD.scala (1 line): - line 39: // TODO: Make this return RDD[Product2[K, C]] or have some way to configure mutable pairs
core/src/main/scala/org/apache/spark/util/collection/AppendOnlyMap.scala (1 line): - line 37: * TODO: Cache the hash values of each key? java.util.HashMap does that.
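The "use the actual type parameters" note in python/pyspark/pandas/_typing.py above points at the standard `typing` machinery for parameterized types. A minimal, generic sketch of declaring and using a real type parameter (illustrative only; the `Labeled` class is hypothetical and not a pandas-on-Spark type):

```python
from typing import Generic, TypeVar

T = TypeVar("T")


class Labeled(Generic[T]):
    """A container with a real type parameter (hypothetical example)."""

    def __init__(self, label: str, value: T) -> None:
        self.label = label
        self.value = value

    def get(self) -> T:
        return self.value


# Type checkers now distinguish Labeled[int] from Labeled[str].
count: Labeled[int] = Labeled("rows", 42)
table: Labeled[str] = Labeled("name", "sales")
print(count.get() + 1, table.get().upper())  # 43 SALES
```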
mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala (1 line): - line 333: // TODO: use a numerically stable approach to estimate cost
resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala (1 line): - line 65: // TODO: Currently, task to container is computed once (TaskSetManager) - which need not be
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala (1 line): - line 69: // TODO: improve error message for java bean encoder.
python/pyspark/cloudpickle/cloudpickle.py (1 line): - line 1342: # TODO: decorrelate reducer_override (which is tied to CPython's
core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala (1 line): - line 611: // TODO: Support distributing R packages with standalone cluster
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/state/StateDataSource.scala (1 line): - line 95: // SPARK-51779 TODO: Support stream-stream joins with virtual column families
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SerializerBuildHelper.scala (1 line): - line 483: // TODO replace this with `createSerializerForPrimitiveArray` as
core/src/main/scala/org/apache/spark/rdd/RDDCheckpointData.scala (1 line): - line 51: // TODO: are we sure we need to use a global lock in the following methods?
connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/consumer/InternalKafkaConsumerPool.scala (1 line): - line 132: // TODO: revisit the relation between CacheKey and kafkaParams - for now it looks a bit weird
sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/OperationType.java (1 line): - line 46: // TODO: replace this with a Map?
core/src/main/scala/org/apache/spark/storage/BlockManagerDecommissioner.scala (1 line): - line 375: // TODO: We can sort these blocks based on some policy (LRU/blockSize etc)
sql/core/src/main/scala/org/apache/spark/sql/classic/SQLContext.scala (1 line): - line 75: // TODO: move this logic into SparkSession
resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientArguments.scala (1 line): - line 25: // TODO: Add code and support for ensuring that yarn resource 'tasks' are location aware !
sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AQEPropagateEmptyRelation.scala (1 line): - line 91: // TODO: we can return the original query plan before broadcast.
mllib/src/main/scala/org/apache/spark/ml/tree/treeModels.scala (1 line): - line 221: // TODO: In the future, also support normalizing by tree.rootNode.impurityStats.count?
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveReferencesInAggregate.scala (1 line): - line 79: // TODO: currently we don't support LCA in `groupingExpressions` yet.
mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala (1 line): - line 189: // TODO: Allow the user to vary the number of bins using a setBins method in
python/pyspark/mllib/linalg/__init__.py (1 line): - line 1555: # TODO: More efficient implementation:
mllib/src/main/scala/org/apache/spark/ml/tree/impl/RandomForest.scala (1 line): - line 157: // TODO: Calculate memory usage more precisely.
mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala (1 line): - line 1086: // TODO: Generalize PeriodicGraphCheckpointer and use it here.
core/src/main/scala/org/apache/spark/rdd/OrderedRDDFunctions.scala (1 line): - line 60: // TODO: this currently doesn't work on P other than Tuple2!
sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AQEShuffleReadExec.scala (1 line): - line 59: // TODO this check is based on assumptions of callers' behavior but is sufficient for now.
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala (1 line): - line 392: // TODO support multi column NULL-aware anti join in future.
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapBuilder.scala (1 line): - line 55: // TODO: specialize it
core/src/main/scala/org/apache/spark/TestUtils.scala (1 line): - line 62: * TODO: See if we can move this to the test codebase by specifying
sql/core/src/main/scala/org/apache/spark/sql/classic/DataFrameNaFunctions.scala (1 line): - line 189: * TODO: This can be optimized to use broadcast join when replacementMap is large.
sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala (1 line): - line 351: // TODO: revisit it. If left side is much smaller than the right side, it may be better
core/src/main/scala/org/apache/spark/deploy/worker/DriverRunner.scala (1 line): - line 193: // TODO: If we add ability to submit multiple jars they should also be added here
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/DecorrelateInnerQuery.scala (1 line): - line 435: // TODO: add a more general rule to optimize join with OneRowRelation.
mllib/src/main/scala/org/apache/spark/ml/feature/UnivariateFeatureSelector.scala (1 line): - line 453: // TODO: Sparse representation might be ineffective if (newSize ~= newValues.size)
python/pyspark/sql/udf.py (1 line): - line 363: # TODO: PythonEvalType.SQL_BATCHED_UDF
sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/TransformWithStateExec.scala (1 line): - line 103: // TODO SPARK-50180: check if we can return true only if actual timers are registered,
python/pyspark/core/status.py (1 line): - line 109: # TODO: fetch them in batch for better performance  [see the sketch below]
sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala (1 line): - line 178: // TODO: revisit this. We can consider reordering predicates as well.
common/utils/src/main/scala/org/apache/spark/storage/StorageLevel.scala (1 line): - line 47: // TODO: Also add fields for caching priority, dataset ID, and flushing.
core/src/main/scala/org/apache/spark/deploy/worker/CommandUtils.scala (1 line): - line 115: // TODO: It would be nice to add a shutdown hook here that explains why the output is
core/src/main/scala/org/apache/spark/rdd/LocalRDDCheckpointData.scala (1 line): - line 49: // must cache any missing partitions. TODO: avoid running another job here (SPARK-8582).
python/pyspark/pandas/indexing.py (1 line): - line 663: self._psdf_or_psser._column_label, scol # TODO: dtype?
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/UnsupportedExpressionInOperatorValidation.scala (1 line): - line 62: // TODO: check if we are resolving a lateral join condition once lateral join is supported.
sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/Operation.java (1 line): - line 300: // TODO: make this abstract and implement in subclasses.
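The "fetch them in batch" note in python/pyspark/core/status.py above describes a common remote-call pattern: issue one request per chunk of ids rather than one request per id, so the per-call overhead is paid far less often. A minimal, generic sketch of that pattern (illustrative only; `fetch_many` is a hypothetical stand-in for whatever bulk API is available, not a PySpark function):

```python
from typing import Dict, Iterator, List, Sequence


def chunked(ids: Sequence[int], size: int) -> Iterator[Sequence[int]]:
    """Yield consecutive slices of at most ``size`` ids."""
    for start in range(0, len(ids), size):
        yield ids[start:start + size]


def fetch_many(ids: Sequence[int]) -> Dict[int, str]:
    """Hypothetical bulk lookup: one round trip returns many results."""
    return {i: f"status-{i}" for i in ids}


def fetch_statuses(ids: Sequence[int], batch_size: int = 100) -> List[str]:
    results: Dict[int, str] = {}
    for batch in chunked(ids, batch_size):  # one call per batch, not per id
        results.update(fetch_many(batch))
    return [results[i] for i in ids]


print(fetch_statuses(list(range(5)), batch_size=2))
```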
sql/connect/server/src/main/scala/org/apache/spark/sql/connect/service/ExecuteEventsManager.scala (1 line): - line 31: // TODO: Make this configurable
sql/catalyst/src/main/java/org/apache/spark/sql/connector/util/V2ExpressionSQLBuilder.java (1 line): - line 128: // TODO supports other expressions
core/src/main/scala/org/apache/spark/executor/Executor.scala (1 line): - line 739: // TODO: do not serialize value twice
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala (1 line): - line 33: // TODO: This is boxing. We should specialize.
mllib/src/main/scala/org/apache/spark/ml/tuning/TrainValidationSplit.scala (1 line): - line 158: // TODO: duplicate evaluator to take extra params from input
common/network-common/src/main/java/org/apache/spark/network/crypto/AuthClientBootstrap.java (1 line): - line 60: // TODO: right now this behaves like the SASL backend, because when executors start up
python/pyspark/ml/linalg/__init__.py (1 line): - line 1338: # TODO: More efficient implementation:
core/src/main/scala/org/apache/spark/rdd/ParallelCollectionRDD.scala (1 line): - line 91: // TODO: Right now, each split sends along its full data, even if later down the RDD chain it gets
sql/connect/common/src/main/scala/org/apache/spark/sql/connect/client/arrow/ArrowSerializer.scala (1 line): - line 242: // TODO throw better errors on class cast exceptions.
core/src/main/scala/org/apache/spark/SparkConf.scala (1 line): - line 721: * TODO: consolidate it with `ConfigBuilder.withAlternative`.
sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggregationIterator.scala (1 line): - line 61: * TODO: AggregateMode should have only two modes: Update and Merge, AggregateExpression
mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala (1 line): - line 808: // TODO: Implement this method.
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveColumnDefaultInCommandInputQuery.scala (1 line): - line 46: // TODO (SPARK-43752): support v2 write commands as well.
sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousQueuedDataReader.scala (1 line): - line 101: // TODO: The obvious generalization of this logic to multiple stages won't work. It's
sql/core/src/main/scala/org/apache/spark/sql/classic/Catalog.scala (1 line): - line 287: // TODO: The SHOW FUNCTIONS should tell us the function type (built-in, temp, persistent) and
resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocatorNodeHealthTracker.scala (1 line): - line 139: // TODO - We need to update once Hadoop changes -
sql/core/src/main/scala/org/apache/spark/sql/execution/GenerateExec.scala (1 line): - line 32: * TODO reusing the CompletionIterator?
sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/CLIServiceClient.java (1 line): - line 40: // TODO: provide STATIC default value
sql/connect/common/src/main/scala/org/apache/spark/sql/connect/columnNodeSupport.scala (1 line): - line 174: // TODO we should probably 'just' detect this particular scenario
streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala (1 line): - line 428: * TODO Should poll the executor number and wait for executors according to
sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeMapData.java (1 line): - line 46: // TODO: Use a more efficient format which doesn't depend on unsafe array.
sql/api/src/main/scala/org/apache/spark/sql/types/UpCastRule.scala (1 line): - line 45: // TODO: allow upcast from int/double/decimal to char/varchar of sufficient length
core/src/main/scala/org/apache/spark/scheduler/ExecutorResourcesAmounts.scala (1 line): - line 131: * TODO: as we consistently allocate addresses beginning from the "small" address, it can
python/pyspark/shuffle.py (1 line): - line 71: # TODO: support windows
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/V2ExpressionUtils.scala (1 line): - line 178: // TODO: handle functions defined in Scala too - in Scala, even if a
python/pyspark/pandas/datetimes.py (1 line): - line 51: # TODO: Hit a weird exception
sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/OperationManager.java (1 line): - line 269: //TODO: Since OperationLog is moved to package o.a.h.h.ql.session,
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/state/StreamStreamJoinStatePartitionReader.scala (1 line): - line 108: // TODO after we persistent the StateStoreCheckpointID to the commit log, we can get it from
connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/KafkaDataConsumer.scala (1 line): - line 103: // TODO if the buffer was kept around as a random-access structure,
sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/PruneHiveTablePartitions.scala (1 line): - line 42: * TODO: merge this with PruneFileSourcePartitions after we completely make hive as a data source.
sql/connect/common/src/main/scala/org/apache/spark/sql/connect/UdfToProtoUtils.scala (1 line): - line 89: f.inputEncoders.map(e => agnosticEncoderFor(e.get)) // TODO support Any and UnboundRow.
mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala (1 line): - line 421: // TODO: SPARK-15785 Support users supplied initial GMM.
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ContinuousScanExec.scala (1 line): - line 39: // TODO: unify the equal/hashCode implementation for all data source v2 query plans.
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/BooleanEqualityTypeCoercion.scala (1 line): - line 50: // TODO: Maybe these rules should go into the optimizer.
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/broadcastMode.scala (1 line): - line 38: // TODO: pack the UnsafeRows into single bytes array.
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveCatalogs.scala (1 line): - line 47: // TODO [SPARK-50785]: Uncomment this when For Statement starts properly using local vars.
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/ProjectResolver.scala (1 line): - line 83: // TODO: This validation function does a post-traversal. This is discouraged in
mllib/src/main/scala/org/apache/spark/mllib/feature/StandardScaler.scala (1 line): - line 57: // TODO: skip computation if both withMean and withStd are false
core/src/main/scala/org/apache/spark/BarrierCoordinator.scala (1 line): - line 55: // TODO SPARK-25030 Create a Timer() in the mainClass submitted to SparkSubmit makes it unable to
sql/api/src/main/scala/org/apache/spark/sql/ForeachWriter.scala (1 line): - line 96: // TODO: Move this to org.apache.spark.sql.util or consolidate this with batch API.
core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js (1 line): - line 287: // Link each graph to the corresponding stage page (TODO: handle stage attempts)
sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/TableSchema.java (1 line): - line 40: // TODO: remove this constructor
sql/core/src/main/scala/org/apache/spark/sql/execution/command/v2/V2CommandStrategy.scala (1 line): - line 28: // TODO: move v2 commands to here which are not data source v2 related.
core/src/main/scala/org/apache/spark/api/python/PythonWorkerFactory.scala (1 line): - line 261: // TODO: When we drop JDK 8, we can just use workerProcess.pid()
mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala (1 line): - line 394: TODO: Add simplex constraints to allow alpha in (0,1).
resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala (1 line): - line 1159: // TODO: it would be nicer to just make sure there are no null commands here
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java (1 line): - line 1604: // TODO: Add more collation-aware UTF8String operations here.
sql/core/src/main/java/org/apache/parquet/filter2/predicate/SparkFilterApi.java (1 line): - line 29: * TODO (PARQUET-1809): This is a temporary workaround; it is intended to be moved to Parquet.
sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/SparkParserUtils.scala (1 line): - line 39: // The following 2 lines are exactly what MySQL does TODO: why do we do this?
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/MicroBatchScanExec.scala (1 line): - line 42: // TODO: unify the equal/hashCode implementation for all data source v2 query plans.
core/src/main/scala/org/apache/spark/shuffle/ShuffleReader.scala (1 line): - line 29: * TODO: Add this back when we make the ShuffleReader a developer API that others can implement
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/BatchScanExec.scala (1 line): - line 48: // TODO: unify the equal/hashCode implementation for all data source v2 query plans.
core/src/main/resources/org/apache/spark/ui/static/executorspage.js (1 line): - line 286: // TODO: Replace hasOwnProperty with prototype.hasOwnProperty after we find it's safe to do.
sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala (1 line): - line 227: // TODO: It's possible that only one skewed join in the query plan leads to extra shuffles and
sql/core/src/main/java/org/apache/spark/sql/execution/UnsafeKVExternalSorter.java (1 line): - line 113: // TODO: It's possible to change UnsafeInMemorySorter to have multiple entries with same key,
core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala (1 line): - line 62: // TODO: Don't use Java serialization, use a more cross-version compatible serialization format.
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala (1 line): - line 858: // TODO: remove this `retainedHiveConfigs` after the `RelationConversions` is moved to
common/network-common/src/main/java/org/apache/spark/network/buffer/ManagedBuffer.java (1 line): - line 49: // TODO: Deprecate this, usage may require expensive memory mapping or allocation.
mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala (1 line): - line 180: // TODO: duplicate evaluator to take extra params from input
sql/catalyst/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java (1 line): - line 115: // TODO: this is extremely expensive.
mllib/src/main/scala/org/apache/spark/mllib/rdd/SlidingRDD.scala (1 line): - line 111: // TODO: Override methods such as aggregate, which only requires one Spark job.
mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala (1 line): - line 270: // TODO: When we add a generic Bagging class, handle transform there. SPARK-7128
core/src/main/scala/org/apache/spark/SecurityManager.scala (1 line): - line 353: // TODO: this really should be abstracted somewhere else.
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala (1 line): - line 358: // TODO: Throw when too much is given.
sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala (1 line): - line 150: // TODO: Remove copy logic.
python/pyspark/streaming/dstream.py (1 line): - line 368: # TODO: uncomment this until we have ssc.pickleFileStream()
sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala (1 line): - line 325: // TODO: SPARK-51957: Fix partition column and EMTPY_DATA_SCHEMA naming conflict
mllib/src/main/scala/org/apache/spark/ml/feature/Selector.scala (1 line): - line 384: // TODO: Sparse representation might be ineffective if (newSize ~= newValues.size)
sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveFileFormat.scala (1 line): - line 49: * TODO: implement the read logic.
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/QueryPlanner.scala (1 line): - line 50: * TODO: RIGHT NOW ONLY ONE PLAN IS RETURNED EVER...
sql/api/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala (1 line): - line 149: // TODO: we should only collect properties that have getter and setter. However, some tests
sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/FetchOrientation.java (1 line): - line 46: // TODO: Should this really default to FETCH_NEXT?
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2Writes.scala (1 line): - line 142: // TODO: we should do this check earlier when we have capability API.
sql/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala (1 line): - line 1379: // TODO this might be too complex for no good reason. It might
core/src/main/scala/org/apache/spark/rdd/ReliableCheckpointRDD.scala (1 line): - line 166: // TODO: This is expensive because it computes the RDD again unnecessarily (SPARK-8582)
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala (1 line): - line 183: // TODO: Check that options from the resolved relation match the relation that we are
sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala (1 line): - line 375: // TODO: Support for loading the jars from an already downloaded location.
graphx/src/main/scala/org/apache/spark/graphx/lib/SVDPlusPlus.scala (1 line): - line 68: // TODO: use a fixed random seed
sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala (1 line): - line 145: // TODO When state store checkpoint format V2 is used, after state store checkpoint ID is
common/utils/src/main/scala/org/apache/spark/util/SparkClassUtils.scala (1 line): - line 86: // TODO: the value returned here isn't even quite right; it returns simple names
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala (1 line): - line 212: // TODO: we should have a better separation of row based and batch based scan, so that we
streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala (1 line): - line 412: // TODO: merge callsites with scopes so we can just reuse the code there
core/src/main/scala/org/apache/spark/deploy/Client.scala (1 line): - line 83: // TODO: We could add an env variable here and intercept it in `sc.addJar` that would
core/src/main/scala/org/apache/spark/scheduler/Task.scala (1 line): - line 103: // TODO SPARK-24874 Allow create BarrierTaskContext based on partitions, instead of whether
mllib/src/main/scala/org/apache/spark/ml/util/DatasetUtils.scala (1 line): - line 105: case _ => // TODO: there is no RegressorParams, maybe add it in the future?
mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala (1 line): - line 829: * TODO figure out return type.
python/pyspark/sql/worker/analyze_udtf.py (1 line): - line 281: # TODO: Remove the following two lines and use `Process.pid()` when we drop JDK 8.
streaming/src/main/scala/org/apache/spark/streaming/scheduler/StreamingListenerBus.scala (1 line): - line 98: // events (SPARK-12140). TODO Once SPARK-12140 is resolved we should set it to true.
launcher/src/main/java/org/apache/spark/launcher/CommandBuilderUtils.java (1 line): - line 315: // TODO: change to the correct directory once the assembly build is changed.
sql/connect/common/src/main/scala/org/apache/spark/sql/connect/SparkSession.scala (1 line): - line 691: // TODO: implements all methods mentioned in the scaladoc of [[SparkSession]]
mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala (1 line): - line 204: // TODO: Fix this issue for real.
mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala (1 line): - line 118: // TODO: Define a rigorous naming scheme.
mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala (1 line): - line 337: // TODO: When we add a generic Bagging class, handle transform there: SPARK-7128
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala (1 line): - line 460: * TODO: Look to merge this rule with RewritePredicateSubquery.
core/src/main/scala/org/apache/spark/rpc/RpcEnv.scala (1 line): - line 127: * TODO do we need a timeout parameter?
python/pyspark/pandas/mlflow.py (1 line): - line 55: # TODO: do something smarter, for example when there is a sklearn.Classifier (it should
common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/UploadBlock.java (1 line): - line 38: // TODO: StorageLevel is serialized separately in here because StorageLevel is not available in
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/subquery.scala (1 line): - line 185: * TODO: Currently we don't allow deep correlation. Also, we don't allow mixing of
graphx/src/main/scala/org/apache/spark/graphx/package.scala (1 line): - line 34: // TODO: Consider using Char.
core/src/main/scala/org/apache/spark/scheduler/AccumulableInfo.scala (1 line): - line 48: // TODO: use this to identify internal task metrics instead of encoding it in the name
sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileOperator.scala (1 line): - line 130: // TODO: Check if the paths coming in are already qualified and simplify.
mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala (1 line): - line 286: * TODO: We need to clean it up by separating the logic of regularization out
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala (1 line): - line 250: * TODO: we should remove the special handling for hive tables after completely making hive as a
mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala (1 line): - line 46: // TODO: Allow different IDF formulations.
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/AggregateResolver.scala (1 line): - line 109: // TODO: This validation function does a post-traversal. This is discouraged in single-pass
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ReplaceExceptWithFilter.scala (1 line): - line 80: // TODO: This can be further extended in the future.
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala (1 line): - line 1217: // TODO: We need to figure out how these methods interact with our data source
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala (1 line): - line 166: // TODO: if you move this into the closure it reverts to the default values.
mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala (1 line): - line 137: require(doc.nonEmpty) // TODO: more rigorous on doc
sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveDataSource.scala (1 line): - line 113: // TODO (SPARK-27483): we should move this fallback logic to an analyzer rule.
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala (1 line): - line 268: // TODO more implicit class for literal?
sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala (1 line): - line 181: // TODO work around for set the log output to console, because the HiveContext