Sources/SparkConnect/DataFrameWriterV2.swift
//
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//
import Foundation

/// Interface used to write a ``DataFrame`` to external storage using the v2 API.
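///
/// A minimal usage sketch; the catalog and table names are illustrative, and
/// `writeTo(_:)` is assumed to be the companion ``DataFrame`` method that
/// constructs this writer:
///
/// ```swift
/// // Given a DataFrame `df`:
/// try await df.writeTo("testcat.db.people")
///   .using("orc")
///   .createOrReplace()
/// ```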
public actor DataFrameWriterV2: Sendable {
  let tableName: String
  let df: DataFrame

  var provider: String? = nil
  var extraOptions: CaseInsensitiveDictionary = CaseInsensitiveDictionary()
  var tableProperties: CaseInsensitiveDictionary = CaseInsensitiveDictionary()
  var partitioningColumns: [Spark_Connect_Expression] = []
  var clusteringColumns: [String]? = nil

  init(_ table: String, _ df: DataFrame) {
    self.tableName = table
    self.df = df
  }

  /// Specifies a provider for the underlying output data source. Spark's default catalog supports
  /// "orc", "json", etc.
  /// - Parameter provider: A data source provider name such as "orc", "json", or "parquet".
  /// - Returns: A ``DataFrameWriterV2``.
  public func using(_ provider: String) -> DataFrameWriterV2 {
    self.provider = provider
    return self
  }

  /// Adds an output option for the underlying data source.
  /// - Parameters:
  ///   - key: A key string.
  ///   - value: A value string.
  /// - Returns: A ``DataFrameWriterV2``.
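  ///
  /// For example (option keys are data-source specific; `compression` here is
  /// illustrative):
  ///
  /// ```swift
  /// try await df.writeTo("testcat.db.people")
  ///   .using("orc")
  ///   .option("compression", "zstd")
  ///   .create()
  /// ```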
  public func option(_ key: String, _ value: String) -> DataFrameWriterV2 {
    self.extraOptions[key] = value
    return self
  }

  /// Add a table property.
  /// - Parameters:
  ///   - property: A property name.
  ///   - value: A property value.
  /// - Returns: A ``DataFrameWriterV2``.
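  ///
  /// For example (the property name and value are illustrative; supported
  /// properties depend on the catalog and data source):
  ///
  /// ```swift
  /// try await df.writeTo("testcat.db.people")
  ///   .tableProperty(property: "owner", value: "analytics")
  ///   .create()
  /// ```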
  public func tableProperty(property: String, value: String) -> DataFrameWriterV2 {
    self.tableProperties[property] = value
    return self
  }

  /// Partition the output table created by `create`, `createOrReplace`, or `replace` using the
  /// given columns or transforms.
  /// - Parameter columns: Columns to partition by.
  /// - Returns: A ``DataFrameWriterV2``.
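  ///
  /// For example (hypothetical table and column names):
  ///
  /// ```swift
  /// try await df.writeTo("testcat.db.events")
  ///   .partitionBy("year", "month")
  ///   .create()
  /// ```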
  public func partitionBy(_ columns: String...) -> DataFrameWriterV2 {
    self.partitioningColumns = columns.map {
      var expr = Spark_Connect_Expression()
      expr.expressionString = $0.toExpressionString
      return expr
    }
    return self
  }

  /// Clusters the output by the given columns on the storage. The rows with matching values in the
  /// specified clustering columns will be consolidated within the same group.
  /// - Parameter columns: Columns to cluster by.
  /// - Returns: A ``DataFrameWriterV2``.
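  ///
  /// For example (hypothetical table and column names):
  ///
  /// ```swift
  /// try await df.writeTo("testcat.db.events")
  ///   .clusterBy("id")
  ///   .createOrReplace()
  /// ```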
  public func clusterBy(_ columns: String...) -> DataFrameWriterV2 {
    self.clusteringColumns = columns
    return self
  }

  /// Create a new table from the contents of the data frame.
  public func create() async throws {
    try await executeWriteOperation(.create)
  }

  /// Replace an existing table with the contents of the data frame.
  public func replace() async throws {
    try await executeWriteOperation(.replace)
  }

  /// Create a new table or replace an existing table with the contents of the data frame.
  public func createOrReplace() async throws {
    try await executeWriteOperation(.createOrReplace)
  }

  /// Append the contents of the data frame to the output table.
  public func append() async throws {
    try await executeWriteOperation(.append)
  }

  /// Overwrite rows matching the given filter condition with the contents of the ``DataFrame`` in
  /// the output table.
  /// - Parameter condition: A filter condition (a SQL predicate string).
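  ///
  /// For example, replacing only rows whose `age` column is below 18
  /// (hypothetical table and column names):
  ///
  /// ```swift
  /// try await df.writeTo("testcat.db.people")
  ///   .overwrite(condition: "age < 18")
  /// ```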
  public func overwrite(condition: String) async throws {
    try await executeWriteOperation(.overwrite, condition)
  }

  /// Overwrite all partitions for which the ``DataFrame`` contains at least one row with the
  /// contents of the data frame in the output table.
  /// This operation is equivalent to Hive's `INSERT OVERWRITE ... PARTITION`, which replaces
  /// partitions dynamically depending on the contents of the ``DataFrame``.
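  ///
  /// For example, assuming the target table is partitioned and `updates` holds
  /// replacement rows for some partitions (all names illustrative):
  ///
  /// ```swift
  /// try await updates.writeTo("testcat.db.events")
  ///   .overwritePartitions()
  /// ```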
  public func overwritePartitions() async throws {
    try await executeWriteOperation(.overwritePartitions)
  }

  private func executeWriteOperation(
    _ mode: WriteOperationV2.Mode, _ condition: String? = nil
  ) async throws {
    var write = WriteOperationV2()

    // Attach the logical plan of the source data frame as the input relation.
    let plan = await self.df.getPlan() as! Plan
    write.input = plan.root
    write.tableName = self.tableName
    if let provider = self.provider {
      write.provider = provider
    }
    write.partitioningColumns = self.partitioningColumns
    if let clusteringColumns = self.clusteringColumns {
      write.clusteringColumns = clusteringColumns
    }
    // Only `overwrite(condition:)` supplies a condition; it maps to the
    // `overwrite_condition` expression of the Spark Connect WriteOperationV2 message.
    if let condition {
      var expr = Spark_Connect_Expression()
      expr.expressionString = condition.toExpressionString
      write.overwriteCondition = expr
    }
    for option in self.extraOptions.toStringDictionary() {
      write.options[option.key] = option.value
    }
    for property in self.tableProperties.toStringDictionary() {
      write.tableProperties[property.key] = property.value
    }
    write.mode = mode

    // Wrap the write into a Spark Connect command and execute it on the session.
    var command = Spark_Connect_Command()
    command.writeOperationV2 = write
    try await df.spark.client.execute(df.spark.sessionID, command)
  }
}