#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This file enumerates the various IOs that are available by default as
# top-level transforms in Beam's YAML.
#
# Note that there may be redundant implementations. In these cases the specs
# should be kept in sync.
# TODO(yaml): See if this can be enforced programmatically.
# BigQuery
- type: renaming
transforms:
'ReadFromBigQuery': 'ReadFromBigQuery'
'WriteToBigQuery': 'WriteToBigQuery'
config:
mappings:
'ReadFromBigQuery':
query: 'query'
table: 'table_spec'
fields: 'selected_fields'
row_restriction: 'row_restriction'
'WriteToBigQuery':
table: 'table'
create_disposition: 'create_disposition'
write_disposition: 'write_disposition'
error_handling: 'error_handling'
# TODO(https://github.com/apache/beam/issues/30058): Required until autosharding support is fixed
num_streams: 'num_streams'
underlying_provider:
type: beamJar
transforms:
'ReadFromBigQuery': 'beam:schematransform:org.apache.beam:bigquery_storage_read:v1'
'WriteToBigQuery': 'beam:schematransform:org.apache.beam:bigquery_storage_write:v2'
config:
gradle_target: 'sdks:java:extensions:sql:expansion-service:shadowJar'
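# A minimal illustrative sketch of how the user-facing names mapped above might
# appear in a Beam YAML pipeline; the table, fields, and restriction values are
# placeholders, not defaults:
#
#   - type: ReadFromBigQuery
#     config:
#       table: 'my_project.my_dataset.my_table'
#       fields: ['col1', 'col2']
#       row_restriction: 'col1 IS NOT NULL'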
# Kafka
- type: renaming
transforms:
'ReadFromKafka': 'ReadFromKafka'
'WriteToKafka': 'WriteToKafka'
config:
mappings:
'ReadFromKafka':
'schema': 'schema'
'consumer_config': 'consumer_config_updates'
'format': 'format'
'topic': 'topic'
'bootstrap_servers': 'bootstrap_servers'
'confluent_schema_registry_url': 'confluent_schema_registry_url'
'confluent_schema_registry_subject': 'confluent_schema_registry_subject'
'auto_offset_reset_config': 'auto_offset_reset_config'
'error_handling': 'error_handling'
'file_descriptor_path': 'file_descriptor_path'
'message_name': 'message_name'
'WriteToKafka':
'format': 'format'
'topic': 'topic'
'bootstrap_servers': 'bootstrap_servers'
'producer_config_updates': 'producer_config_updates'
'error_handling': 'error_handling'
'file_descriptor_path': 'file_descriptor_path'
'message_name': 'message_name'
'schema': 'schema'
underlying_provider:
type: beamJar
transforms:
'ReadFromKafka': 'beam:schematransform:org.apache.beam:kafka_read:v1'
'WriteToKafka': 'beam:schematransform:org.apache.beam:kafka_write:v1'
config:
gradle_target: 'sdks:java:io:expansion-service:shadowJar'
managed_replacement:
# The following transforms may be replaced with equivalent managed
# transforms if the pipeline's 'updateCompatibilityBeamVersion' matches the
# provided version.
'ReadFromKafka': '2.65.0'
'WriteToKafka': '2.65.0'
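# An illustrative sketch of a Kafka read using the names mapped above; the
# topic, bootstrap servers, and format values are placeholders:
#
#   - type: ReadFromKafka
#     config:
#       topic: 'my_topic'
#       bootstrap_servers: 'localhost:9092'
#       format: 'RAW'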
# PubSubLite
- type: renaming
transforms:
'ReadFromPubSubLite': 'ReadFromPubSubLite'
'WriteToPubSubLite': 'WriteToPubSubLite'
config:
mappings:
'ReadFromPubSubLite':
'project': 'project'
'schema': 'schema'
'format': 'format'
'subscription_name': 'subscription_name'
'location': 'location'
'attributes': 'attributes'
'attribute_map': 'attribute_map'
'attribute_id': 'attribute_id'
'error_handling': 'error_handling'
'file_descriptor_path': 'file_descriptor_path'
'message_name': 'message_name'
'WriteToPubSubLite':
'project': 'project'
'format': 'format'
'topic_name': 'topic_name'
'location': 'location'
'attributes': 'attributes'
'attribute_id': 'attribute_id'
'error_handling': 'error_handling'
'file_descriptor_path': 'file_descriptor_path'
'message_name': 'message_name'
'schema': 'schema'
underlying_provider:
type: beamJar
transforms:
'ReadFromPubSubLite': 'beam:schematransform:org.apache.beam:pubsublite_read:v1'
'WriteToPubSubLite': 'beam:schematransform:org.apache.beam:pubsublite_write:v1'
config:
gradle_target: 'sdks:java:io:google-cloud-platform:expansion-service:shadowJar'
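# An illustrative sketch of a Pub/Sub Lite read using the names mapped above;
# the project, location, and subscription values are placeholders:
#
#   - type: ReadFromPubSubLite
#     config:
#       project: 'my-project'
#       location: 'us-central1-a'
#       subscription_name: 'my-subscription'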
# TODO(yaml): Tests assume Python providers come before Java ones, hence the
# order below. This should be fixed in the future.
# Python Providers
- type: python
transforms:
'ReadFromBigQuery': 'apache_beam.yaml.yaml_io.read_from_bigquery'
# Disabled until https://github.com/apache/beam/issues/28162 is resolved.
# 'WriteToBigQuery': 'apache_beam.yaml.yaml_io.write_to_bigquery'
'ReadFromText': 'apache_beam.yaml.yaml_io.read_from_text'
'WriteToText': 'apache_beam.yaml.yaml_io.write_to_text'
'ReadFromPubSub': 'apache_beam.yaml.yaml_io.read_from_pubsub'
'WriteToPubSub': 'apache_beam.yaml.yaml_io.write_to_pubsub'
'ReadFromIceberg': 'apache_beam.yaml.yaml_io.read_from_iceberg'
'WriteToIceberg': 'apache_beam.yaml.yaml_io.write_to_iceberg'
'ReadFromTFRecord': 'apache_beam.yaml.yaml_io.read_from_tfrecord'
'WriteToTFRecord': 'apache_beam.yaml.yaml_io.write_to_tfrecord'
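# An illustrative sketch using two of the transforms registered above; the
# paths are placeholders:
#
#   - type: ReadFromText
#     config:
#       path: 'gs://my-bucket/input*.txt'
#   - type: WriteToText
#     input: ReadFromText
#     config:
#       path: 'gs://my-bucket/output'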
# General File Formats
# Declared as a renaming transform to avoid exposing all of the
# (implementation-specific) pandas arguments and to align with a possible Java
# implementation.
# Invoking these directly as a PyTransform remains an option for anyone who
# wants to use these power features in a language-dependent manner.
- type: renaming
transforms:
'ReadFromCsv': 'ReadFromCsv'
'WriteToCsv': 'WriteToCsv'
'ReadFromJson': 'ReadFromJson'
'WriteToJson': 'WriteToJson'
'ReadFromParquet': 'ReadFromParquet'
'WriteToParquet': 'WriteToParquet'
'ReadFromAvro': 'ReadFromAvro'
'WriteToAvro': 'WriteToAvro'
config:
mappings:
'ReadFromCsv':
path: 'path'
delimiter: 'sep'
comment: 'comment'
'WriteToCsv':
path: 'path'
delimiter: 'sep'
'ReadFromJson':
path: 'path'
'WriteToJson':
path: 'path'
'ReadFromParquet':
path: 'file_pattern'
'WriteToParquet':
path: 'file_path_prefix'
'ReadFromAvro':
path: 'file_pattern'
'WriteToAvro':
path: 'file_path_prefix'
defaults:
'ReadFromParquet':
as_rows: True
'ReadFromAvro':
as_rows: True
underlying_provider:
type: python
transforms:
'ReadFromCsv': 'apache_beam.io.ReadFromCsv'
'WriteToCsv': 'apache_beam.io.WriteToCsv'
'ReadFromJson': 'apache_beam.io.ReadFromJson'
'WriteToJson': 'apache_beam.io.WriteToJson'
'ReadFromParquet': 'apache_beam.io.ReadFromParquet'
'WriteToParquet': 'apache_beam.io.WriteToParquet'
'ReadFromAvro': 'apache_beam.io.ReadFromAvro'
'WriteToAvro': 'apache_beam.io.WriteToAvro'
# BeamJar Providers
- type: beamJar
transforms:
'WriteToCsv': 'beam:schematransform:org.apache.beam:csv_write:v1'
'WriteToJson': 'beam:schematransform:org.apache.beam:json_write:v1'
config:
gradle_target: 'sdks:java:extensions:schemaio-expansion-service:shadowJar'
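# An illustrative sketch of a CSV read and write using the names mapped above;
# the paths and delimiter are placeholders:
#
#   - type: ReadFromCsv
#     config:
#       path: 'gs://my-bucket/input.csv'
#       delimiter: ','
#   - type: WriteToCsv
#     input: ReadFromCsv
#     config:
#       path: 'gs://my-bucket/output'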
# Databases
- type: renaming
transforms:
'ReadFromJdbc': 'ReadFromJdbc'
'WriteToJdbc': 'WriteToJdbc'
'ReadFromMySql': 'ReadFromMySql'
'WriteToMySql': 'WriteToMySql'
'ReadFromPostgres': 'ReadFromPostgres'
'WriteToPostgres': 'WriteToPostgres'
'ReadFromOracle': 'ReadFromOracle'
'WriteToOracle': 'WriteToOracle'
'ReadFromSqlServer': 'ReadFromSqlServer'
'WriteToSqlServer': 'WriteToSqlServer'
config:
mappings:
'ReadFromJdbc':
url: 'jdbc_url'
connection_init_sql: 'connection_init_sql'
connection_properties: 'connection_properties'
disable_auto_commit: 'disable_auto_commit'
driver_class_name: 'driver_class_name'
driver_jars: 'driver_jars'
fetch_size: 'fetch_size'
output_parallelization: 'output_parallelization'
password: 'password'
query: 'read_query'
table: 'location'
partition_column: 'partition_column'
num_partitions: 'num_partitions'
type: 'jdbc_type'
username: 'username'
'WriteToJdbc':
url: 'jdbc_url'
auto_sharding: 'autosharding'
connection_init_sql: 'connection_init_sql'
connection_properties: 'connection_properties'
driver_class_name: 'driver_class_name'
driver_jars: 'driver_jars'
password: 'password'
table: 'location'
batch_size: 'batch_size'
type: 'jdbc_type'
username: 'username'
query: 'write_statement'
'ReadFromMySql': 'ReadFromJdbc'
'WriteToMySql': 'WriteToJdbc'
'ReadFromPostgres': 'ReadFromJdbc'
'WriteToPostgres': 'WriteToJdbc'
'ReadFromOracle': 'ReadFromJdbc'
'WriteToOracle': 'WriteToJdbc'
'ReadFromSqlServer': 'ReadFromJdbc'
'WriteToSqlServer': 'WriteToJdbc'
defaults:
'ReadFromMySql':
driver_class_name: ''
driver_jars: ''
jdbc_type: ''
'WriteToMySql':
driver_class_name: ''
driver_jars: ''
jdbc_type: ''
'ReadFromPostgres':
connection_init_sql: ''
driver_class_name: ''
driver_jars: ''
jdbc_type: ''
'WriteToPostgres':
connection_init_sql: ''
driver_class_name: ''
driver_jars: ''
jdbc_type: ''
'ReadFromOracle':
connection_init_sql: ''
driver_class_name: ''
driver_jars: ''
jdbc_type: ''
'WriteToOracle':
connection_init_sql: ''
driver_class_name: ''
driver_jars: ''
jdbc_type: ''
'ReadFromSqlServer':
connection_init_sql: ''
driver_class_name: ''
driver_jars: ''
jdbc_type: ''
'WriteToSqlServer':
connection_init_sql: ''
driver_class_name: ''
driver_jars: ''
jdbc_type: ''
underlying_provider:
type: beamJar
transforms:
'ReadFromJdbc': 'beam:schematransform:org.apache.beam:jdbc_read:v1'
'ReadFromMySql': 'beam:schematransform:org.apache.beam:mysql_read:v1'
'ReadFromPostgres': 'beam:schematransform:org.apache.beam:postgres_read:v1'
'ReadFromOracle': 'beam:schematransform:org.apache.beam:oracle_read:v1'
'ReadFromSqlServer': 'beam:schematransform:org.apache.beam:sql_server_read:v1'
'WriteToJdbc': 'beam:schematransform:org.apache.beam:jdbc_write:v1'
'WriteToMySql': 'beam:schematransform:org.apache.beam:mysql_write:v1'
'WriteToPostgres': 'beam:schematransform:org.apache.beam:postgres_write:v1'
'WriteToOracle': 'beam:schematransform:org.apache.beam:oracle_write:v1'
'WriteToSqlServer': 'beam:schematransform:org.apache.beam:sql_server_write:v1'
config:
gradle_target: 'sdks:java:extensions:schemaio-expansion-service:shadowJar'
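# An illustrative sketch of a Postgres read using the names mapped above; the
# JDBC URL, table, and credentials are placeholders:
#
#   - type: ReadFromPostgres
#     config:
#       url: 'jdbc:postgresql://localhost:5432/my_database'
#       table: 'my_table'
#       username: 'my_user'
#       password: 'my_password'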
# Spanner
- type: renaming
transforms:
'ReadFromSpanner': 'ReadFromSpanner'
'WriteToSpanner': 'WriteToSpanner'
config:
mappings:
'ReadFromSpanner':
project: 'project_id'
instance: 'instance_id'
database: 'database_id'
table: 'table_id'
query: 'query'
columns: 'columns'
index: 'index'
batching: 'batching'
'WriteToSpanner':
project: 'project_id'
instance: 'instance_id'
database: 'database_id'
table: 'table_id'
error_handling: 'error_handling'
underlying_provider:
type: beamJar
transforms:
'ReadFromSpanner': 'beam:schematransform:org.apache.beam:spanner_read:v1'
'WriteToSpanner': 'beam:schematransform:org.apache.beam:spanner_write:v1'
config:
gradle_target: 'sdks:java:io:google-cloud-platform:expansion-service:shadowJar'
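# An illustrative sketch of a Spanner read using the names mapped above; the
# project, instance, database, and query values are placeholders:
#
#   - type: ReadFromSpanner
#     config:
#       project: 'my-project'
#       instance: 'my-instance'
#       database: 'my-database'
#       query: 'SELECT * FROM my_table'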
# TFRecord
- type: renaming
transforms:
'ReadFromTFRecord': 'ReadFromTFRecord'
'WriteToTFRecord': 'WriteToTFRecord'
config:
mappings:
'ReadFromTFRecord':
file_pattern: 'file_pattern'
compression_type: 'compression'
validate: 'validate'
'WriteToTFRecord':
file_path_prefix: 'output_prefix'
shard_name_template: 'shard_template'
file_name_suffix: 'filename_suffix'
num_shards: 'num_shards'
compression_type: 'compression'
underlying_provider:
type: beamJar
transforms:
'ReadFromTFRecord': 'beam:schematransform:org.apache.beam:tfrecord_read:v1'
'WriteToTFRecord': 'beam:schematransform:org.apache.beam:tfrecord_write:v1'
config:
gradle_target: 'sdks:java:io:expansion-service:shadowJar'
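# An illustrative sketch of a TFRecord read and write using the names mapped
# above; the file pattern and output prefix are placeholders:
#
#   - type: ReadFromTFRecord
#     config:
#       file_pattern: 'gs://my-bucket/records*.tfrecord'
#   - type: WriteToTFRecord
#     input: ReadFromTFRecord
#     config:
#       file_path_prefix: 'gs://my-bucket/output'
#       num_shards: 1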