in datacatalog/snippets/create_fileset.py [0:0]
def create_fileset(override_values):
"""Creates a fileset within an entry group."""
# [START data_catalog_create_fileset]
# Import required modules.
from google.cloud import datacatalog_v1
# TODO: Set these values before running the sample.
project_id = "project_id"
fileset_entry_group_id = "entry_group_id"
fileset_entry_id = "entry_id"
# [END data_catalog_create_fileset]
# To facilitate testing, we replace values with alternatives
# provided by the testing harness.
project_id = override_values.get("project_id", project_id)
fileset_entry_group_id = override_values.get(
"fileset_entry_group_id", fileset_entry_group_id
)
fileset_entry_id = override_values.get("fileset_entry_id", fileset_entry_id)
# [START data_catalog_create_fileset]
# For all regions available, see:
# https://cloud.google.com/data-catalog/docs/concepts/regions
location = "us-central1"
datacatalog = datacatalog_v1.DataCatalogClient()
# Create an Entry Group.
entry_group_obj = datacatalog_v1.types.EntryGroup()
entry_group_obj.display_name = "My Fileset Entry Group"
entry_group_obj.description = "This Entry Group consists of ...."
entry_group = datacatalog.create_entry_group(
parent=datacatalog_v1.DataCatalogClient.common_location_path(
project_id, location
),
entry_group_id=fileset_entry_group_id,
entry_group=entry_group_obj,
)
print(f"Created entry group: {entry_group.name}")
# Create a Fileset Entry.
entry = datacatalog_v1.types.Entry()
entry.display_name = "My Fileset"
entry.description = "This fileset consists of ...."
entry.gcs_fileset_spec.file_patterns.append("gs://my_bucket/*.csv")
entry.type_ = datacatalog_v1.EntryType.FILESET
# Create the Schema, for example when you have a csv file.
entry.schema.columns.append(
datacatalog_v1.types.ColumnSchema(
column="first_name",
description="First name",
mode="REQUIRED",
type_="STRING",
)
)
entry.schema.columns.append(
datacatalog_v1.types.ColumnSchema(
column="last_name", description="Last name", mode="REQUIRED", type_="STRING"
)
)
# Create the addresses parent column
addresses_column = datacatalog_v1.types.ColumnSchema(
column="addresses", description="Addresses", mode="REPEATED", type_="RECORD"
)
# Create sub columns for the addresses parent column
addresses_column.subcolumns.append(
datacatalog_v1.types.ColumnSchema(
column="city", description="City", mode="NULLABLE", type_="STRING"
)
)
addresses_column.subcolumns.append(
datacatalog_v1.types.ColumnSchema(
column="state", description="State", mode="NULLABLE", type_="STRING"
)
)
entry.schema.columns.append(addresses_column)
entry = datacatalog.create_entry(
parent=entry_group.name, entry_id=fileset_entry_id, entry=entry
)
print(f"Created fileset entry: {entry.name}")