# in scripts/scaffold.py

import pathlib

import click
from ruamel.yaml import YAML
from ruamel.yaml.comments import CommentedMap

yaml = YAML()

# NOTE: PROJECT_ROOT, DATASETS_PATH, and license_header are module-level names
# defined elsewhere in scaffold.py; minimal stand-ins (assumptions, not the
# script's real values) are sketched here so the snippet is self-contained.
PROJECT_ROOT = pathlib.Path(__file__).resolve().parent.parent
DATASETS_PATH = PROJECT_ROOT / "datasets"
license_header = ""  # stand-in: the real script writes a license preamble


def create_dataset_yaml(dataset_id: str):
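    """Interactively build dataset.yaml for `dataset_id` under DATASETS_PATH.

    Starts from the sample dataset.yaml, prompts for a user-friendly
    description, and collects any GCP resources (BigQuery dataset, Cloud
    Storage bucket) the pipeline needs.
    """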
    dataset_yaml = {}
    sample_yaml = yaml.load((PROJECT_ROOT / "samples" / "dataset.yaml").read_text())
    sample_yaml["dataset"]["name"] = dataset_id
    sample_yaml["dataset"]["friendly_name"] = dataset_id
    dataset_desc = click.prompt("A user-friendly description of the dataset", type=str)
    sample_yaml["dataset"]["description"] = dataset_desc
    dataset_yaml["dataset"] = sample_yaml["dataset"]
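
    # Prompt repeatedly for GCP resources until the user selects "None".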
    resources = []
    while True:
        resource = click.prompt(
            (
                "\nWhich GCP resource(s) are required for your pipeline?\n"
                "Select a resource to add: BigQuery (BQ), Google Cloud Storage (GCS), or None to finish"
            ),
            type=click.Choice(["BQ", "GCS", "None"], case_sensitive=False),
            default="None",
        )
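        # BigQuery: copy the sample bigquery_dataset resource and fill in
        # the dataset ID and description.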
        if resource == "BQ":
            bq_resource = next(
                res
                for res in sample_yaml["resources"]
                if res["type"] == "bigquery_dataset"
            )
            bq_resource["dataset_id"] = dataset_id
            bq_desc = click.prompt(
                "\nA user-friendly description of the BigQuery dataset", type=str
            )
            bq_resource["description"] = bq_desc
            resources.append(bq_resource)
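        # Cloud Storage: copy the sample storage_bucket resource and prompt
        # for the bucket name and location.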
        elif resource == "GCS":
            gcs_resource = next(
                res
                for res in sample_yaml["resources"]
                if res["type"] == "storage_bucket"
            )
            gcs_bucket_name = click.prompt(
                "\nYour Cloud Storage bucket name\n"
                "Use hyphenated syntax, e.g. `some-prefix-123`, for the name.\n"
                "Note that bucket names must not contain 'google' or close misspellings, such as 'g00gle'.",
                type=str,
            )
            location = click.prompt(
                (
                    "\nThe location of the bucket.\n"
                    "Object data for objects in the bucket resides in physical storage within this region.\n"
                    "Defaults to US."
                ),
                type=click.Choice(["US", "EU", "ASIA"], case_sensitive=False),
                default="US",
            )
            gcs_resource["name"] = gcs_bucket_name
            gcs_resource["location"] = location
            resources.append(gcs_resource)
        elif resource == "None":
            break

    dataset_yaml["resources"] = resources
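
    # Write the license header followed by the assembled dataset.yaml.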
    with open(
        f"{DATASETS_PATH}/{dataset_id}/pipelines/dataset.yaml", "w"
    ) as dataset_out:
        dataset_out.write(license_header)
        yaml.dump(CommentedMap(dataset_yaml), dataset_out)
    click.echo(
        f"\n{DATASETS_PATH}/{dataset_id}/pipelines/dataset.yaml has been created\n"
    )
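

# A minimal sketch of how create_dataset_yaml might be wired up as a CLI
# entry point. The command below (its name, argument wiring, and the
# directory bootstrap) is an assumption for illustration, not the script's
# actual entry point.
@click.command()
@click.argument("dataset_id")
def scaffold_dataset(dataset_id: str):
    # Hypothetical: ensure the target directory exists before
    # create_dataset_yaml opens dataset.yaml for writing.
    pathlib.Path(f"{DATASETS_PATH}/{dataset_id}/pipelines").mkdir(
        parents=True, exist_ok=True
    )
    create_dataset_yaml(dataset_id)


if __name__ == "__main__":
    scaffold_dataset()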