# src/package/dataplexutils/metadata/constants.toml
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# String names for three clients: BigQuery, Dataplex data scan and
# Data Catalog lineage.
# NOTE(review): presumably used as lookup keys for the client objects the
# package creates; confirm against the consuming code before renaming a value.
[CLIENTS]
BIGQUERY = "bigquery"
DATAPLEX_DATA_SCAN = "dataplex_data_scan"
DATA_CATALOG_LINEAGE = "data_catalog_lineage"
# Logger name constant.
# NOTE(review): presumably the name passed to the logging framework to obtain
# the package logger; confirm against the consuming code.
[LOGGING]
WIZARD_LOGGER = "wizard_logger"
# Model selection and generation settings.
# NOTE(review): the key names mirror common LLM sampling parameters; confirm
# how the consumer maps them onto the model API.
[LLM]
# The text and the vision setting currently name the same pinned model version.
# NOTE(review): pinned model versions are retired over time; confirm this one
# is still served before relying on it.
LLM_TYPE = "gemini-1.5-pro-001"
LLM_VISION_TYPE = "gemini-1.5-pro-001"
# TEMPERATURE and TOP_P are TOML floats (keep the decimal point); the
# remaining settings are integers.
TEMPERATURE = 0.0
TOP_P = 1.0
TOP_K = 32
CANDIDATE_COUNT = 1
MAX_OUTPUT_TOKENS = 2048
# Prompt fragments used to ask the model for table and column metadata.
#
# Every value is a multi-line basic string: TOML trims the newline directly
# after the opening quotes and keeps the one before the closing quotes, so each
# fragment ends with a newline.
#
# Tokens in curly braces such as {table_fqn} are placeholders.
# NOTE(review): presumably filled in with str.format by the consumer, in which
# case literal curly braces must not be added to these strings; confirm.
# NOTE(review): presumably the fragments are concatenated by the consumer
# depending on which inputs are available; confirm the assembly order there.
[PROMPTS]
SYSTEM_PROMPT = """
You are a data steward. Your task is to produce human meaningful metadata.
"""

# Table description: input fragments.
# Placeholders: {table_fqn}, {table_schema_str}, {table_sample}.
TABLE_DESCRIPTION_PROMPT_BASE = """
You need to produce metadata for a table.
The table fully qualified name is {table_fqn}
The table schema is {table_schema_str}
A sample of the rows is {table_sample}
"""
# Placeholder: {table_profile}.
TABLE_DESCRIPTION_PROMPT_PROFILE = """
The table profile information is {table_profile}
"""
# Placeholder: {table_quality}.
TABLE_DESCRIPTION_PROMPT_QUALITY = """
The table quality information is {table_quality}
"""
# Placeholder: {table_sources_info}.
# NOTE(review): this fragment and TABLE_DESCRIPTION_PROMPT_LINEAGE_PROCESSES
# both start with "Lineage information:" and end with "Produce two paragraphs:".
# If the consumer appends both, those lines appear twice; verify.
TABLE_DESCRIPTION_PROMPT_LINEAGE_TABLES = """
Lineage information:
The table is generated using the following tables {table_sources_info}
Produce two paragraphs:
"""
# Placeholder: {job_sources_info}.
TABLE_DESCRIPTION_PROMPT_LINEAGE_PROCESSES = """
Lineage information:
The table is generated using the following queries {job_sources_info}
Produce two paragraphs:
"""
TABLE_DESCRIPTION_PROMPT_DOCUMENT = """
The attached documents provide additional information about this table and the source tables.
"""

# Table description: output instructions.
# The first paragraph is introduced by the literal marker **Description**.
TABLE_DESCRIPTION_GENERATION_BASE = """
The first paragraph starts with **Description** and then describes the contents and purpose of the table.
Describe table contents, business area and how the table can be used.
Do not include the description of the columns or data elements.
Do not base the description just on the provided table sample.
"""
# The second paragraph is introduced by the literal marker **Source**.
TABLE_DESCRIPTION_GENERATION_LINEAGE = """
The second paragraph starts with **Source** and provides information on how the table is calculated from the originating tables, mentioning the level of aggregation,
which tables are joined, what data is filtered, and what the most significant data transformations are.
If there's no lineage information then the second paragraph is omitted.
"""
# NOTE(review): the **Description** / **Source** markers requested above are
# markdown bold syntax, while this fragment forbids markdown; confirm which
# instruction is meant to win.
OUTPUT_FORMAT_PROMPT = """
Do not use markdown, do not put keywords like SQL at the beginning of the answer, do not use quotes to enclose the response.
"""

# Column description.
# Placeholders: {column_name}, {table_fqn}, {table_schema_str}, {table_sample}.
COLUMN_DESCRIPTION_PROMPT_BASE = """
You need to produce short metadata in a form of description for a column of a table.
Input data:
The column that is described is {column_name}
The table fully qualified name is {table_fqn}
The table full schema is {table_schema_str}
A sample of the rows is {table_sample}
Describe only this column content. Please use approx 50 words only
"""
# Data-handling limits and a MIME type constant.
[DATA]
# NOTE(review): set to 0 while the prompts in [PROMPTS] still interpolate
# {table_sample}; presumably 0 disables row sampling. Confirm how the consumer
# treats 0.
NUM_ROWS_TO_SAMPLE = 0
# NOTE(review): presumably the maximum length of a generated column
# description; confirm the unit (characters vs. bytes) against the consumer.
MAX_COLUMN_DESC_LENGTH = 1024
PDF_MIME_TYPE = "application/pdf"
# Integer codes for the named generation strategies.
# The codes are not contiguous: 1-4, then 8 and 9.
# NOTE(review): the gap may separate two kinds of strategy (what to generate
# vs. processing order); confirm against the consumer before renumbering or
# adding codes, since callers presumably compare against these integers.
[GENERATION_STRATEGY]
NAIVE = 1
DOCUMENTED = 2
DOCUMENTED_THEN_REST = 3
NAIVE_DOCUMENTED_AND_MATCHING_CONFIGURATION = 4
RANDOM = 8
ALPHABETICAL = 9