in env_setup.py [0:0]
def create_vector_store():
    """Provision the storage that backs the Open Data QnA vector store and logs.

    Steps performed, in order:

    1. Print progress messages, including the configured ``VECTOR_STORE``.
    2. If ``VECTOR_STORE`` is ``"cloudsql-pgvector"``, call
       ``setup_postgresql`` with a fixed instance name, database name, user
       and password, using ``DATASET_REGION`` as the instance region.
    3. If ``LOGGING`` is truthy or ``VECTOR_STORE`` is ``"bigquery-vector"``,
       make sure the BigQuery dataset
       ``{PROJECT_ID}.{BQ_OPENDATAQNA_DATASET_NAME}`` exists, creating it in
       ``DATASET_REGION`` when the lookup reports it as missing.

    Configuration:
        The function reads these module-level names and does not load them
        itself: ``VECTOR_STORE``, ``DATASET_REGION``, ``LOGGING``,
        ``PROJECT_ID`` and ``BQ_OPENDATAQNA_DATASET_NAME``.
        NOTE(review): presumably they are populated from "config.ini"
        elsewhere in the module -- confirm.

    Returns:
        None

    Raises:
        Nothing is raised explicitly. Only ``NotFound`` from the dataset
        lookup is handled; any other exception coming out of
        ``setup_postgresql`` or the BigQuery client propagates to the caller.
    """
    print("Initializing environment setup.")
    print("Loading configurations from config.ini file.")
    print("Vector Store source set to: ", VECTOR_STORE)

    # Set up a PostgreSQL instance when pgvector is the chosen vector store.
    if VECTOR_STORE == 'cloudsql-pgvector':
        print("Generating pg dataset for vector store.")
        # Parameters for the PostgreSQL instance.
        pg_region = DATASET_REGION
        pg_instance = "pg15-opendataqna"
        pg_database = "opendataqna-db"
        pg_user = "pguser"
        # NOTE(review): hard-coded credential kept as-is to preserve behaviour;
        # it should come from configuration or a secret store instead.
        pg_password = "pg123"
        setup_postgresql(pg_instance, pg_region, pg_database, pg_user, pg_password)

    # A BigQuery dataset is needed for the logs table and/or the vector store.
    if LOGGING or VECTOR_STORE == 'bigquery-vector':
        if LOGGING:
            print("Logging is enabled")
        if VECTOR_STORE == 'bigquery-vector':
            print("Vector store set to 'bigquery-vector'")

        print(f"Generating Big Query dataset {BQ_OPENDATAQNA_DATASET_NAME}")
        client = bigquery.Client(project=PROJECT_ID)
        dataset_ref = f"{PROJECT_ID}.{BQ_OPENDATAQNA_DATASET_NAME}"

        # Create the dataset only if the lookup says it does not exist yet.
        try:
            client.get_dataset(dataset_ref)
            print("Destination Dataset exists")
        except google.api_core.exceptions.NotFound:
            print("Cannot find the dataset hence creating.......")
            dataset = bigquery.Dataset(dataset_ref)
            dataset.location = DATASET_REGION
            client.create_dataset(dataset)
            print(f"{dataset_ref} is created")