tools/datagen-bq-to-bq/file_processing_utils.py [12:64]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def safe_strtobool(val):
    """Convert a textual truth value to a ``bool``.

    Matching is case-insensitive and ignores surrounding whitespace, so
    values such as ``" True\n"`` are accepted without the caller having to
    strip them first.

    Args:
        val: The value to interpret. Normally a string; a ``bool`` is
            returned unchanged and any other object is interpreted through
            its ``str()`` form.

    Returns:
        ``True`` for "yes", "true", "t", "y" or "1"; ``False`` for "no",
        "false", "f", "n" or "0".

    Raises:
        ValueError: If the value is not one of the recognised spellings.
    """
    # An already-boolean value needs no parsing.
    if isinstance(val, bool):
        return val

    # Going through str() keeps non-string input from failing with an
    # AttributeError on .lower(); it is reported as ValueError instead.
    normalized = str(val).strip().lower()
    if normalized in ("yes", "true", "t", "y", "1"):
        return True
    if normalized in ("no", "false", "f", "n", "0"):
        return False
    # Report the value exactly as it was received to ease debugging.
    raise ValueError(f"Invalid boolean string: '{val}'")


def file_pre_processing(
    gemini_model,
    input_gcs_path,  # Pass the map generated above
    staging_gcs_path,  # Pass the general staging path for intermediate files
    header_gcs_path,  # Pass the (potentially empty) header_gcs_path map
    table_attributes,
):
    for table_name, gcs_path in input_gcs_path.items():
        # Create a nested dictionary for 'table_name' before assigning values
        table_attributes[table_name] = {}

        # Extract the file name and sample rows from the GCS file path
        file_name, sample_rows = gcs_ops.get_filename_sample_rows(gcs_path)
        sample_rows_str = "\n".join(map(str, sample_rows))

        # Prediction of Header Flag
        header_prediction_prompt = prompts_collection.header_prediction(sample_rows_str)
        header_flag = gemini_model.generate_content([header_prediction_prompt]).text
        table_attributes[table_name]["column_header_flag"] = safe_strtobool(
            header_flag.strip()
        )

        # Extraction of Custom Header
        content = Content(
            parts=[
                Part.from_text(prompts_collection.Custom_Header_Extract_Prompt),
                Part.from_text(f"File Data: {sample_rows_str}"),
            ]
        )
        content.role = "user"  # Set the role to "user"
        custom_header = gemini_model.generate_content([content]).text
        table_attributes[table_name]["custom_header"] = custom_header

        # Predicting the Schema of the File
        content = Content(
            parts=[
                Part.from_text(prompts_collection.Schema_Prediction_Prompt),
                Part.from_text(f"file_name: {file_name}"),
                Part.from_text(f"sample_rows: {sample_rows_str}"),
                Part.from_text(f"header_flag: {header_flag}"),
            ]
        )
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



tools/datagen-gcs-to-gcs/file_processing_utils.py [13:65]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def safe_strtobool(val):
    """Convert a textual truth value to a ``bool``.

    Matching is case-insensitive and ignores surrounding whitespace, so
    values such as ``" True\n"`` are accepted without the caller having to
    strip them first.

    Args:
        val: The value to interpret. Normally a string; a ``bool`` is
            returned unchanged and any other object is interpreted through
            its ``str()`` form.

    Returns:
        ``True`` for "yes", "true", "t", "y" or "1"; ``False`` for "no",
        "false", "f", "n" or "0".

    Raises:
        ValueError: If the value is not one of the recognised spellings.
    """
    # An already-boolean value needs no parsing.
    if isinstance(val, bool):
        return val

    # Going through str() keeps non-string input from failing with an
    # AttributeError on .lower(); it is reported as ValueError instead.
    normalized = str(val).strip().lower()
    if normalized in ("yes", "true", "t", "y", "1"):
        return True
    if normalized in ("no", "false", "f", "n", "0"):
        return False
    # Report the value exactly as it was received to ease debugging.
    raise ValueError(f"Invalid boolean string: '{val}'")


def file_pre_processing(
    gemini_model,
    input_gcs_path,  # Pass the map generated above
    staging_gcs_path,  # Pass the general staging path for intermediate files
    header_gcs_path,  # Pass the (potentially empty) header_gcs_path map
    table_attributes,
):
    for table_name, gcs_path in input_gcs_path.items():
        # Create a nested dictionary for 'table_name' before assigning values
        table_attributes[table_name] = {}

        # Extract the file name and sample rows from the GCS file path
        file_name, sample_rows = gcs_ops.get_filename_sample_rows(gcs_path)
        sample_rows_str = "\n".join(map(str, sample_rows))

        # Prediction of Header Flag
        header_prediction_prompt = prompts_collection.header_prediction(sample_rows_str)
        header_flag = gemini_model.generate_content([header_prediction_prompt]).text
        table_attributes[table_name]["column_header_flag"] = safe_strtobool(
            header_flag.strip()
        )

        # Extraction of Custom Header
        content = Content(
            parts=[
                Part.from_text(prompts_collection.Custom_Header_Extract_Prompt),
                Part.from_text(f"File Data: {sample_rows_str}"),
            ]
        )
        content.role = "user"  # Set the role to "user"
        custom_header = gemini_model.generate_content([content]).text
        table_attributes[table_name]["custom_header"] = custom_header

        # Predicting the Schema of the File
        content = Content(
            parts=[
                Part.from_text(prompts_collection.Schema_Prediction_Prompt),
                Part.from_text(f"file_name: {file_name}"),
                Part.from_text(f"sample_rows: {sample_rows_str}"),
                Part.from_text(f"header_flag: {header_flag}"),
            ]
        )
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



