in internal/enricher/enricher.go [730:831]
// generateExampleValuesWithGemini asks Gemini whether a column is likely to
// contain PII and returns example values that are safe to expose.
//
// Behavior:
//   - If mc.GeminiAPIKey is empty, it returns (nil, nil) without calling the API.
//   - If originalExampleValues is empty, it returns an empty, non-nil slice.
//   - If the response contains <synthetic_examples>...</synthetic_examples>,
//     the comma-separated values between the tags are trimmed and returned.
//   - If the response contains <original_examples>, a copy of
//     originalExampleValues is returned.
//   - Any other response shape, or any client/API failure, yields an error.
//
// mc.Model selects the Gemini model; when empty, "gemini-1.5-pro-002" is used.
func (mc *MetadataCollector) generateExampleValuesWithGemini(ctx context.Context, colInfo database.ColumnInfo, tableName string, originalExampleValues []string) ([]string, error) {
	if mc.GeminiAPIKey == "" {
		return nil, nil
	}
	if len(originalExampleValues) == 0 {
		return []string{}, nil
	}

	dataTypeDescription := colInfo.DataType
	exampleValuesStr := strings.Join(originalExampleValues, ", ")
	prompt := fmt.Sprintf(`
You are an expert in data privacy and database metadata. Your task is to analyze a database column and determine if it likely contains Personally Identifiable Information (PII).
Based on your analysis, you will either return synthetic, representative example values or the original example values.
**Column Information:**
- Column Name: %s
- Table Name: %s
- Data Type: %s
- Original Example Values: [%s]
**Instructions:**
1. **Analyze for PII:** Based on the column name, data type, and example values, determine if this column is likely to contain PII.
Consider common patterns and keywords that indicate personal information (names, emails, phone numbers, addresses, IDs, etc.).
2. **Decision:**
- **If likely PII:** Generate equal number of synthetic example values that are representative of the data in the "%s" column but are completely fabricated and do not resemble real personal data.
The synthetic values should be consistent with the "%s" data type.
- **If NOT likely PII:** Return the original example values provided.
3. **Output Format:**
- If you generated synthetic values, output them as a comma-separated list enclosed in <synthetic_examples>...</synthetic_examples> tags.
- If you are returning the original values, output them as a comma-separated list enclosed in <original_examples>...</original_examples> tags.
Example Output for Synthetic Values:
<synthetic_examples>Fake Name 1, Fake Name 2, Fake Name 3</synthetic_examples>
Example Output for Original Values:
<original_examples>Value 1, Value 2, Value 3</original_examples>
Now, analyze the column and provide the appropriate output.
`, colInfo.Name, tableName, dataTypeDescription, exampleValuesStr, colInfo.Name, dataTypeDescription)

	client, err := genai.NewClient(ctx, option.WithAPIKey(mc.GeminiAPIKey))
	if err != nil {
		return nil, fmt.Errorf("failed to create Gemini client: %w", err)
	}
	defer client.Close()

	modelName := mc.Model
	if modelName == "" {
		modelName = "gemini-1.5-pro-002"
	}
	model := client.GenerativeModel(modelName)
	model.SetTemperature(0.4)
	model.SetMaxOutputTokens(500)
	model.SetTopP(0.8)
	model.SetTopK(40)

	resp, err := model.GenerateContent(ctx, genai.Text(prompt))
	if err != nil {
		return nil, fmt.Errorf("Gemini API call failed for example value generation: %w", err)
	}
	responseString, err := extractTextFromResponseForExampleValues(resp)
	if err != nil {
		return nil, err
	}

	// The synthetic tag is checked first, so a response that carries both
	// tags is treated as a PII verdict.
	switch {
	case strings.Contains(responseString, "<synthetic_examples>"):
		syntheticValues, ok := parseTaggedExampleValues(responseString, "synthetic_examples")
		if !ok {
			return nil, fmt.Errorf("invalid response format for synthetic examples from Gemini: tags not found")
		}
		if len(syntheticValues) == 0 {
			return nil, fmt.Errorf("invalid response format for synthetic examples from Gemini: no values between tags")
		}
		log.Printf("INFO: Gemini determined column '%s' table '%s' is PII and generated synthetic examples.", colInfo.Name, tableName)
		return syntheticValues, nil

	case strings.Contains(responseString, "<original_examples>"):
		// The model only signals "not PII" here. Return the caller's own
		// values rather than the model's echo: the echo is comma-split, so it
		// would break values that contain commas, and it may be truncated by
		// the output-token limit.
		return append([]string(nil), originalExampleValues...), nil

	default:
		return nil, fmt.Errorf("unexpected response format from Gemini for example values: %s", responseString)
	}
}

// parseTaggedExampleValues extracts the comma-separated list enclosed in
// <tag>...</tag> from response. Each item is whitespace-trimmed and empty
// items are dropped. The closing tag is searched for only after the opening
// tag. ok is false when either tag is missing.
func parseTaggedExampleValues(response, tag string) (values []string, ok bool) {
	openTag := "<" + tag + ">"
	closeTag := "</" + tag + ">"

	start := strings.Index(response, openTag)
	if start == -1 {
		return nil, false
	}
	rest := response[start+len(openTag):]
	end := strings.Index(rest, closeTag)
	if end == -1 {
		return nil, false
	}

	parts := strings.Split(rest[:end], ",")
	values = make([]string, 0, len(parts))
	for _, part := range parts {
		if trimmed := strings.TrimSpace(part); trimmed != "" {
			values = append(values, trimmed)
		}
	}
	return values, true
}