in components/webscraper/main.go [339:414]
// saveResults writes the scraped documents to the job document under
// "result_data" and sets "status" to "succeeded" in the same update.
//
// The documents are first JSON-encoded only to estimate their size. When that
// estimate is above maxResultBytes, a leading prefix of the documents that
// fits within the budget is stored instead and the remainder is dropped.
//
// Any marshaling or Firestore failure is reported through updateJobError and
// logged; the function returns nothing to the caller. firestoreClient is not
// used by this function.
func saveResults(ctx context.Context, firestoreClient *firestore.Client, docRef *firestore.DocumentRef, scrapedDocs *[]ScrapedDocument) {
	// Kept at 1000000 rather than Firestore's 1048576 to leave headroom for
	// Firestore padding.
	const maxResultBytes = 1000000
	// JSON wrapper around the document list; its bytes count toward the
	// estimate exactly as they do in the full-payload check below.
	const emptyEnvelope = `{"scraped_documents":[]}`

	// writeResult stores docs as the job result and marks the job succeeded
	// in a single update, so both outcomes share one definition of the write.
	writeResult := func(docs interface{}) error {
		_, err := docRef.Update(ctx, []firestore.Update{
			{Path: "result_data", Value: map[string]interface{}{"scraped_documents": docs}},
			{Path: "status", Value: "succeeded"},
		})
		return err
	}

	// Marshal the entire result to JSON to measure its total size.
	fullResultDataJSON, err := json.Marshal(map[string]interface{}{
		"scraped_documents": scrapedDocs,
	})
	if err != nil {
		updateJobError(ctx, docRef, fmt.Errorf("error marshaling resultData: %v", err))
		log.Print(err)
		return
	}

	// Within the limit: store the documents as they are.
	if len(fullResultDataJSON) <= maxResultBytes {
		if err := writeResult(scrapedDocs); err != nil {
			updateJobError(ctx, docRef, fmt.Errorf("failed to update job document: %v", err))
			log.Print(err)
			return
		}
		// Guard the dereference: a nil pointer marshals to null and reaches
		// this point, where an unguarded len(*scrapedDocs) would panic.
		docCount := 0
		if scrapedDocs != nil {
			docCount = len(*scrapedDocs)
		}
		log.Printf("Successfully updated job with %d scraped documents", docCount)
		return
	}

	log.Printf("resultData size exceeds Firestore limit. Truncating data.")

	// scrapedDocs cannot be nil here: a nil pointer encodes to a payload of a
	// few bytes, which never exceeds maxResultBytes.
	var truncatedDocs []ScrapedDocument
	truncatedSize := len(emptyEnvelope)
	for _, doc := range *scrapedDocs {
		docJSON, err := json.Marshal(doc)
		if err != nil {
			updateJobError(ctx, docRef, fmt.Errorf("error marshaling individual document: %v", err))
			log.Print(err)
			return
		}
		needed := len(docJSON)
		if len(truncatedDocs) > 0 {
			needed++ // comma separating this element from the previous one
		}
		if truncatedSize+needed > maxResultBytes {
			// Stop at the first document that would push the payload over
			// the limit; later documents are not considered.
			break
		}
		truncatedDocs = append(truncatedDocs, doc)
		truncatedSize += needed
	}

	if err := writeResult(truncatedDocs); err != nil {
		updateJobError(ctx, docRef, fmt.Errorf("failed to update job document with truncated data: %v", err))
		log.Print(err)
		return
	}
	log.Printf("Successfully updated job with truncated scraped documents")
}