in components/dpu-workflow/src/utils/gcs_utils.py [0:0]
def extract_classifier_result(self, blob):
    """Extract classifier entities from a classifier-output JSON blob.

    Fast path: download only the leading part of the blob (byte range
    ``start=0, end=self.partial_read_length``), since the classifier
    results are expected near the beginning of the file. The prefix is
    accepted only if it contains
    ``FormClassifierResult.OBJ_ARRAY_END_IDENTIFIER`` and, compared
    case-insensitively, every entry of ``self.content_keywords``. It is
    then cut right after the first end identifier, closed with
    ``FormClassifierResult.OBJ_TERMINATION_CHAR`` and parsed with
    ``json.loads``.

    Slow path: if the prefix does not qualify, or anything in the fast
    path raises, the whole blob is downloaded and deserialized with
    ``documentai.Document.from_json``.

    Args:
        blob: The blob object containing the classifier result JSON file.
            It must provide ``download_as_bytes`` and ``name``.

    Returns:
        A list of ``ClassifierResultEntity`` objects built from the
        ``"entities"`` array (fast path), or the objects produced by
        ``FormClassifierResult.transform_docai_entity_to_obj`` for each
        entity of the parsed document (slow path).
    """
    try:
        end_marker = FormClassifierResult.OBJ_ARRAY_END_IDENTIFIER
        # NOTE(review): the download is bytes, so the marker, the keywords
        # and OBJ_TERMINATION_CHAR are presumably bytes as well. If they
        # are str, the checks below raise TypeError and every call takes
        # the slow path -- confirm against their definitions.
        head = blob.download_as_bytes(start=0, end=self.partial_read_length)

        # Single scan: locate the first end-of-array marker (or -1).
        marker_pos = head.find(end_marker)
        if marker_pos != -1:
            # Lower-case the prefix once instead of once per keyword.
            head_lower = head.lower()
            if all(
                keyword.lower() in head_lower
                for keyword in self.content_keywords
            ):
                # Keep everything up to and including the marker, then
                # close the enclosing JSON object so the prefix parses.
                result_obj_str = (
                    head[: marker_pos + len(end_marker)]
                    + FormClassifierResult.OBJ_TERMINATION_CHAR
                )
                result_obj = json.loads(result_obj_str)
                return [
                    ClassifierResultEntity(ent)
                    for ent in result_obj["entities"]
                ]
    except Exception as e:
        # Deliberately broad: the partial read is a best-effort
        # optimization, so any failure just routes to the full download.
        logging.info(
            f"Fail to extract classifier result using partial download for file: {blob.name}, with error: {e},"
            f" fall back to download the complete file and use DocAI library to deserialize the result"
        )

    # Slow path: the prefix was incomplete or unparsable.
    document = documentai.Document.from_json(
        blob.download_as_bytes(), ignore_unknown_fields=True
    )
    return [
        FormClassifierResult.transform_docai_entity_to_obj(e)
        for e in document.entities
    ]