in notebooks/src/code/data/base.py [0:0]
def dataset_inputs(self) -> Generator[dict, None, None]:
    """Generate the sequence of manifest items with textract-ref URIs resolved locally

    Whether this dataset was instantiated with a manifest file (for annotations) or just as a
    folder of Amazon Textract JSON files, this method will yield a sequence of dicts containing
    {'textract-ref': str} resolved to the *local* path of the file, plus whatever other fields
    were present unchanged (in a manifest).

    In manifest mode each line of the file is parsed as one JSON object. A 'textract-ref' that
    starts with "s3://" (case-insensitive) is mapped through `self.textract_s3uri_to_file_path`;
    any other value is treated as a path relative to `self.textract_path`, whether or not it
    has a leading slash and whether or not `self.textract_path` ends with a path separator.

    In folder mode (no manifest) every file found by walking `self.textract_path` is yielded
    as {'textract-ref': <path>}, in the order `os.walk` reports them.

    Yields
    ------
    dict
        The manifest record (or a fresh single-key dict in folder mode) with 'textract-ref' set
        to a local file path.

    Raises
    ------
    ValueError
        If a manifest line has no 'textract-ref' field, or if the resolved local path is not an
        existing file. (Malformed JSON also surfaces as a ValueError subclass raised by
        `json.loads`.)
    """
    if not self.manifest_file_path:
        # No manifest: every file under the Textract folder is an input.
        for currpath, _, files in os.walk(self.textract_path):
            for file in files:
                yield {"textract-ref": os.path.join(currpath, file)}
        return

    # JSON text is UTF-8 by specification, so don't depend on the locale's default encoding.
    with open(self.manifest_file_path, "r", encoding="utf-8") as f:
        for linenum, line in enumerate(f, start=1):
            logger.debug("Reading manifest line %s", linenum)
            record = json.loads(line)
            if "textract-ref" not in record:
                raise ValueError(
                    f"Manifest line {linenum} missing required field 'textract-ref'"
                )

            textract_ref = record["textract-ref"]
            if textract_ref.lower().startswith("s3://"):
                # Map S3 URI to local path:
                textract_ref = self.textract_s3uri_to_file_path(textract_ref)
            else:
                # Not an S3 URI - assume it is relative to the Textract channel folder.
                # Leading slashes are stripped because os.path.join would otherwise discard
                # the base path; join (rather than string '+') supplies the separator when
                # textract_path does not already end with one.
                textract_ref = os.path.join(self.textract_path, textract_ref.lstrip("/"))

            # Check the resolved file path exists:
            if not os.path.isfile(textract_ref):
                raise ValueError(
                    "(Manifest line {}) could not find textract file {}".format(
                        linenum,
                        textract_ref,
                    )
                )
            record["textract-ref"] = textract_ref
            yield record