def dataset_inputs()

in notebooks/src/code/data/base.py [0:0]
32 lines of code
9 McCabe index (conditional complexity)

    def dataset_inputs(self) -> Generator[dict, None, None]:
        """Generate the sequence of manifest items with textract-ref URIs resolved locally

        Whether this dataset was instantiated with a manifest file (for annotations) or just as a
        folder of Amazon Textract JSON files, this method will yield a sequence of dicts containing
        {'textract-ref': str} resolved to the *local* path of the file, plus whatever other fields
        were present unchanged (in a manifest).

        In manifest mode, each non-blank line is parsed as one JSON object. A 'textract-ref'
        starting with "s3://" (case-insensitive) is mapped via `self.textract_s3uri_to_file_path()`;
        any other value is treated as a path relative to `self.textract_path` (leading slashes are
        ignored). Blank lines are skipped, but still count towards reported line numbers.

        In folder mode (no manifest), every file found by walking `self.textract_path` is yielded.

        Yields
        ------
        dict
            The manifest record with 'textract-ref' replaced by the local file path, or (in folder
            mode) a dict with only the 'textract-ref' key.

        Raises
        ------
        ValueError
            If a manifest record has no 'textract-ref' field, or if the resolved local path is not
            an existing file.
        """
        if not self.manifest_file_path:
            # Folder mode: every file under the Textract channel is an input.
            for currpath, _, files in os.walk(self.textract_path):
                for file in files:
                    yield {"textract-ref": os.path.join(currpath, file)}
            return

        # JSON Lines manifests are UTF-8: don't depend on the platform's default encoding.
        with open(self.manifest_file_path, "r", encoding="utf-8") as f:
            for linenum, line in enumerate(f, start=1):
                logger.debug("Reading manifest line %s", linenum)
                if not line.strip():
                    # Tolerate blank/trailing lines rather than failing in json.loads()
                    continue
                record = json.loads(line)
                if "textract-ref" not in record:
                    raise ValueError(
                        f"Manifest line {linenum} missing required field 'textract-ref'"
                    )

                textract_ref = record["textract-ref"]
                if textract_ref.lower().startswith("s3://"):
                    # Map S3 URI to local path:
                    textract_ref = self.textract_s3uri_to_file_path(textract_ref)
                else:
                    # textract-ref in manifest isn't an S3 URI - assume relative to the channel.
                    # os.path.join() works whether or not textract_path ends with a separator;
                    # leading slashes are stripped so join() can't discard the channel folder.
                    textract_ref = os.path.join(self.textract_path, textract_ref.lstrip("/"))

                # Check the resolved file path exists:
                if not os.path.isfile(textract_ref):
                    raise ValueError(
                        "(Manifest line {}) could not find textract file {}".format(
                            linenum,
                            textract_ref,
                        )
                    )
                record["textract-ref"] = textract_ref
                yield record