def create_document_manifest_from_path()

in src/loading_manifest/osdu_document_manifest.py [0:0]


def create_document_manifest_from_path(
        input_path, output_path,
        preload_file_path, file_source,
        include_files, exclude_files,
        schema_ns_name, schema_ns_value,
        acl, legal, schema_version, dict_schemas):
    """Write one document load manifest (JSON) per document file in a folder.

    The entries of ``input_path`` are listed, optionally narrowed by the
    include/exclude patterns, and every remaining entry that has a non-empty
    base name and extension and passes ``cm.is_nonzero_file`` gets a manifest
    built by ``create_document_manifest`` and dumped into ``output_path``.
    A failure on one document is logged and does not stop the remaining ones.

    Args:
        input_path: Directory whose entries are candidate document files.
        output_path: Directory the ``load_document_*.json`` files are
            written to.
        preload_file_path: Passed through to ``create_document_manifest``.
        file_source: Passed through to ``create_document_manifest``.
        include_files: ``;``-separated ``fnmatch`` patterns; when non-empty,
            only names matching at least one pattern are kept. ``None`` or
            ``""`` disables the filter.
        exclude_files: ``;``-separated ``fnmatch`` patterns; when non-empty,
            names matching at least one pattern are dropped. ``None`` or
            ``""`` disables the filter.
        schema_ns_name: Passed through to ``create_document_manifest``.
        schema_ns_value: Passed through to ``create_document_manifest``.
        acl: Passed through to ``create_document_manifest``.
        legal: Passed through to ``create_document_manifest``.
        schema_version: Must be one of ``SUPPORTED_SCHEMA_VERSIONS``;
            otherwise an error is logged and nothing is generated.
        dict_schemas: Passed through to ``create_document_manifest``.

    Returns:
        None. The number of generated manifests is reported via logging.
    """

    # check supported schema version
    if schema_version not in SUPPORTED_SCHEMA_VERSIONS:
        logging.error("Schema version %s is not in the supported list: %s", schema_version,
                      SUPPORTED_SCHEMA_VERSIONS)
        logging.info("Generated 0 document load manifests.")
        return

    def matches_any(name, patterns):
        # True when the name matches at least one of the fnmatch patterns.
        return any(fnmatch.fnmatch(name, pattern) for pattern in patterns)

    # list doc filenames, then apply the optional include / exclude filters
    files = os.listdir(input_path)
    if include_files:
        include_patterns = include_files.split(";")
        files = [f for f in files if matches_any(f, include_patterns)]
    if exclude_files:
        exclude_patterns = exclude_files.split(";")
        files = [f for f in files if not matches_any(f, exclude_patterns)]

    valid_files = []
    logging.info("Checking %s files", len(files))
    for name in sorted(files):
        file_doc = name.strip()
        index = file_doc.rfind(".")
        # Require a non-empty base name and a non-empty extension, then do
        # the minimum check: file and size.
        if 0 < index < len(file_doc) - 1 and cm.is_nonzero_file(input_path, file_doc):
            valid_files.append(file_doc)

    # NOTE(review): 259 presumably reflects the Windows MAX_PATH limit -- confirm.
    max_output_path_length = 259

    generated_count = 0
    logging.info("Processing %s files", len(valid_files))
    file_seq_for_too_long = 1
    for valid_file in valid_files:
        # The validity check above guarantees a "." with text on both sides.
        document_file, _, file_type = valid_file.rpartition(".")
        output_file = os.path.join(output_path,
                                   "load_document_" + schema_version + "_"
                                   + document_file + "_" + file_type + ".json")
        if len(output_file) > max_output_path_length:
            # Fall back to a short, sequence-numbered name for over-long paths.
            output_file = os.path.join(output_path,
                                       "load_document_long_document_name_" + schema_version + "_"
                                       + str(file_seq_for_too_long)
                                       + "_" + file_type + ".json")
            file_seq_for_too_long += 1
        try:
            # Build the manifest before touching the file system so a failure
            # here does not create a partial output file.
            manifest = create_document_manifest(
                document_name=document_file,
                file_type=file_type,
                preload_file_path=preload_file_path,
                file_source=file_source,
                schema_ns_name=schema_ns_name,
                schema_ns_value=schema_ns_value,
                acl=acl,
                legal=legal,
                schema_version=schema_version,
                dict_schemas=dict_schemas
            )
            with open(output_file, "w") as f:
                json.dump(obj=manifest, fp=f, indent=4)
        except Exception:
            logging.exception("Unable to process document file: %s", document_file)
            # Best-effort cleanup of a partial or stale manifest. The file may
            # not exist at all (e.g. open() failed), and that must not abort
            # the remaining documents.
            try:
                os.remove(output_file)
            except FileNotFoundError:
                pass
            except OSError:
                logging.warning("Unable to remove incomplete manifest file: %s", output_file)
        else:
            generated_count += 1

    logging.info("Generated %s document load manifests.", generated_count)