in src/loading_manifest/osdu_document_manifest.py [0:0]
def create_document_manifest_from_path(
        input_path, output_path,
        preload_file_path, file_source,
        include_files, exclude_files,
        schema_ns_name, schema_ns_value,
        acl, legal, schema_version, dict_schemas):
    """Write one document load manifest (JSON) per eligible file in a folder.

    The file names directly under ``input_path`` are listed, optionally
    filtered by the include/exclude patterns, and checked with
    ``cm.is_nonzero_file``. For each remaining file a manifest built by
    ``create_document_manifest`` is dumped to
    ``load_document_<schema_version>_<name>_<extension>.json`` in
    ``output_path``. A failure on one file is logged and does not stop the
    processing of the remaining files.

    Args:
        input_path: Directory whose entries are scanned for documents.
        output_path: Directory the manifest files are written to.
        preload_file_path: Passed through to ``create_document_manifest``.
        file_source: Passed through to ``create_document_manifest``.
        include_files: ``;``-separated ``fnmatch`` patterns; when non-empty,
            only names matching at least one pattern are kept.
        exclude_files: ``;``-separated ``fnmatch`` patterns; when non-empty,
            names matching at least one pattern are dropped.
        schema_ns_name: Passed through to ``create_document_manifest``.
        schema_ns_value: Passed through to ``create_document_manifest``.
        acl: Passed through to ``create_document_manifest``.
        legal: Passed through to ``create_document_manifest``.
        schema_version: Must be one of ``SUPPORTED_SCHEMA_VERSIONS``;
            otherwise an error is logged and nothing is generated.
        dict_schemas: Passed through to ``create_document_manifest``.

    Returns:
        None. The number of generated manifests is reported via logging.
    """
    # check supported schema version
    if schema_version not in SUPPORTED_SCHEMA_VERSIONS:
        logging.error("Schema version %s is not in the supported list: %s",
                      schema_version, SUPPORTED_SCHEMA_VERSIONS)
        logging.info("Generated 0 document load manifests.")
        return

    # list doc filenames
    files = os.listdir(input_path)
    if include_files:
        include_patterns = include_files.split(";")
        files = [name for name in files
                 if any(fnmatch.fnmatch(name, pattern)
                        for pattern in include_patterns)]
    if exclude_files:
        exclude_patterns = exclude_files.split(";")
        files = [name for name in files
                 if not any(fnmatch.fnmatch(name, pattern)
                            for pattern in exclude_patterns)]

    valid_files = []
    logging.info("Checking %s files", len(files))
    for file in sorted(files):
        file_doc = file.strip()
        index = file_doc.rfind(".")
        # The last dot must be neither the first nor the last character, so
        # both the document name and the extension are non-empty.
        if 0 < index < len(file_doc) - 1:
            # minimum check: file and size
            # NOTE(review): presumably verifies a regular, non-empty file;
            # confirm against cm.is_nonzero_file.
            if cm.is_nonzero_file(input_path, file_doc):
                valid_files.append(file_doc)

    # NOTE(review): presumably the Windows MAX_PATH limit (260 including the
    # terminating NUL) -- confirm before changing.
    max_output_path_length = 259

    processed_files = []
    logging.info("Processing %s files", len(valid_files))
    file_seq_for_too_long = 1
    for valid_file in valid_files:
        # A dot is guaranteed by the validity check above.
        document_file, _, file_type = valid_file.rpartition(".")
        output_file = os.path.join(
            output_path,
            "load_document_" + schema_version + "_"
            + document_file + "_" + file_type + ".json")
        if len(output_file) > max_output_path_length:
            # Fall back to a short, sequence-numbered name when the full
            # path would be too long.
            output_file = os.path.join(
                output_path,
                "load_document_long_document_name_" + schema_version + "_"
                + str(file_seq_for_too_long)
                + "_" + file_type + ".json")
            file_seq_for_too_long += 1
        try:
            with open(output_file, "w", encoding="utf-8") as f:
                json.dump(
                    obj=create_document_manifest(
                        document_name=document_file,
                        file_type=file_type,
                        preload_file_path=preload_file_path,
                        file_source=file_source,
                        schema_ns_name=schema_ns_name,
                        schema_ns_value=schema_ns_value,
                        acl=acl,
                        legal=legal,
                        schema_version=schema_version,
                        dict_schemas=dict_schemas
                    ),
                    fp=f,
                    indent=4
                )
            processed_files.append(document_file)
        except Exception:
            logging.exception("Unable to process document file: %s", document_file)
            # Best-effort removal of a partially written manifest. When
            # open() itself failed there is no file, and an error raised
            # here must not abort the remaining documents.
            try:
                os.remove(output_file)
            except FileNotFoundError:
                pass
            except OSError:
                logging.warning("Unable to remove incomplete manifest file: %s",
                                output_file)

    logging.info("Generated %s document load manifests.", len(processed_files))