in data/prepare_data.py [0:0]
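# NOTE: module-level imports assumed by this excerpt (the [0:0] slice omits
# them); xpdb and parse_pdb_gz_to_json_record are local helpers in this repo,
# and parse_args is sketched after the function below.
import json
import os
from collections import defaultdict

import pandas as pd
from Bio.PDB import PDBParser
from joblib import Parallel, delayed
from tqdm import tqdm
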
def main():
    """
    Data preparation main script: loads the input CSV, parses the PDB
    structures, converts them to protein records, segregates the records
    by split, and writes each split to disk. Configuration is provided
    via command-line arguments.

    Args:
        None; all configuration comes from command-line arguments.

    Returns:
        None.
    """
    args = parse_args()

    # 1. Load the CSV manifest of sequences and structure paths
    df = pd.read_csv(args.data_file)

    # Lenient PDB parser: QUIET suppresses warnings, PERMISSIVE downgrades
    # minor format errors to warnings, and the local SloppyStructureBuilder
    # relaxes structure-building checks
    sloppyparser = PDBParser(
        QUIET=True,
        PERMISSIVE=True,
        structure_builder=xpdb.SloppyStructureBuilder(),
    )
    # 2. Parse structures in parallel and convert them to protein records
    records = Parallel(n_jobs=-1)(
        delayed(parse_pdb_gz_to_json_record)(
            sloppyparser,
            df.iloc[i]["primary"],
            df.iloc[i]["structure_path"],
            df.iloc[i]["structure_path"].split("/")[-1],
        )
        for i in tqdm(range(df.shape[0]))
    )
    # 3. Segregate records by split, attaching the target value to each
    split_records = defaultdict(list)
    for i, rec in enumerate(records):
        row = df.iloc[i]
        rec["target"] = row[args.target_variable]
        split_records[row["split"]].append(rec)
    # 4. Write each split to disk
    for split, recs in split_records.items():
        print(split, "number of proteins:", len(recs))
        outfile = os.path.join(args.output, f"proteins_{split}.json")
        with open(outfile, "w") as f:
            json.dump(recs, f)
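
parse_args is referenced above but not shown in this excerpt. A minimal sketch, consistent with the attributes main() reads (data_file, target_variable, output), might look like this; the flag names, help strings, and __main__ guard are assumptions rather than the script's confirmed interface:

import argparse

def parse_args():
    # Hypothetical sketch: flag names assumed to mirror the attributes
    # that main() reads; the real script may differ.
    parser = argparse.ArgumentParser(
        description="Prepare protein structure records from a CSV manifest."
    )
    parser.add_argument("--data_file", required=True,
                        help="CSV with 'primary', 'structure_path', and 'split' columns")
    parser.add_argument("--target_variable", required=True,
                        help="CSV column copied onto each record as 'target'")
    parser.add_argument("--output", required=True,
                        help="Directory that receives the proteins_<split>.json files")
    return parser.parse_args()

if __name__ == "__main__":
    main()

Under these assumptions, a run would look like: python data/prepare_data.py --data_file manifest.csv --target_variable stability --output data/processed (the file and column names here are illustrative).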