in python/dgllife/data/pdbbind.py
def _read_data_files(self, pdb_version, subset, load_binding_pocket,
                     remove_coreset_from_refinedset, local_path):
    """Download and extract the PDBBind data files for the given version.

    ``pdb_version`` is 'v2015' or 'v2007', ``subset`` is 'core' or 'refined',
    ``load_binding_pocket`` selects binding pocket files over full proteins,
    ``remove_coreset_from_refinedset`` drops core-set complexes from the
    refined set, and ``local_path``, when provided, points to an
    already-extracted local copy of the dataset (skipping the download).
    """
    root_dir_path = get_download_dir()
    if local_path:
        if local_path[-1] != '/':
            local_path += '/'
        index_label_file = glob.glob(local_path + '*' + subset + '*data*')[0]
    elif pdb_version == 'v2015':
        self._url = 'dataset/pdbbind_v2015.tar.gz'
        data_path = root_dir_path + '/pdbbind_v2015.tar.gz'
        extracted_data_path = root_dir_path + '/pdbbind_v2015'
        download(_get_dgl_url(self._url), path=data_path, overwrite=False)
        extract_archive(data_path, extracted_data_path)
        if subset == 'core':
            index_label_file = extracted_data_path + '/v2015/INDEX_core_data.2013'
        elif subset == 'refined':
            index_label_file = extracted_data_path + '/v2015/INDEX_refined_data.2015'
        else:
            raise ValueError(
                'Expect subset to be either core or refined, got {}'.format(subset))
    elif pdb_version == 'v2007':
        self._url = 'dataset/pdbbind_v2007.tar.gz'
        data_path = root_dir_path + '/pdbbind_v2007.tar.gz'
        extracted_data_path = root_dir_path + '/pdbbind_v2007'
        download(_get_dgl_url(self._url), path=data_path, overwrite=False)
        extract_archive(data_path, extracted_data_path, overwrite=False)
        extracted_data_path += '/home/ubuntu'  # the v2007 archive has an extra layer
        # DataFrame for pdbbind_2007_agglomerative_split.txt
        self.agg_split = pd.read_csv(
            extracted_data_path + '/v2007/pdbbind_2007_agglomerative_split.txt')
        self.agg_split.rename(columns={
            'PDB ID': 'PDB_code',
            'Sequence-based assignment': 'sequence',
            'Structure-based assignment': 'structure'}, inplace=True)
        # Repair a PDB code that was mangled into scientific notation
        self.agg_split.loc[self.agg_split['PDB_code'] == '1.00E+66', 'PDB_code'] = '1e66'
        if subset == 'core':
            index_label_file = extracted_data_path + '/v2007/INDEX.2007.core.data'
        elif subset == 'refined':
            index_label_file = extracted_data_path + '/v2007/INDEX.2007.refined.data'
        else:
            raise ValueError(
                'Expect subset to be either core or refined, got {}'.format(subset))
    else:
        raise ValueError(
            'Expect pdb_version to be either v2015 or v2007, got {}'.format(pdb_version))
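
    # Parse the index file. Comment lines start with '#'; each data line holds
    # whitespace-separated fields: PDB code, resolution, release year,
    # -logKd/Ki, Kd/Ki, plus version-specific trailing fields.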
    contents = []
    with open(index_label_file, 'r') as f:
        for line in f:
            # Skip blank lines and comments
            if not line.strip() or line.startswith('#'):
                continue
            splitted_elements = line.split()
            if pdb_version == 'v2015':
                if len(splitted_elements) == 8:
                    # Drop the '//' separator between labels and references
                    contents.append(splitted_elements[:5] + splitted_elements[6:])
                else:
                    print('Incorrect data format.')
                    print(splitted_elements)
            elif pdb_version == 'v2007':
                if len(splitted_elements) == 6:
                    contents.append(splitted_elements)
                else:
                    # Cluster IDs may contain spaces; re-join the trailing fields
                    contents.append(splitted_elements[:5] +
                                    [' '.join(splitted_elements[5:])])
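
    # Each entry of contents now describes one complex: 7 fields for v2015
    # (with the '//' marker dropped) and 6 fields for v2007.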
    if pdb_version == 'v2015':
        self.df = pd.DataFrame(contents, columns=(
            'PDB_code', 'resolution', 'release_year',
            '-logKd/Ki', 'Kd/Ki', 'reference', 'ligand_name'))
    elif pdb_version == 'v2007':
        self.df = pd.DataFrame(contents, columns=(
            'PDB_code', 'resolution', 'release_year',
            '-logKd/Ki', 'Kd/Ki', 'cluster_ID'))
    # Remove the core set from the refined set when requested, since the core
    # set is commonly held out for evaluation
    if remove_coreset_from_refinedset and subset == 'refined':
        if local_path:
            core_path = glob.glob(local_path + '*core*data*')[0]
        elif pdb_version == 'v2015':
            core_path = extracted_data_path + '/v2015/INDEX_core_data.2013'
        elif pdb_version == 'v2007':
            core_path = extracted_data_path + '/v2007/INDEX.2007.core.data'
        core_pdbs = set()
        with open(core_path, 'r') as f:
            for line in f:
                fields = line.strip().split()
                if fields and not line.startswith('#'):
                    core_pdbs.add(fields[0])
        self.df = self.df[~self.df['PDB_code'].isin(core_pdbs)]
    pdbs = self.df['PDB_code'].tolist()
    if local_path:
        pdb_path = local_path
    else:
        pdb_path = os.path.join(extracted_data_path, pdb_version)
    print('Loading PDBBind data from', pdb_path)
    self.ligand_files = [
        os.path.join(pdb_path, pdb, '{}_ligand.sdf'.format(pdb)) for pdb in pdbs]
    if load_binding_pocket:
        self.protein_files = [
            os.path.join(pdb_path, pdb, '{}_pocket.pdb'.format(pdb)) for pdb in pdbs]
    else:
        self.protein_files = [
            os.path.join(pdb_path, pdb, '{}_protein.pdb'.format(pdb)) for pdb in pdbs]
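
For context, this helper runs as part of dataset construction. A minimal usage
sketch, assuming the public dgllife.data.PDBBind class exposes the subset,
pdb_version and load_binding_pocket arguments seen above; featurization
arguments are omitted and left at their defaults:

from dgllife.data import PDBBind

# Download and parse the v2015 core set; pass local_path to reuse a local copy.
dataset = PDBBind(subset='core', pdb_version='v2015', load_binding_pocket=True)
print(len(dataset))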