def _read_data_files()

in python/dgllife/data/pdbbind.py [0:0]


    def _read_data_files(self, pdb_version, subset, load_binding_pocket, remove_coreset_from_refinedset, local_path):
        """Download/extract the PDBBind files and build the index of complexes.

        Populates:

        * ``self.df`` -- one row per complex with the index-file columns
          (PDB code, resolution, release year, -logKd/Ki, Kd/Ki, ...).
        * ``self.ligand_files`` / ``self.protein_files`` -- per-complex paths
          to the ligand SDF files and the protein (or binding-pocket) PDB files.
        * ``self.agg_split`` -- (v2007 download only) DataFrame with the
          agglomerative split assignments.
        * ``self._url`` -- (download only) the DGL dataset URL suffix used.

        Parameters
        ----------
        pdb_version : str
            Either 'v2015' or 'v2007'.
        subset : str
            Either 'core' or 'refined'.
        load_binding_pocket : bool
            If True, point ``self.protein_files`` at the ``*_pocket.pdb``
            files instead of the full ``*_protein.pdb`` structures.
        remove_coreset_from_refinedset : bool
            If True and ``subset == 'refined'``, drop complexes that also
            appear in the core set.
        local_path : str or None
            If truthy, read everything from this directory instead of
            downloading and extracting the archives.

        Raises
        ------
        ValueError
            If ``pdb_version`` or ``subset`` is not supported. (Previously an
            unsupported version surfaced as a NameError/AttributeError later.)
        """
        if pdb_version not in ('v2015', 'v2007'):
            raise ValueError(
                "Expect pdb_version to be 'v2015' or 'v2007', got {}".format(pdb_version))

        if local_path:
            if local_path[-1] != '/':
                local_path += '/'
            # Pick the first file matching e.g. '*core*data*' in the local dir.
            index_label_file = glob.glob(local_path + '*' + subset + '*data*')[0]
        elif pdb_version == 'v2015':
            # get_download_dir() is only needed (and only called) when we
            # actually download -- not for local data.
            root_dir_path = get_download_dir()
            self._url = 'dataset/pdbbind_v2015.tar.gz'
            data_path = root_dir_path + '/pdbbind_v2015.tar.gz'
            extracted_data_path = root_dir_path + '/pdbbind_v2015'
            download(_get_dgl_url(self._url), path=data_path, overwrite=False)
            extract_archive(data_path, extracted_data_path)

            if subset == 'core':
                index_label_file = extracted_data_path + '/v2015/INDEX_core_data.2013'
            elif subset == 'refined':
                index_label_file = extracted_data_path + '/v2015/INDEX_refined_data.2015'
            else:
                raise ValueError('Expect the subset_choice to be either core or refined, got {}'.format(subset))
        else:  # pdb_version == 'v2007'
            root_dir_path = get_download_dir()
            self._url = 'dataset/pdbbind_v2007.tar.gz'
            data_path = root_dir_path + '/pdbbind_v2007.tar.gz'
            extracted_data_path = root_dir_path + '/pdbbind_v2007'
            download(_get_dgl_url(self._url), path=data_path, overwrite=False)
            extract_archive(data_path, extracted_data_path, overwrite=False)
            extracted_data_path += '/home/ubuntu' # extra layer inside the archive

            # DataFrame containing the pdbbind_2007_agglomerative_split.txt
            self.agg_split = pd.read_csv(extracted_data_path + '/v2007/pdbbind_2007_agglomerative_split.txt')
            self.agg_split.rename(columns={'PDB ID':'PDB_code', 'Sequence-based assignment':'sequence', 'Structure-based assignment':'structure'}, inplace=True)
            # Excel-style corruption of the PDB code '1e66' -> '1.00E+66'
            self.agg_split.loc[self.agg_split['PDB_code']=='1.00E+66', 'PDB_code'] = '1e66' # fix typo
            if subset == 'core':
                index_label_file = extracted_data_path + '/v2007/INDEX.2007.core.data'
            elif subset == 'refined':
                index_label_file = extracted_data_path + '/v2007/INDEX.2007.refined.data'
            else:
                raise ValueError('Expect the subset_choice to be either core or refined, got {}'.format(subset))

        # Parse the index file; lines whose first character is '#' are comments.
        contents = []
        with open(index_label_file, 'r') as f:
            for line in f.readlines():
                if line[0] != "#":
                    splitted_elements = line.split()
                    if pdb_version == 'v2015':
                        if len(splitted_elements) == 8:
                            # Ignore the '//' separator field
                            contents.append(splitted_elements[:5] + splitted_elements[6:])
                        else:
                            print('Incorrect data format.')
                            print(splitted_elements)
                    else:  # v2007
                        if len(splitted_elements) == 6:
                            contents.append(splitted_elements)
                        else:
                            # The cluster ID may contain spaces; re-join the tail.
                            contents.append(splitted_elements[:5] + [' '.join(splitted_elements[5:])])

        if pdb_version == 'v2015':
            self.df = pd.DataFrame(contents, columns=(
                'PDB_code', 'resolution', 'release_year',
                '-logKd/Ki', 'Kd/Ki', 'reference', 'ligand_name'))
        else:
            self.df = pd.DataFrame(contents, columns=(
                'PDB_code', 'resolution', 'release_year',
                '-logKd/Ki', 'Kd/Ki', 'cluster_ID'))

        # Remove the core set from the refined set if using refined
        if remove_coreset_from_refinedset and subset == 'refined':
            if local_path:
                core_path = glob.glob(local_path + '*core*data*')[0]
            elif pdb_version == 'v2015':
                core_path = extracted_data_path + '/v2015/INDEX_core_data.2013'
            else:
                core_path = extracted_data_path + '/v2007/INDEX.2007.core.data'

            # A set gives O(1) membership tests for the filter below.
            core_pdbs = set()
            with open(core_path, 'r') as f:
                for line in f:
                    fields = line.strip().split()
                    # Skip blank lines and comments. The old check
                    # (fields[0] != '#') missed headers like '#PDB code ...',
                    # where '#' is glued to the first token, and raised
                    # IndexError on blank lines.
                    if fields and not fields[0].startswith('#'):
                        core_pdbs.add(fields[0])

            # Vectorized filter; equivalent to the old positional loop since
            # self.df still has its default RangeIndex at this point.
            self.df = self.df[~self.df['PDB_code'].isin(core_pdbs)]

        pdbs = self.df['PDB_code'].tolist()

        if local_path:
            pdb_path = local_path
        else:
            pdb_path = os.path.join(extracted_data_path, pdb_version)
        print('Loading PDBBind data from', pdb_path)
        self.ligand_files = [os.path.join(pdb_path, pdb, '{}_ligand.sdf'.format(pdb)) for pdb in pdbs]
        if load_binding_pocket:
            self.protein_files = [os.path.join(pdb_path, pdb, '{}_pocket.pdb'.format(pdb)) for pdb in pdbs]
        else:
            self.protein_files = [os.path.join(pdb_path, pdb, '{}_protein.pdb'.format(pdb)) for pdb in pdbs]