def generate_map_for_pdb()

in src/biotoolbox/contact_map_builder.py [0:0]


    def generate_map_for_pdb(self, structure_container):

        aligner = Align.PairwiseAligner()
        contact_maps = ContactMapContainer()
        model = structure_container.structure[0]

        for chain_name in structure_container.chains:
            chain = structure_container.chains[chain_name]
            contact_maps.with_chain(chain_name)
            self.speak(f"\nProcessing chain {chain_name}")

            if chain['seqres-seq'] is not None and len(chain['seqres-seq']) > 0:
                contact_maps.with_method_for_chain(chain_name, ALIGNED_BY_SEQRES)
                seqres_seq = chain['seqres-seq']
                atom_seq   = chain['atom-seq']

                alignment = aligner.align(seqres_seq, atom_seq)
                specific_alignment = next(alignment)
                self.speak(f"Seqres seq: {seqres_seq}",
                           f"Atom seq:   {atom_seq}",
                           specific_alignment, sep='\n')

                contact_maps.with_alignment_for_chain(chain_name, specific_alignment)

                # It's actually much easier to just have biopython generate the string alignment
                # and use that as a guide.
                pattern = specific_alignment.__str__().split("\n")
                aligned_seqres_seq, mask, aligned_atom_seq = pattern[:3]

                # Build a list of residues that we do have atoms for.
                residues = model[chain_name].get_residues()
                reindexed_residues = list(model[chain_name].get_residues())
                final_residue_list = []

                picked_residues = 0
                non_canonicals_or_het = 0

                for i in range(len(aligned_atom_seq)):
                    if aligned_seqres_seq[i] == '-':
                        # This is an inserted residue from the aligner that doesn't actually match any
                        # seqres line. Don't even insert a None.
                        continue
                    current_aligned_atom_residue_letter = aligned_atom_seq[i]
                    #  atom seq has a letter and the mask shows it corresponds to a seqres item
                    if current_aligned_atom_residue_letter != '-' and mask[i] == '|':
                        candidate_residue = next((x for x in reindexed_residues[picked_residues:picked_residues + 5] if
                                                  correct_residue(x, current_aligned_atom_residue_letter)), None)

                        if candidate_residue is None:
                            # The right answer is probably 'None' but we need to know why.
                            residue = reindexed_residues[picked_residues]
                            if residue.id[0].startswith('H_'):
                                non_canonicals_or_het += 1
                        else:
                            picked_residues += 1

                        final_residue_list.append(candidate_residue)
                    else:
                        final_residue_list.append(None)

                final_seq_three_letter_codes = ''.join(
                    [r.resname if r is not None else 'XXX' for r in final_residue_list])
                final_seq_one_letter_codes = seq1(final_seq_three_letter_codes, undef_code='-',
                                                  custom_map=protein_letters_3to1)
                self.speak(f"Final [len of seq {len(seqres_seq)}] [len of result {len(final_seq_one_letter_codes)}] "
                           f"[len of final residue list {len(final_residue_list)}]:\n{final_seq_one_letter_codes}")

                if self.pedantic and len(final_residue_list) != len(seqres_seq):
                    raise Exception(
                        f"Somehow the final residue list {len(final_residue_list)} doesn't match the size of the SEQRES seq {len(seqres_seq)}")

                if self.pedantic and (len(seqres_seq) != len(final_seq_one_letter_codes) != len(final_residue_list)):
                    raise Exception('The length of the SEQRES seq != length of final_seq_one_letter_codes != length of final residue list')

                sanity_check = aligned_atom_seq.replace('X', '')
                if self.pedantic and sanity_check != final_seq_one_letter_codes:
                    print(f"sanity_check {sanity_check}")
                    print(f"final_seq    {final_seq_one_letter_codes}")
                    count = sum(1 for a, b in zip(sanity_check, final_seq_one_letter_codes) if a != b)
                    # While going through the data we found some _very_ large structures in the PDB.
                    # Some of them have massive interior chains w/ tons of missing data. In this case
                    # we're basically just saying we did what we could, passing the data along and saying
                    # we still were in pedantic mode.
                    missing_residue_heuristic = sanity_check.count('-') / len(sanity_check)
                    missing_residue_heuristic_2 = final_seq_one_letter_codes.count('-') / len(final_seq_one_letter_codes)
                    if count == non_canonicals_or_het:
                        # Add a message about this.
                        print(f"Warning: The final sequence and the sanity check were different, but the difference equals the number of HETATMs or non-canonical residues. _Probably_ OK.")
                    elif missing_residue_heuristic >= 0.5 or missing_residue_heuristic_2 >= 0.5:
                        print(f"Warning: The final sequence and the sanity check were different. Over 50% of the chain is unresolved. Nothing we can do about it.")
                    else:
                        print("Vlada")
                        # raise Exception(
                        #    f'The final one letter SEQ generated from residues does not match the aligned atom seq (Diff count {count} but HETATM {non_canonicals_or_het})')

                contact_maps.with_final_seq_for_chain(chain_name, final_seq_one_letter_codes)
                contact_maps.with_chain_seq(chain_name, seqres_seq)
                contact_maps.with_map_for_chain(chain_name,
                                                self.__residue_list_to_contact_map(final_residue_list, len(seqres_seq)))
            else:
                contact_maps.with_method_for_chain(chain_name, ATOMS_ONLY)
                atom_seq = chain['atom-seq']
                residues = model[chain_name].get_residues()

                final_residue_list = []
                missing_alpha_carbons = []
                for r in residues:
                    try:
                        _ = r["CA"]
                        final_residue_list.append(r)
                    except KeyError:
                        missing_alpha_carbons.append(r)

                # Sanity checks
                final_seq_three_letter_codes = ''.join(
                    [r.resname if r is not None else 'XXX' for r in final_residue_list])
                final_seq_one_letter_codes = seq1(final_seq_three_letter_codes, undef_code='-',
                                                  custom_map=protein_letters_3to1)
                # print(final_seq_one_letter_codes)
                corrected_atom_seq = final_seq_one_letter_codes
                # End sanity checks

                contact_maps.with_chain_seq(chain_name, corrected_atom_seq)
                contact_maps.with_map_for_chain(chain_name,
                                                self.__residue_list_to_contact_map(final_residue_list, len(corrected_atom_seq)))

        return contact_maps