in src/biotoolbox/contact_map_builder.py [0:0]
def generate_map_for_pdb(self, structure_container):
aligner = Align.PairwiseAligner()
contact_maps = ContactMapContainer()
model = structure_container.structure[0]
for chain_name in structure_container.chains:
chain = structure_container.chains[chain_name]
contact_maps.with_chain(chain_name)
self.speak(f"\nProcessing chain {chain_name}")
if chain['seqres-seq'] is not None and len(chain['seqres-seq']) > 0:
contact_maps.with_method_for_chain(chain_name, ALIGNED_BY_SEQRES)
seqres_seq = chain['seqres-seq']
atom_seq = chain['atom-seq']
alignment = aligner.align(seqres_seq, atom_seq)
specific_alignment = next(alignment)
self.speak(f"Seqres seq: {seqres_seq}",
f"Atom seq: {atom_seq}",
specific_alignment, sep='\n')
contact_maps.with_alignment_for_chain(chain_name, specific_alignment)
# It's actually much easier to just have biopython generate the string alignment
# and use that as a guide.
pattern = specific_alignment.__str__().split("\n")
aligned_seqres_seq, mask, aligned_atom_seq = pattern[:3]
# Build a list of residues that we do have atoms for.
residues = model[chain_name].get_residues()
reindexed_residues = list(model[chain_name].get_residues())
final_residue_list = []
picked_residues = 0
non_canonicals_or_het = 0
for i in range(len(aligned_atom_seq)):
if aligned_seqres_seq[i] == '-':
# This is an inserted residue from the aligner that doesn't actually match any
# seqres line. Don't even insert a None.
continue
current_aligned_atom_residue_letter = aligned_atom_seq[i]
# atom seq has a letter and the mask shows it corresponds to a seqres item
if current_aligned_atom_residue_letter != '-' and mask[i] == '|':
candidate_residue = next((x for x in reindexed_residues[picked_residues:picked_residues + 5] if
correct_residue(x, current_aligned_atom_residue_letter)), None)
if candidate_residue is None:
# The right answer is probably 'None' but we need to know why.
residue = reindexed_residues[picked_residues]
if residue.id[0].startswith('H_'):
non_canonicals_or_het += 1
else:
picked_residues += 1
final_residue_list.append(candidate_residue)
else:
final_residue_list.append(None)
final_seq_three_letter_codes = ''.join(
[r.resname if r is not None else 'XXX' for r in final_residue_list])
final_seq_one_letter_codes = seq1(final_seq_three_letter_codes, undef_code='-',
custom_map=protein_letters_3to1)
self.speak(f"Final [len of seq {len(seqres_seq)}] [len of result {len(final_seq_one_letter_codes)}] "
f"[len of final residue list {len(final_residue_list)}]:\n{final_seq_one_letter_codes}")
if self.pedantic and len(final_residue_list) != len(seqres_seq):
raise Exception(
f"Somehow the final residue list {len(final_residue_list)} doesn't match the size of the SEQRES seq {len(seqres_seq)}")
if self.pedantic and (len(seqres_seq) != len(final_seq_one_letter_codes) != len(final_residue_list)):
raise Exception('The length of the SEQRES seq != length of final_seq_one_letter_codes != length of final residue list')
sanity_check = aligned_atom_seq.replace('X', '')
if self.pedantic and sanity_check != final_seq_one_letter_codes:
print(f"sanity_check {sanity_check}")
print(f"final_seq {final_seq_one_letter_codes}")
count = sum(1 for a, b in zip(sanity_check, final_seq_one_letter_codes) if a != b)
# While going through the data we found some _very_ large structures in the PDB.
# Some of them have massive interior chains w/ tons of missing data. In this case
# we're basically just saying we did what we could, passing the data along and saying
# we still were in pedantic mode.
missing_residue_heuristic = sanity_check.count('-') / len(sanity_check)
missing_residue_heuristic_2 = final_seq_one_letter_codes.count('-') / len(final_seq_one_letter_codes)
if count == non_canonicals_or_het:
# Add a message about this.
print(f"Warning: The final sequence and the sanity check were different, but the difference equals the number of HETATMs or non-canonical residues. _Probably_ OK.")
elif missing_residue_heuristic >= 0.5 or missing_residue_heuristic_2 >= 0.5:
print(f"Warning: The final sequence and the sanity check were different. Over 50% of the chain is unresolved. Nothing we can do about it.")
else:
print("Vlada")
# raise Exception(
# f'The final one letter SEQ generated from residues does not match the aligned atom seq (Diff count {count} but HETATM {non_canonicals_or_het})')
contact_maps.with_final_seq_for_chain(chain_name, final_seq_one_letter_codes)
contact_maps.with_chain_seq(chain_name, seqres_seq)
contact_maps.with_map_for_chain(chain_name,
self.__residue_list_to_contact_map(final_residue_list, len(seqres_seq)))
else:
contact_maps.with_method_for_chain(chain_name, ATOMS_ONLY)
atom_seq = chain['atom-seq']
residues = model[chain_name].get_residues()
final_residue_list = []
missing_alpha_carbons = []
for r in residues:
try:
_ = r["CA"]
final_residue_list.append(r)
except KeyError:
missing_alpha_carbons.append(r)
# Sanity checks
final_seq_three_letter_codes = ''.join(
[r.resname if r is not None else 'XXX' for r in final_residue_list])
final_seq_one_letter_codes = seq1(final_seq_three_letter_codes, undef_code='-',
custom_map=protein_letters_3to1)
# print(final_seq_one_letter_codes)
corrected_atom_seq = final_seq_one_letter_codes
# End sanity checks
contact_maps.with_chain_seq(chain_name, corrected_atom_seq)
contact_maps.with_map_for_chain(chain_name,
self.__residue_list_to_contact_map(final_residue_list, len(corrected_atom_seq)))
return contact_maps