in src/analysis/notebook_utils.py [0:0]
def merge_chunked_msa(
    results: Sequence[Mapping[str, Any]],
    max_hits: Optional[int] = None
) -> parsers.Msa:
  """Merges chunked database hits together into hits for the full database.

  Args:
    results: One mapping per database chunk. Each mapping must have a 'sto'
      key with the chunk's Stockholm-format alignment and a 'tbl' key with
      the corresponding Jackhmmer tblout output (used to recover e-values).
    max_hits: If not None, truncate the merged MSA to at most this many
      sequences.

  Returns:
    A single `parsers.Msa` containing the hits from all chunks, sorted by
    e-value (ascending). The query sequence is kept from the first chunk only.

  Raises:
    ValueError: If `results` is empty.
    KeyError: Presumably, if a sequence name in an alignment has no matching
      e-value entry in the chunk's tblout — TODO confirm against parsers.
  """
  if not results:
    # Without this guard the final zip(*...) fails with an obscure
    # "not enough values to unpack" ValueError.
    raise ValueError('Cannot merge an empty sequence of chunk results.')
  unsorted_results = []
  for chunk_index, chunk in enumerate(results):
    msa = parsers.parse_stockholm(chunk['sto'])
    e_values_dict = parsers.parse_e_values_from_tblout(chunk['tbl'])
    # Jackhmmer lists sequences as <sequence name>/<residue from>-<residue to>,
    # while tblout keys on the bare sequence name, so strip the /from-to part.
    e_values = [e_values_dict[t.partition('/')[0]] for t in msa.descriptions]
    chunk_results = zip(
        msa.sequences, msa.deletion_matrix, msa.descriptions, e_values)
    if chunk_index != 0:
      next(chunk_results)  # Only take query (first hit) from the first chunk.
    unsorted_results.extend(chunk_results)
  # sorted() is stable, so entries that tie on e-value keep their insertion
  # order (first chunk's entries first).
  sorted_by_evalue = sorted(unsorted_results, key=lambda x: x[-1])
  merged_sequences, merged_deletion_matrix, merged_descriptions, _ = zip(
      *sorted_by_evalue)
  merged_msa = parsers.Msa(sequences=merged_sequences,
                           deletion_matrix=merged_deletion_matrix,
                           descriptions=merged_descriptions)
  if max_hits is not None:
    merged_msa = merged_msa.truncate(max_seqs=max_hits)
  return merged_msa