def parse_hmmsearch_a3m()

in src/analysis/parsers.py [0:0]


def parse_hmmsearch_a3m(query_sequence: str,
                        a3m_string: str,
                        skip_first: bool = True) -> Sequence[TemplateHit]:
  """Parses an a3m string produced by hmmsearch.

  Args:
    query_sequence: The query sequence.
    a3m_string: The a3m string produced by hmmsearch.
    skip_first: Whether to skip the first sequence in the a3m string.

  Returns:
    A sequence of `TemplateHit` results.
  """
  # Zip the descriptions and MSAs together, skip the first query sequence.
  parsed_a3m = list(zip(*parse_fasta(a3m_string)))
  if skip_first:
    parsed_a3m = parsed_a3m[1:]

  indices_query = _get_indices(query_sequence, start=0)

  hits = []
  for i, (hit_sequence, hit_description) in enumerate(parsed_a3m, start=1):
    if 'mol:protein' not in hit_description:
      continue  # Skip non-protein chains.
    metadata = _parse_hmmsearch_description(hit_description)
    # Aligned columns are only the match states.
    aligned_cols = sum([r.isupper() and r != '-' for r in hit_sequence])
    indices_hit = _get_indices(hit_sequence, start=metadata.start - 1)

    hit = TemplateHit(
        index=i,
        name=f'{metadata.pdb_id}_{metadata.chain}',
        aligned_cols=aligned_cols,
        sum_probs=None,
        query=query_sequence,
        hit_sequence=hit_sequence.upper(),
        indices_query=indices_query,
        indices_hit=indices_hit,
    )
    hits.append(hit)

  return hits