def deduplicate_stockholm_msa()

in src/analysis/parsers.py [0:0]


def deduplicate_stockholm_msa(stockholm_msa: str) -> str:
  """Remove duplicate sequences (ignoring insertions wrt query)."""
  sequence_dict = collections.defaultdict(str)

  # First we must extract all sequences from the MSA.
  for line in stockholm_msa.splitlines():
    # Only consider the alignments - ignore reference annotation, empty lines,
    # descriptions or markup.
    if line.strip() and not line.startswith(('#', '//')):
      line = line.strip()
      seqname, alignment = line.split()
      sequence_dict[seqname] += alignment

  seen_sequences = set()
  seqnames = set()
  # First alignment is the query.
  query_align = next(iter(sequence_dict.values()))
  mask = [c != '-' for c in query_align]  # Mask is False for insertions.
  for seqname, alignment in sequence_dict.items():
    # Apply mask to remove all insertions from the string.
    masked_alignment = ''.join(itertools.compress(alignment, mask))
    if masked_alignment in seen_sequences:
      continue
    else:
      seen_sequences.add(masked_alignment)
      seqnames.add(seqname)

  filtered_lines = []
  for line in stockholm_msa.splitlines():
    if _keep_line(line, seqnames):
      filtered_lines.append(line)

  return '\n'.join(filtered_lines) + '\n'