def remove_empty_columns_from_stockholm_msa()

in src/analysis/parsers.py [0:0]


def remove_empty_columns_from_stockholm_msa(stockholm_msa: str) -> str:
  """Removes empty columns (dashes-only) from a Stockholm MSA."""
  processed_lines = {}
  unprocessed_lines = {}
  for i, line in enumerate(stockholm_msa.splitlines()):
    if line.startswith('#=GC RF'):
      reference_annotation_i = i
      reference_annotation_line = line
      # Reached the end of this chunk of the alignment. Process chunk.
      _, _, first_alignment = line.rpartition(' ')
      mask = []
      for j in range(len(first_alignment)):
        for _, unprocessed_line in unprocessed_lines.items():
          prefix, _, alignment = unprocessed_line.rpartition(' ')
          if alignment[j] != '-':
            mask.append(True)
            break
        else:  # Every row contained a hyphen - empty column.
          mask.append(False)
      # Add reference annotation for processing with mask.
      unprocessed_lines[reference_annotation_i] = reference_annotation_line

      if not any(mask):  # All columns were empty. Output empty lines for chunk.
        for line_index in unprocessed_lines:
          processed_lines[line_index] = ''
      else:
        for line_index, unprocessed_line in unprocessed_lines.items():
          prefix, _, alignment = unprocessed_line.rpartition(' ')
          masked_alignment = ''.join(itertools.compress(alignment, mask))
          processed_lines[line_index] = f'{prefix} {masked_alignment}'

      # Clear raw_alignments.
      unprocessed_lines = {}
    elif line.strip() and not line.startswith(('#', '//')):
      unprocessed_lines[i] = line
    else:
      processed_lines[i] = line
  return '\n'.join((processed_lines[i] for i in range(len(processed_lines))))