def convert_alleles_to_genotypes()

in aws-blog-athena-genomics/etl/thousand_genomes/create_trimmed_parquet.py [0:0]


def convert_alleles_to_genotypes(record):
    '''
    Convert a record's allele calls into a reduced, unphased genotype record.

    Each allele label in ``record.alleles`` is translated to a genotype code
    ('Ref' -> '0', 'Alt' -> '1', 'OtherAlt' -> '2'); labels that are not in
    that mapping are dropped. The codes are then sorted, which removes the
    phasing information.

    Nothing is yielded when no allele could be mapped or when every mapped
    allele is a reference call. Only the first two sorted codes are emitted;
    any further codes are not part of the output record.

    :param record: input record; ``alleles`` is read as an attribute, while
        ``chromosome``, ``start``, ``end``, ``ref``, ``alt`` and ``sampleId``
        are read via item access
    :return: generator yielding at most one reduced record (dict) that will
        be used for analysis; ``genotype1`` is None when only one allele
        was mapped
    '''
    # Every code is a string so that the list sorts consistently (mixing int
    # and str raises TypeError on Python 3) and both genotype columns always
    # carry the same type.
    allele_to_genotype = {'Ref': '0', 'Alt': '1', 'OtherAlt': '2'}
    genotypes = sorted(
        allele_to_genotype[allele]
        for allele in record.alleles
        if allele in allele_to_genotype
    )

    # Ignore when there are no genotypes or all are reference calls
    # (all() is True for an empty list, so this covers both cases).
    if all(genotype == '0' for genotype in genotypes):
        return

    gt0 = genotypes[0]
    gt1 = genotypes[1] if len(genotypes) > 1 else None
    yield {
        'chromosome': record['chromosome'],
        'startposition': record['start'],
        'endposition': record['end'],
        'referenceallele': record['ref'],
        'alternateallele': record['alt'],
        'sampleid': record['sampleId'],
        'genotype0': gt0,
        'genotype1': gt1,
    }