in aws-blog-athena-genomics/etl/thousand_genomes/create_trimmed_parquet.py [0:0]
def convert_alleles_to_genotypes(record):
    '''
    Convert a record's allele calls into an unphased genotype row.

    Each entry of ``record.alleles`` is mapped to a genotype code
    ('Ref' -> '0', 'Alt' -> '1', 'OtherAlt' -> '2'); entries with any other
    value are dropped.  The codes are then sorted, which discards the
    original allele order (i.e. phasing information is removed).

    This is a generator: it yields at most one dict, and yields nothing
    when there are no recognised alleles or when a one- or two-allele call
    is entirely reference.

    :param record: input record; must expose ``alleles`` as an attribute and
        'chromosome', 'start', 'end', 'ref', 'alt' and 'sampleId' via item
        access (presumably a Spark Row -- confirm against the caller)
    :return: generator over the reduced record that will be used for
        analysis, with keys chromosome, startposition, endposition,
        referenceallele, alternateallele, sampleid, genotype0, genotype1
        (genotype1 is None when only one allele was recognised)
    '''
    # Every code is a string so the list below sorts consistently; mixing
    # int and str values makes sorted() raise TypeError on Python 3.
    allele_to_genotype = {'Ref': '0', 'Alt': '1', 'OtherAlt': '2'}

    genotypes = sorted(
        allele_to_genotype[allele]
        for allele in record.alleles
        if allele in allele_to_genotype
    )

    # Ignore records with no recognised calls, or where the one/two calls
    # present are all reference.  Records with more than two calls are not
    # filtered here; only their two lowest codes are emitted below.
    if len(genotypes) <= 2 and all(code == '0' for code in genotypes):
        return

    yield {
        'chromosome': record['chromosome'],
        'startposition': record['start'],
        'endposition': record['end'],
        'referenceallele': record['ref'],
        'alternateallele': record['alt'],
        'sampleid': record['sampleId'],
        'genotype0': genotypes[0],
        'genotype1': genotypes[1] if len(genotypes) > 1 else None,
    }