"""Minor allele frequency (MAF) calculation.
This module provides a few competing strategies for obtaining MAF data from
a VFC file.
"""
import collections
import pysam
from varona import bcftools, enum
[docs]
class MafMethod(enum.CiStrEnum):
"""Enum for the MAF calculation method.
Values:
+-----------+----------------------------------------------+
| Value | Description |
+===========+==============================================+
| FR | Use the FR field in the info section of the |
| | variant to calculate the MAF. |
+-----------+----------------------------------------------+
| BCFTOOLS | Only available if non-pysam bcftools is |
| | installed. Use the MAF |
| | |
| | field in the info section of the variant to |
| | calculate the MAF, which |
| | |
| | is added by ``bcftools +fill-tags``. |
+-----------+----------------------------------------------+
| SAMPLES | Description of SAMPLES method |
+-----------+----------------------------------------------+
Note: The BCFTOOLS value is conditionally available based on the
presence of non-pysam bcftools.
"""
FR = "FR"
SAMPLES = "SAMPLES"
# Conditionally add the INFO method if non-pysam bcftools
# is available.
if bcftools.HAVE_BCFTOOLS:
BCFTOOLS = "BCFTOOLS"
[docs]
def maf_from_fr(variant: pysam.VariantRecord) -> float:
"""Compute the MAF from the "FR" info value in a variant.
:param variant: A :class:`pysam.VariantRecord` object.
:return: The MAF value derived from the "FR" info value.
:raises: :class:`KeyError` if the "FR" key is not found in the info section.
"""
# the FR values focus on the alt alleles but we need the
# reference allele frequency too
af_list = list(variant.info["FR"])
ref_af = 1 - sum(af_list)
af_list.append(ref_af)
af_list.sort(reverse=True)
# it'll be the second-highest
return af_list[1]
[docs]
def maf_from_samples(variant: pysam.VariantRecord) -> float:
"""Compute the MAF from the sample genotypes in a variant.
This function assumes that the genotypes are in the "GT" key of the
samples section of the variant.
:param variant: A :class:`pysam.VariantRecord` object.
:return: The MAF value.
:raises: KeyError if the "GT" key is not found in the samples section.
"""
var_allele_count = len(variant.alleles)
samp_allele_counts = collections.Counter()
# for some reason, iterating over samples just gives the sample name
for ix in range(len(variant.samples)):
samp_allele_counts.update(variant.samples[ix]["GT"])
num_alleles = samp_allele_counts.total()
af_list = [
float(samp_allele_counts.get(i, 0)) / float(num_alleles)
for i in range(var_allele_count)
]
af_list.sort(reverse=True)
return af_list[1]
[docs]
def maf_from_method(variant: pysam.VariantRecord, method: MafMethod) -> float:
"""A dispatcher for :func:`maf_from_fr`, :func:`maf_from_bcftools`, and :func:`maf_from_samples`.
:param variant: A :class:`pysam.VariantRecord` object.
:param method: The MAF calculation method.
:return: The MAF value.
:raises: KeyError if the method is not recognized.
"""
if method == MafMethod.FR:
return maf_from_fr(variant)
elif method == MafMethod.BCFTOOLS:
return maf_from_bcftools(variant)
return maf_from_samples(variant)