"""High-level routines for the Varona library.
All of the functions and classes in this module are imported into the
top-level library namespace.
"""
import logging
import pathlib
import typing
import httpx
import polars as pl
import pysam
from varona import ensembl
# Module-level logger, registered under the explicit name "varona.varona".
logger = logging.getLogger("varona.varona")
def _vcf_rows(
    vcf_path: pathlib.Path, vcf_extractor: typing.Callable[[pysam.VariantRecord], dict]
):
    """Lazily produce one extracted row per record of a VCF file.

    The file is opened in read mode inside a ``with`` block, so it stays
    open only while the generator is being consumed.

    :param vcf_path: The path to the VCF file.
    :param vcf_extractor: Callable applied to every record; whatever it
        returns is yielded unchanged.
    :yields: The dictionary returned by ``vcf_extractor`` for each record.
    """
    with pysam.VariantFile(vcf_path, "r") as variants:
        # ``map`` is lazy, so records are still extracted one at a time.
        yield from map(vcf_extractor, variants)
[docs]
def vcf_dataframe(
    vcf_path: pathlib.Path,
    vcf_extractor: typing.Callable[[pysam.VariantRecord], dict],
    schema: dict[str, typing.Any] | pl.Schema | None = None,
) -> pl.DataFrame:
    """Build a DataFrame from a VCF file using a caller-supplied extractor.

    Every record in the file is passed through ``vcf_extractor`` and the
    resulting dictionaries become the rows of the DataFrame.

    .. code-block:: python

        import pathlib
        import polars as pl
        import pysam
        from varona import vcf_dataframe

        def example_extractor(record: pysam.VariantRecord) -> dict:
            return {
                "contig": record.contig,
                "pos": record.pos,
                "ref": record.ref,
                "alt": record.alts[0]
            }

        # Make a DataFrame from the VCF file. The columns are laid out by
        # the extractor function.
        vcf_path = pathlib.Path("/path/to/file.vcf")
        df = vcf_dataframe(vcf_path, example_extractor)
        print(df)
        ##shape: (5, 4)
        ##┌────────┬─────────┬─────┬─────┐
        ##│ contig ┆ pos     ┆ ref ┆ alt │
        ##│ ---    ┆ ---     ┆ --- ┆ --- │
        ##│ str    ┆ i64     ┆ str ┆ str │
        ##╞════════╪═════════╪═════╪═════╡
        ##│ 1      ┆ 1158631 ┆ A   ┆ G   │
        ##│ 1      ┆ 1246004 ┆ A   ┆ G   │
        ##│ 1      ┆ 1249187 ┆ G   ┆ A   │
        ##│ 1      ┆ 1261824 ┆ G   ┆ C   │
        ##│ 1      ┆ 1387667 ┆ C   ┆ G   │
        ##└────────┴─────────┴─────┴─────┘

    :param vcf_path: The path to the VCF file.
    :param vcf_extractor: The function to extract data from the VCF.
    :param schema: Optional schema for the DataFrame to help enforce column types.
    :return: DataFrame with the extracted data.
    """
    # Row dictionaries are generated on demand while the frame is built.
    extracted_rows = _vcf_rows(vcf_path, vcf_extractor)
    lazy_frame = pl.LazyFrame(extracted_rows, schema=schema)
    return lazy_frame.collect()
[docs]
def vep_api_dataframe(
    client: httpx.Client,
    loci_list: list[str],
    genome_assembly: ensembl.Assembly,
    api_extractor: typing.Callable[[dict], dict],
    schema: dict[str, typing.Any] | pl.Schema | None = None,
) -> pl.DataFrame:
    """Query the Ensembl VEP API and make a DataFrame using a provided extractor.

    Like :func:`vcf_dataframe`, this is a vehicle for a custom extractor
    function, here applied to the response dictionaries from the Ensembl
    VEP API. The caller must supply an :class:`httpx.Client`; the example
    below shows one way to set it up.

    .. code-block:: python

        import pathlib
        import polars as pl
        import httpx
        from varona import vep_api_dataframe, ensembl

        def example_extractor(response: dict) -> dict:
            return {
                "contig": response["seq_region_name"],
                "pos": response["start"],
                "type": response["variant_class"]
            }

        loci_list = [
            "1 1158631 . A G . . .",
            "1 91859795 . TATGTGA CATGTGA,CATGTGG . . .",
        ]
        with httpx.Client(
            limits=httpx.Limits(
                max_connections=5,
                max_keepalive_connections=5
            ),
            timeout=httpx.Timeout(float(300)),
        ) as client:
            api_df = vep_api_dataframe(
                client,
                loci_list,
                ensembl.Assembly.GRCH37,
                example_extractor
            )
        print(api_df)
        ##shape: (2, 3)
        ##┌────────┬──────────┬──────────────┐
        ##│ contig ┆ pos      ┆ type         │
        ##│ ---    ┆ ---      ┆ ---          │
        ##│ str    ┆ i64      ┆ str          │
        ##╞════════╪══════════╪══════════════╡
        ##│ 1      ┆ 1158631  ┆ SNV          │
        ##│ 1      ┆ 91859795 ┆ substitution │
        ##└────────┴──────────┴──────────────┘

    :param client: The HTTPX client to use for the API query.
    :param loci_list: The list of loci to query the API.
    :param genome_assembly: The genome assembly used in the Ensembl VEP API.
    :param api_extractor: The function to extract data from the VEP API response.
    :param schema: Optional schema for the DataFrame to help enforce column
        types. Accepts a plain mapping or a :class:`polars.Schema`, matching
        :func:`vcf_dataframe`.
    :return: A DataFrame with the data from the VEP API.
    """
    # The extractor is handed to the API helper as ``response_extractor``;
    # whatever the helper returns becomes the frame's row data.
    extracted_rows = ensembl.query_vep_api(
        client, loci_list, genome_assembly, response_extractor=api_extractor
    )
    return pl.LazyFrame(extracted_rows, schema=schema).collect()