Source code for varona.utils.fix_vcf

"""VCF file utilities, mainly to fix problems.
"""

import importlib.resources as pkg_resources

import pysam

from varona import ensembl


[docs] def vcf_header_inject_contigs( header: pysam.VariantHeader, assembly: ensembl.Assembly ) -> pysam.VariantHeader: """Inject contigs into a VCF header. Some VCF files do not have contigs in the header. This causes problems with pysam when trying to write out a new VCF. This function shoe-horns contigs into the headers. .. warning:: This function only works for human assemblies GRCh37 and GRCh38, and for GRCh37 it uses the 1000 genomes project contigs. For that reason, non-1000-genome-project-GRCh37 VCF files are not quite compatible at this time. :param header: The VCF header. :return: The VCF header with contigs injected. """ ret = header.copy() if assembly == ensembl.Assembly.GRCH37: data_path = "human_g1k_v37.fasta.fai" elif assembly == ensembl.Assembly.GRCH38: data_path = "human_grch38.fasta.fai" else: raise ValueError(f"Unknown assembly: {assembly}") with pkg_resources.open_text("varona.data", "human_g1k_v37.fasta.fai") as f: contigs = {} for line in f: name, length = line.split("\t")[:2] contigs[name] = int(length) # check if the contigs are already in the header for name, length in contigs.items(): if name in ret.contigs: if ret.contigs[name] != length: raise ValueError( f"Contig {name} already in header with different length" ) else: ret.contigs.add(name, length) return ret