WGS Variant Calling

Overview

Whole-genome sequencing (WGS) variant calling identifies single-nucleotide variants (SNVs), insertions, and deletions (indels) from short-read sequencing data. The workflow follows the GATK Best Practices for germline variant discovery: reads are trimmed, aligned to a reference genome with BWA-MEM2, deduplicated, and passed through GATK HaplotypeCaller in GVCF mode. Per-sample GVCFs are combined and jointly genotyped, followed by variant quality score recalibration (VQSR) and functional annotation with Ensembl VEP.

Pipeline Steps

  1. Trim adapters and low-quality bases with fastp.

  2. Align reads to the reference genome using BWA-MEM2.

  3. Sort and index alignments with samtools.

  4. Mark PCR duplicates with sambamba.

  5. Call variants per sample in GVCF mode using GATK HaplotypeCaller.

  6. Combine per-sample GVCFs with GATK CombineGVCFs.

  7. Jointly genotype all samples with GATK GenotypeGVCFs.

  8. Apply variant quality score recalibration (VQSR) for SNPs.

  9. Annotate variants with Ensembl VEP.

  10. Aggregate QC metrics into a MultiQC report.

Implementation

# Snakefile -- WGS Germline Variant Calling (GATK Best Practices)
configfile: "config.yaml"

SAMPLES = config["samples"]
REF     = config["reference"]
KNOWN   = config["known_sites"]

rule all:
    input:
        "results/annotation/all_samples.annotated.vcf",
        "results/multiqc/multiqc_report.html"

rule fastp_trim:
    input:
        r1="data/{sample}_R1.fastq.gz",
        r2="data/{sample}_R2.fastq.gz"
    output:
        r1="results/trimmed/{sample}_R1.trimmed.fastq.gz",
        r2="results/trimmed/{sample}_R2.trimmed.fastq.gz",
        json="results/trimmed/{sample}_fastp.json",
        html="results/trimmed/{sample}_fastp.html"
    threads: 4
    conda: "envs/qc.yaml"
    shell:
        """
        fastp -i {input.r1} -I {input.r2} \
          -o {output.r1} -O {output.r2} \
          -j {output.json} -h {output.html} \
          --detect_adapter_for_pe --thread {threads}
        """

rule bwa_mem2_align:
    input:
        r1="results/trimmed/{sample}_R1.trimmed.fastq.gz",
        r2="results/trimmed/{sample}_R2.trimmed.fastq.gz",
        ref=REF
    output:
        bam="results/aligned/{sample}.sorted.bam"
    threads: 16
    params:
        rg=lambda wc: f"@RG\\tID:{wc.sample}\\tSM:{wc.sample}\\tPL:ILLUMINA"
    conda: "envs/align.yaml"
    shell:
        """
        bwa-mem2 mem -t {threads} -R '{params.rg}' {input.ref} \
          {input.r1} {input.r2} \
          | samtools sort -@ 4 -o {output.bam} -
        samtools index {output.bam}
        """

rule mark_duplicates:
    input:
        "results/aligned/{sample}.sorted.bam"
    output:
        bam="results/dedup/{sample}.dedup.bam",
        metrics="results/dedup/{sample}.dup_metrics.txt"
    threads: 8
    conda: "envs/align.yaml"
    shell:
        """
        sambamba markdup -t {threads} \
          {input} {output.bam} 2> {output.metrics}
        """

rule haplotype_caller:
    input:
        bam="results/dedup/{sample}.dedup.bam",
        ref=REF
    output:
        gvcf="results/gvcf/{sample}.g.vcf.gz"
    threads: 4
    conda: "envs/gatk.yaml"
    shell:
        """
        gatk HaplotypeCaller \
          -R {input.ref} -I {input.bam} \
          -O {output.gvcf} -ERC GVCF \
          --native-pair-hmm-threads {threads}
        """

rule combine_gvcfs:
    input:
        gvcfs=expand("results/gvcf/{sample}.g.vcf.gz",
                      sample=SAMPLES),
        ref=REF
    output:
        "results/genotyped/combined.g.vcf.gz"
    params:
        gvcf_args=lambda wc, input: " ".join(
            [f"-V {g}" for g in input.gvcfs])
    conda: "envs/gatk.yaml"
    shell:
        """
        gatk CombineGVCFs -R {input.ref} {params.gvcf_args} \
          -O {output}
        """

rule genotype_gvcfs:
    input:
        gvcf="results/genotyped/combined.g.vcf.gz",
        ref=REF
    output:
        "results/genotyped/all_samples.vcf.gz"
    conda: "envs/gatk.yaml"
    shell:
        """
        gatk GenotypeGVCFs -R {input.ref} -V {input.gvcf} \
          -O {output}
        """

rule vqsr_snps:
    input:
        vcf="results/genotyped/all_samples.vcf.gz",
        ref=REF
    output:
        recal="results/vqsr/snps.recal",
        tranches="results/vqsr/snps.tranches",
        vcf="results/vqsr/snps_recalibrated.vcf.gz"
    params:
        hapmap=config["hapmap"],
        omni=config["omni"],
        g1k=config["g1k_snps"],
        dbsnp=config["dbsnp"]
    conda: "envs/gatk.yaml"
    shell:
        """
        gatk VariantRecalibrator -R {input.ref} -V {input.vcf} \
          --resource:hapmap,known=false,training=true,truth=true,prior=15.0 {params.hapmap} \
          --resource:omni,known=false,training=true,truth=false,prior=12.0 {params.omni} \
          --resource:1000G,known=false,training=true,truth=false,prior=10.0 {params.g1k} \
          --resource:dbsnp,known=true,training=false,truth=false,prior=2.0 {params.dbsnp} \
          -an QD -an MQ -an MQRankSum -an ReadPosRankSum -an FS -an SOR \
          -mode SNP -O {output.recal} --tranches-file {output.tranches}

        gatk ApplyVQSR -R {input.ref} -V {input.vcf} \
          --recal-file {output.recal} --tranches-file {output.tranches} \
          --truth-sensitivity-filter-level 99.5 -mode SNP \
          -O {output.vcf}
        """

rule vep_annotate:
    input:
        "results/vqsr/snps_recalibrated.vcf.gz"
    output:
        "results/annotation/all_samples.annotated.vcf"
    threads: 4
    conda: "envs/annotation.yaml"
    shell:
        """
        vep --input_file {input} --output_file {output} \
          --format vcf --vcf --fork {threads} \
          --cache --offline --assembly GRCh38 \
          --sift b --polyphen b --symbol --numbers \
          --canonical --biotype --af --af_1kg --af_gnomade
        """

rule multiqc:
    input:
        expand("results/trimmed/{sample}_fastp.json", sample=SAMPLES),
        expand("results/dedup/{sample}.dup_metrics.txt", sample=SAMPLES)
    output:
        "results/multiqc/multiqc_report.html"
    conda: "envs/qc.yaml"
    shell:
        "multiqc results/ -o results/multiqc/ --force"

See Also