WGS Variant Calling
===================

Overview
--------

Whole-genome sequencing (WGS) variant calling identifies single-nucleotide
variants (SNVs), insertions, and deletions (indels) from short-read sequencing
data. The workflow follows the GATK Best Practices for germline variant
discovery: reads are trimmed, aligned to a reference genome with BWA-MEM2,
deduplicated, and passed through GATK HaplotypeCaller in GVCF mode. Per-sample
GVCFs are combined and jointly genotyped, followed by variant quality score
recalibration (VQSR) and functional annotation with Ensembl VEP.

Pipeline Steps
--------------

1. Trim adapters and low-quality bases with fastp.
2. Align reads to the reference genome using BWA-MEM2.
3. Sort and index alignments with samtools.
4. Mark PCR duplicates with sambamba.
5. Call variants per sample in GVCF mode using GATK HaplotypeCaller.
6. Combine per-sample GVCFs with GATK CombineGVCFs.
7. Jointly genotype all samples with GATK GenotypeGVCFs.
8. Apply variant quality score recalibration (VQSR) for SNPs.
9. Annotate variants with Ensembl VEP.
10. Aggregate QC metrics into a MultiQC report.

Implementation
--------------

.. tabs::

   .. tab:: Snakemake

      .. code-block:: python

         # Snakefile -- WGS Germline Variant Calling (GATK Best Practices)
         configfile: "config.yaml"

         SAMPLES = config["samples"]
         REF     = config["reference"]
         KNOWN   = config["known_sites"]

         rule all:
             input:
                 "results/annotation/all_samples.annotated.vcf",
                 "results/multiqc/multiqc_report.html"

         rule fastp_trim:
             input:
                 r1="data/{sample}_R1.fastq.gz",
                 r2="data/{sample}_R2.fastq.gz"
             output:
                 r1="results/trimmed/{sample}_R1.trimmed.fastq.gz",
                 r2="results/trimmed/{sample}_R2.trimmed.fastq.gz",
                 json="results/trimmed/{sample}_fastp.json",
                 html="results/trimmed/{sample}_fastp.html"
             threads: 4
             conda: "envs/qc.yaml"
             shell:
                 """
                 fastp -i {input.r1} -I {input.r2} \
                   -o {output.r1} -O {output.r2} \
                   -j {output.json} -h {output.html} \
                   --detect_adapter_for_pe --thread {threads}
                 """

         rule bwa_mem2_align:
             input:
                 r1="results/trimmed/{sample}_R1.trimmed.fastq.gz",
                 r2="results/trimmed/{sample}_R2.trimmed.fastq.gz",
                 ref=REF
             output:
                 bam="results/aligned/{sample}.sorted.bam"
             threads: 16
             params:
                 rg=lambda wc: f"@RG\\tID:{wc.sample}\\tSM:{wc.sample}\\tPL:ILLUMINA"
             conda: "envs/align.yaml"
             shell:
                 """
                 bwa-mem2 mem -t {threads} -R '{params.rg}' {input.ref} \
                   {input.r1} {input.r2} \
                   | samtools sort -@ 4 -o {output.bam} -
                 samtools index {output.bam}
                 """

         rule mark_duplicates:
             input:
                 "results/aligned/{sample}.sorted.bam"
             output:
                 bam="results/dedup/{sample}.dedup.bam",
                 metrics="results/dedup/{sample}.dup_metrics.txt"
             threads: 8
             conda: "envs/align.yaml"
             shell:
                 """
                 sambamba markdup -t {threads} \
                   {input} {output.bam} 2> {output.metrics}
                 """

         rule haplotype_caller:
             input:
                 bam="results/dedup/{sample}.dedup.bam",
                 ref=REF
             output:
                 gvcf="results/gvcf/{sample}.g.vcf.gz"
             threads: 4
             conda: "envs/gatk.yaml"
             shell:
                 """
                 gatk HaplotypeCaller \
                   -R {input.ref} -I {input.bam} \
                   -O {output.gvcf} -ERC GVCF \
                   --native-pair-hmm-threads {threads}
                 """

         rule combine_gvcfs:
             input:
                 gvcfs=expand("results/gvcf/{sample}.g.vcf.gz",
                               sample=SAMPLES),
                 ref=REF
             output:
                 "results/genotyped/combined.g.vcf.gz"
             params:
                 gvcf_args=lambda wc, input: " ".join(
                     [f"-V {g}" for g in input.gvcfs])
             conda: "envs/gatk.yaml"
             shell:
                 """
                 gatk CombineGVCFs -R {input.ref} {params.gvcf_args} \
                   -O {output}
                 """

         rule genotype_gvcfs:
             input:
                 gvcf="results/genotyped/combined.g.vcf.gz",
                 ref=REF
             output:
                 "results/genotyped/all_samples.vcf.gz"
             conda: "envs/gatk.yaml"
             shell:
                 """
                 gatk GenotypeGVCFs -R {input.ref} -V {input.gvcf} \
                   -O {output}
                 """

         rule vqsr_snps:
             input:
                 vcf="results/genotyped/all_samples.vcf.gz",
                 ref=REF
             output:
                 recal="results/vqsr/snps.recal",
                 tranches="results/vqsr/snps.tranches",
                 vcf="results/vqsr/snps_recalibrated.vcf.gz"
             params:
                 hapmap=config["hapmap"],
                 omni=config["omni"],
                 g1k=config["g1k_snps"],
                 dbsnp=config["dbsnp"]
             conda: "envs/gatk.yaml"
             shell:
                 """
                 gatk VariantRecalibrator -R {input.ref} -V {input.vcf} \
                   --resource:hapmap,known=false,training=true,truth=true,prior=15.0 {params.hapmap} \
                   --resource:omni,known=false,training=true,truth=false,prior=12.0 {params.omni} \
                   --resource:1000G,known=false,training=true,truth=false,prior=10.0 {params.g1k} \
                   --resource:dbsnp,known=true,training=false,truth=false,prior=2.0 {params.dbsnp} \
                   -an QD -an MQ -an MQRankSum -an ReadPosRankSum -an FS -an SOR \
                   -mode SNP -O {output.recal} --tranches-file {output.tranches}

                 gatk ApplyVQSR -R {input.ref} -V {input.vcf} \
                   --recal-file {output.recal} --tranches-file {output.tranches} \
                   --truth-sensitivity-filter-level 99.5 -mode SNP \
                   -O {output.vcf}
                 """

         rule vep_annotate:
             input:
                 "results/vqsr/snps_recalibrated.vcf.gz"
             output:
                 "results/annotation/all_samples.annotated.vcf"
             threads: 4
             conda: "envs/annotation.yaml"
             shell:
                 """
                 vep --input_file {input} --output_file {output} \
                   --format vcf --vcf --fork {threads} \
                   --cache --offline --assembly GRCh38 \
                   --sift b --polyphen b --symbol --numbers \
                   --canonical --biotype --af --af_1kg --af_gnomade
                 """

         rule multiqc:
             input:
                 expand("results/trimmed/{sample}_fastp.json", sample=SAMPLES),
                 expand("results/dedup/{sample}.dup_metrics.txt", sample=SAMPLES)
             output:
                 "results/multiqc/multiqc_report.html"
             conda: "envs/qc.yaml"
             shell:
                 "multiqc results/ -o results/multiqc/ --force"

   .. tab:: Nextflow

      .. code-block:: groovy

         #!/usr/bin/env nextflow
         nextflow.enable.dsl = 2

         params.reads     = "data/*_{R1,R2}.fastq.gz"
         params.ref       = "ref/GRCh38.fa"
         params.outdir    = "results"
         params.dbsnp     = "ref/dbsnp_156.vcf.gz"

         process FASTP {
             tag "${sample_id}"
             publishDir "${params.outdir}/trimmed", mode: 'copy'
             cpus 4
             input:  tuple val(sample_id), path(reads)
             output:
                 tuple val(sample_id), path("*_trimmed.fastq.gz"), emit: trimmed
                 path("*.json"), emit: json
             script:
             def (r1, r2) = reads
             """
             fastp -i ${r1} -I ${r2} \
               -o ${sample_id}_R1_trimmed.fastq.gz \
               -O ${sample_id}_R2_trimmed.fastq.gz \
               -j ${sample_id}_fastp.json \
               --detect_adapter_for_pe --thread ${task.cpus}
             """
         }

         process BWA_MEM2 {
             tag "${sample_id}"
             publishDir "${params.outdir}/aligned", mode: 'copy'
             cpus 16
             memory '32 GB'
             input:
                 tuple val(sample_id), path(reads)
                 path(ref)
             output:
                 tuple val(sample_id), path("*.sorted.bam"), path("*.sorted.bam.bai"), emit: bam
             script:
             def (r1, r2) = reads
             """
             bwa-mem2 mem -t ${task.cpus} \
               -R "@RG\\tID:${sample_id}\\tSM:${sample_id}\\tPL:ILLUMINA" \
               ${ref} ${r1} ${r2} \
               | samtools sort -@ 4 -o ${sample_id}.sorted.bam -
             samtools index ${sample_id}.sorted.bam
             """
         }

         process MARKDUP {
             tag "${sample_id}"
             publishDir "${params.outdir}/dedup", mode: 'copy'
             cpus 8
             input:  tuple val(sample_id), path(bam), path(bai)
             output: tuple val(sample_id), path("*.dedup.bam"), path("*.dedup.bam.bai"), emit: bam
             script:
             """
             sambamba markdup -t ${task.cpus} ${bam} ${sample_id}.dedup.bam
             samtools index ${sample_id}.dedup.bam
             """
         }

         process HAPLOTYPE_CALLER {
             tag "${sample_id}"
             publishDir "${params.outdir}/gvcf", mode: 'copy'
             cpus 4
             memory '16 GB'
             input:
                 tuple val(sample_id), path(bam), path(bai)
                 path(ref)
             output: tuple val(sample_id), path("*.g.vcf.gz"), path("*.g.vcf.gz.tbi"), emit: gvcf
             script:
             """
             gatk HaplotypeCaller -R ${ref} -I ${bam} \
               -O ${sample_id}.g.vcf.gz -ERC GVCF \
               --native-pair-hmm-threads ${task.cpus}
             """
         }

         process JOINT_GENOTYPE {
             publishDir "${params.outdir}/genotyped", mode: 'copy'
             memory '32 GB'
             input:
                 path(gvcfs)
                 path(tbis)
                 path(ref)
             output: path("all_samples.vcf.gz"), emit: vcf
             script:
             def gvcf_args = gvcfs.collect { "-V ${it}" }.join(' ')
             """
             gatk CombineGVCFs -R ${ref} ${gvcf_args} -O combined.g.vcf.gz
             gatk GenotypeGVCFs -R ${ref} -V combined.g.vcf.gz \
               -O all_samples.vcf.gz
             """
         }

         process VEP_ANNOTATE {
             publishDir "${params.outdir}/annotation", mode: 'copy'
             cpus 4
             input:  path(vcf)
             output: path("annotated.vcf")
             script:
             """
             vep --input_file ${vcf} --output_file annotated.vcf \
               --format vcf --vcf --fork ${task.cpus} \
               --cache --offline --assembly GRCh38 \
               --sift b --polyphen b --symbol --canonical --af_gnomade
             """
         }

         workflow {
             reads_ch = Channel.fromFilePairs(params.reads, checkIfExists: true)
             ref_ch   = Channel.value(file(params.ref))

             FASTP(reads_ch)
             BWA_MEM2(FASTP.out.trimmed, ref_ch)
             MARKDUP(BWA_MEM2.out.bam)
             HAPLOTYPE_CALLER(MARKDUP.out.bam, ref_ch)

             gvcfs = HAPLOTYPE_CALLER.out.gvcf.map { it[1] }.collect()
             tbis  = HAPLOTYPE_CALLER.out.gvcf.map { it[2] }.collect()

             JOINT_GENOTYPE(gvcfs, tbis, ref_ch)
             VEP_ANNOTATE(JOINT_GENOTYPE.out.vcf)
         }

See Also
--------

* :doc:`snakemake` -- Snakemake workflow manager overview
* :doc:`nextflow` -- Nextflow DSL2 workflow manager overview
* :doc:`/tools/variant-calling/gatk` -- GATK variant caller reference
* :doc:`/tools/alignment/bwa-mem2` -- BWA-MEM2 aligner reference
* :doc:`/tools/variant-annotation/vep` -- Ensembl VEP annotation reference