WGS Variant Calling
Overview
Whole-genome sequencing (WGS) variant calling identifies single-nucleotide variants (SNVs), insertions, and deletions (indels) from short-read sequencing data. The workflow follows the GATK Best Practices for germline variant discovery: reads are trimmed, aligned to a reference genome with BWA-MEM2, deduplicated, and passed through GATK HaplotypeCaller in GVCF mode. Per-sample GVCFs are combined and jointly genotyped, followed by variant quality score recalibration (VQSR) and functional annotation with Ensembl VEP.
Pipeline Steps
Trim adapters and low-quality bases with fastp.
Align reads to the reference genome using BWA-MEM2.
Sort and index alignments with samtools.
Mark PCR duplicates with sambamba.
Call variants per sample in GVCF mode using GATK HaplotypeCaller.
Combine per-sample GVCFs with GATK CombineGVCFs.
Jointly genotype all samples with GATK GenotypeGVCFs.
Apply variant quality score recalibration (VQSR) for SNPs.
Annotate variants with Ensembl VEP.
Aggregate QC metrics into a MultiQC report.
Implementation
# Snakefile -- WGS Germline Variant Calling (GATK Best Practices)
configfile: "config.yaml"
SAMPLES = config["samples"]
REF = config["reference"]
KNOWN = config["known_sites"]
rule all:
input:
"results/annotation/all_samples.annotated.vcf",
"results/multiqc/multiqc_report.html"
rule fastp_trim:
input:
r1="data/{sample}_R1.fastq.gz",
r2="data/{sample}_R2.fastq.gz"
output:
r1="results/trimmed/{sample}_R1.trimmed.fastq.gz",
r2="results/trimmed/{sample}_R2.trimmed.fastq.gz",
json="results/trimmed/{sample}_fastp.json",
html="results/trimmed/{sample}_fastp.html"
threads: 4
conda: "envs/qc.yaml"
shell:
"""
fastp -i {input.r1} -I {input.r2} \
-o {output.r1} -O {output.r2} \
-j {output.json} -h {output.html} \
--detect_adapter_for_pe --thread {threads}
"""
rule bwa_mem2_align:
input:
r1="results/trimmed/{sample}_R1.trimmed.fastq.gz",
r2="results/trimmed/{sample}_R2.trimmed.fastq.gz",
ref=REF
output:
bam="results/aligned/{sample}.sorted.bam"
threads: 16
params:
rg=lambda wc: f"@RG\\tID:{wc.sample}\\tSM:{wc.sample}\\tPL:ILLUMINA"
conda: "envs/align.yaml"
shell:
"""
bwa-mem2 mem -t {threads} -R '{params.rg}' {input.ref} \
{input.r1} {input.r2} \
| samtools sort -@ 4 -o {output.bam} -
samtools index {output.bam}
"""
rule mark_duplicates:
input:
"results/aligned/{sample}.sorted.bam"
output:
bam="results/dedup/{sample}.dedup.bam",
metrics="results/dedup/{sample}.dup_metrics.txt"
threads: 8
conda: "envs/align.yaml"
shell:
"""
sambamba markdup -t {threads} \
{input} {output.bam} 2> {output.metrics}
"""
rule haplotype_caller:
input:
bam="results/dedup/{sample}.dedup.bam",
ref=REF
output:
gvcf="results/gvcf/{sample}.g.vcf.gz"
threads: 4
conda: "envs/gatk.yaml"
shell:
"""
gatk HaplotypeCaller \
-R {input.ref} -I {input.bam} \
-O {output.gvcf} -ERC GVCF \
--native-pair-hmm-threads {threads}
"""
rule combine_gvcfs:
input:
gvcfs=expand("results/gvcf/{sample}.g.vcf.gz",
sample=SAMPLES),
ref=REF
output:
"results/genotyped/combined.g.vcf.gz"
params:
gvcf_args=lambda wc, input: " ".join(
[f"-V {g}" for g in input.gvcfs])
conda: "envs/gatk.yaml"
shell:
"""
gatk CombineGVCFs -R {input.ref} {params.gvcf_args} \
-O {output}
"""
rule genotype_gvcfs:
input:
gvcf="results/genotyped/combined.g.vcf.gz",
ref=REF
output:
"results/genotyped/all_samples.vcf.gz"
conda: "envs/gatk.yaml"
shell:
"""
gatk GenotypeGVCFs -R {input.ref} -V {input.gvcf} \
-O {output}
"""
rule vqsr_snps:
input:
vcf="results/genotyped/all_samples.vcf.gz",
ref=REF
output:
recal="results/vqsr/snps.recal",
tranches="results/vqsr/snps.tranches",
vcf="results/vqsr/snps_recalibrated.vcf.gz"
params:
hapmap=config["hapmap"],
omni=config["omni"],
g1k=config["g1k_snps"],
dbsnp=config["dbsnp"]
conda: "envs/gatk.yaml"
shell:
"""
gatk VariantRecalibrator -R {input.ref} -V {input.vcf} \
--resource:hapmap,known=false,training=true,truth=true,prior=15.0 {params.hapmap} \
--resource:omni,known=false,training=true,truth=false,prior=12.0 {params.omni} \
--resource:1000G,known=false,training=true,truth=false,prior=10.0 {params.g1k} \
--resource:dbsnp,known=true,training=false,truth=false,prior=2.0 {params.dbsnp} \
-an QD -an MQ -an MQRankSum -an ReadPosRankSum -an FS -an SOR \
-mode SNP -O {output.recal} --tranches-file {output.tranches}
gatk ApplyVQSR -R {input.ref} -V {input.vcf} \
--recal-file {output.recal} --tranches-file {output.tranches} \
--truth-sensitivity-filter-level 99.5 -mode SNP \
-O {output.vcf}
"""
rule vep_annotate:
input:
"results/vqsr/snps_recalibrated.vcf.gz"
output:
"results/annotation/all_samples.annotated.vcf"
threads: 4
conda: "envs/annotation.yaml"
shell:
"""
vep --input_file {input} --output_file {output} \
--format vcf --vcf --fork {threads} \
--cache --offline --assembly GRCh38 \
--sift b --polyphen b --symbol --numbers \
--canonical --biotype --af --af_1kg --af_gnomade
"""
rule multiqc:
input:
expand("results/trimmed/{sample}_fastp.json", sample=SAMPLES),
expand("results/dedup/{sample}.dup_metrics.txt", sample=SAMPLES)
output:
"results/multiqc/multiqc_report.html"
conda: "envs/qc.yaml"
shell:
"multiqc results/ -o results/multiqc/ --force"
#!/usr/bin/env nextflow
nextflow.enable.dsl = 2
params.reads = "data/*_{R1,R2}.fastq.gz"
params.ref = "ref/GRCh38.fa"
params.outdir = "results"
params.dbsnp = "ref/dbsnp_156.vcf.gz"
process FASTP {
tag "${sample_id}"
publishDir "${params.outdir}/trimmed", mode: 'copy'
cpus 4
input: tuple val(sample_id), path(reads)
output:
tuple val(sample_id), path("*_trimmed.fastq.gz"), emit: trimmed
path("*.json"), emit: json
script:
def (r1, r2) = reads
"""
fastp -i ${r1} -I ${r2} \
-o ${sample_id}_R1_trimmed.fastq.gz \
-O ${sample_id}_R2_trimmed.fastq.gz \
-j ${sample_id}_fastp.json \
--detect_adapter_for_pe --thread ${task.cpus}
"""
}
process BWA_MEM2 {
tag "${sample_id}"
publishDir "${params.outdir}/aligned", mode: 'copy'
cpus 16
memory '32 GB'
input:
tuple val(sample_id), path(reads)
path(ref)
output:
tuple val(sample_id), path("*.sorted.bam"), path("*.sorted.bam.bai"), emit: bam
script:
def (r1, r2) = reads
"""
bwa-mem2 mem -t ${task.cpus} \
-R "@RG\\tID:${sample_id}\\tSM:${sample_id}\\tPL:ILLUMINA" \
${ref} ${r1} ${r2} \
| samtools sort -@ 4 -o ${sample_id}.sorted.bam -
samtools index ${sample_id}.sorted.bam
"""
}
process MARKDUP {
tag "${sample_id}"
publishDir "${params.outdir}/dedup", mode: 'copy'
cpus 8
input: tuple val(sample_id), path(bam), path(bai)
output: tuple val(sample_id), path("*.dedup.bam"), path("*.dedup.bam.bai"), emit: bam
script:
"""
sambamba markdup -t ${task.cpus} ${bam} ${sample_id}.dedup.bam
samtools index ${sample_id}.dedup.bam
"""
}
process HAPLOTYPE_CALLER {
tag "${sample_id}"
publishDir "${params.outdir}/gvcf", mode: 'copy'
cpus 4
memory '16 GB'
input:
tuple val(sample_id), path(bam), path(bai)
path(ref)
output: tuple val(sample_id), path("*.g.vcf.gz"), path("*.g.vcf.gz.tbi"), emit: gvcf
script:
"""
gatk HaplotypeCaller -R ${ref} -I ${bam} \
-O ${sample_id}.g.vcf.gz -ERC GVCF \
--native-pair-hmm-threads ${task.cpus}
"""
}
process JOINT_GENOTYPE {
publishDir "${params.outdir}/genotyped", mode: 'copy'
memory '32 GB'
input:
path(gvcfs)
path(tbis)
path(ref)
output: path("all_samples.vcf.gz"), emit: vcf
script:
def gvcf_args = gvcfs.collect { "-V ${it}" }.join(' ')
"""
gatk CombineGVCFs -R ${ref} ${gvcf_args} -O combined.g.vcf.gz
gatk GenotypeGVCFs -R ${ref} -V combined.g.vcf.gz \
-O all_samples.vcf.gz
"""
}
process VEP_ANNOTATE {
publishDir "${params.outdir}/annotation", mode: 'copy'
cpus 4
input: path(vcf)
output: path("annotated.vcf")
script:
"""
vep --input_file ${vcf} --output_file annotated.vcf \
--format vcf --vcf --fork ${task.cpus} \
--cache --offline --assembly GRCh38 \
--sift b --polyphen b --symbol --canonical --af_gnomade
"""
}
workflow {
reads_ch = Channel.fromFilePairs(params.reads, checkIfExists: true)
ref_ch = Channel.value(file(params.ref))
FASTP(reads_ch)
BWA_MEM2(FASTP.out.trimmed, ref_ch)
MARKDUP(BWA_MEM2.out.bam)
HAPLOTYPE_CALLER(MARKDUP.out.bam, ref_ch)
gvcfs = HAPLOTYPE_CALLER.out.gvcf.map { it[1] }.collect()
tbis = HAPLOTYPE_CALLER.out.gvcf.map { it[2] }.collect()
JOINT_GENOTYPE(gvcfs, tbis, ref_ch)
VEP_ANNOTATE(JOINT_GENOTYPE.out.vcf)
}
See Also
Snakemake – Snakemake workflow manager overview
Nextflow – Nextflow DSL2 workflow manager overview
GATK – GATK variant caller reference
BWA-MEM2 – BWA-MEM2 aligner reference
VEP (Variant Effect Predictor) – Ensembl VEP annotation reference