WGS Variant Calling

Overview

Whole-genome sequencing (WGS) variant calling identifies single-nucleotide variants (SNVs), insertions, and deletions (indels) from short-read sequencing data. The workflow follows the GATK Best Practices for germline variant discovery: reads are trimmed, aligned to a reference genome with BWA-MEM2, deduplicated, and passed through GATK HaplotypeCaller in GVCF mode. Per-sample GVCFs are combined and jointly genotyped, followed by variant quality score recalibration (VQSR) and functional annotation with Ensembl VEP.

Pipeline Steps

Trim adapters and low-quality bases with fastp.
Align reads to the reference genome using BWA-MEM2.
Sort and index alignments with samtools.
Mark PCR duplicates with sambamba.
Call variants per sample in GVCF mode using GATK HaplotypeCaller.
Combine per-sample GVCFs with GATK CombineGVCFs.
Jointly genotype all samples with GATK GenotypeGVCFs.
Apply variant quality score recalibration (VQSR) for SNPs.
Annotate variants with Ensembl VEP.
Aggregate QC metrics into a MultiQC report.

Implementation

# Snakefile -- WGS Germline Variant Calling (GATK Best Practices)
configfile: "config.yaml"

SAMPLES = config["samples"]
REF     = config["reference"]
KNOWN   = config["known_sites"]

rule all:
    input:
        "results/annotation/all_samples.annotated.vcf",
        "results/multiqc/multiqc_report.html"

rule fastp_trim:
    input:
        r1="data/{sample}_R1.fastq.gz",
        r2="data/{sample}_R2.fastq.gz"
    output:
        r1="results/trimmed/{sample}_R1.trimmed.fastq.gz",
        r2="results/trimmed/{sample}_R2.trimmed.fastq.gz",
        json="results/trimmed/{sample}_fastp.json",
        html="results/trimmed/{sample}_fastp.html"
    threads: 4
    conda: "envs/qc.yaml"
    shell:
        """
        fastp -i {input.r1} -I {input.r2} \
          -o {output.r1} -O {output.r2} \
          -j {output.json} -h {output.html} \
          --detect_adapter_for_pe --thread {threads}
        """

rule bwa_mem2_align:
    input:
        r1="results/trimmed/{sample}_R1.trimmed.fastq.gz",
        r2="results/trimmed/{sample}_R2.trimmed.fastq.gz",
        ref=REF
    output:
        bam="results/aligned/{sample}.sorted.bam"
    threads: 16
    params:
        rg=lambda wc: f"@RG\\tID:{wc.sample}\\tSM:{wc.sample}\\tPL:ILLUMINA"
    conda: "envs/align.yaml"
    shell:
        """
        bwa-mem2 mem -t {threads} -R '{params.rg}' {input.ref} \
          {input.r1} {input.r2} \
          | samtools sort -@ 4 -o {output.bam} -
        samtools index {output.bam}
        """

rule mark_duplicates:
    input:
        "results/aligned/{sample}.sorted.bam"
    output:
        bam="results/dedup/{sample}.dedup.bam",
        metrics="results/dedup/{sample}.dup_metrics.txt"
    threads: 8
    conda: "envs/align.yaml"
    shell:
        """
        sambamba markdup -t {threads} \
          {input} {output.bam} 2> {output.metrics}
        """

rule haplotype_caller:
    input:
        bam="results/dedup/{sample}.dedup.bam",
        ref=REF
    output:
        gvcf="results/gvcf/{sample}.g.vcf.gz"
    threads: 4
    conda: "envs/gatk.yaml"
    shell:
        """
        gatk HaplotypeCaller \
          -R {input.ref} -I {input.bam} \
          -O {output.gvcf} -ERC GVCF \
          --native-pair-hmm-threads {threads}
        """

rule combine_gvcfs:
    input:
        gvcfs=expand("results/gvcf/{sample}.g.vcf.gz",
                      sample=SAMPLES),
        ref=REF
    output:
        "results/genotyped/combined.g.vcf.gz"
    params:
        gvcf_args=lambda wc, input: " ".join(
            [f"-V {g}" for g in input.gvcfs])
    conda: "envs/gatk.yaml"
    shell:
        """
        gatk CombineGVCFs -R {input.ref} {params.gvcf_args} \
          -O {output}
        """

rule genotype_gvcfs:
    input:
        gvcf="results/genotyped/combined.g.vcf.gz",
        ref=REF
    output:
        "results/genotyped/all_samples.vcf.gz"
    conda: "envs/gatk.yaml"
    shell:
        """
        gatk GenotypeGVCFs -R {input.ref} -V {input.gvcf} \
          -O {output}
        """

rule vqsr_snps:
    input:
        vcf="results/genotyped/all_samples.vcf.gz",
        ref=REF
    output:
        recal="results/vqsr/snps.recal",
        tranches="results/vqsr/snps.tranches",
        vcf="results/vqsr/snps_recalibrated.vcf.gz"
    params:
        hapmap=config["hapmap"],
        omni=config["omni"],
        g1k=config["g1k_snps"],
        dbsnp=config["dbsnp"]
    conda: "envs/gatk.yaml"
    shell:
        """
        gatk VariantRecalibrator -R {input.ref} -V {input.vcf} \
          --resource:hapmap,known=false,training=true,truth=true,prior=15.0 {params.hapmap} \
          --resource:omni,known=false,training=true,truth=false,prior=12.0 {params.omni} \
          --resource:1000G,known=false,training=true,truth=false,prior=10.0 {params.g1k} \
          --resource:dbsnp,known=true,training=false,truth=false,prior=2.0 {params.dbsnp} \
          -an QD -an MQ -an MQRankSum -an ReadPosRankSum -an FS -an SOR \
          -mode SNP -O {output.recal} --tranches-file {output.tranches}

        gatk ApplyVQSR -R {input.ref} -V {input.vcf} \
          --recal-file {output.recal} --tranches-file {output.tranches} \
          --truth-sensitivity-filter-level 99.5 -mode SNP \
          -O {output.vcf}
        """

rule vep_annotate:
    input:
        "results/vqsr/snps_recalibrated.vcf.gz"
    output:
        "results/annotation/all_samples.annotated.vcf"
    threads: 4
    conda: "envs/annotation.yaml"
    shell:
        """
        vep --input_file {input} --output_file {output} \
          --format vcf --vcf --fork {threads} \
          --cache --offline --assembly GRCh38 \
          --sift b --polyphen b --symbol --numbers \
          --canonical --biotype --af --af_1kg --af_gnomade
        """

rule multiqc:
    input:
        expand("results/trimmed/{sample}_fastp.json", sample=SAMPLES),
        expand("results/dedup/{sample}.dup_metrics.txt", sample=SAMPLES)
    output:
        "results/multiqc/multiqc_report.html"
    conda: "envs/qc.yaml"
    shell:
        "multiqc results/ -o results/multiqc/ --force"

#!/usr/bin/env nextflow
nextflow.enable.dsl = 2

params.reads     = "data/*_{R1,R2}.fastq.gz"
params.ref       = "ref/GRCh38.fa"
params.outdir    = "results"
params.dbsnp     = "ref/dbsnp_156.vcf.gz"

process FASTP {
    tag "${sample_id}"
    publishDir "${params.outdir}/trimmed", mode: 'copy'
    cpus 4
    input:  tuple val(sample_id), path(reads)
    output:
        tuple val(sample_id), path("*_trimmed.fastq.gz"), emit: trimmed
        path("*.json"), emit: json
    script:
    def (r1, r2) = reads
    """
    fastp -i ${r1} -I ${r2} \
      -o ${sample_id}_R1_trimmed.fastq.gz \
      -O ${sample_id}_R2_trimmed.fastq.gz \
      -j ${sample_id}_fastp.json \
      --detect_adapter_for_pe --thread ${task.cpus}
    """
}

process BWA_MEM2 {
    tag "${sample_id}"
    publishDir "${params.outdir}/aligned", mode: 'copy'
    cpus 16
    memory '32 GB'
    input:
        tuple val(sample_id), path(reads)
        path(ref)
    output:
        tuple val(sample_id), path("*.sorted.bam"), path("*.sorted.bam.bai"), emit: bam
    script:
    def (r1, r2) = reads
    """
    bwa-mem2 mem -t ${task.cpus} \
      -R "@RG\\tID:${sample_id}\\tSM:${sample_id}\\tPL:ILLUMINA" \
      ${ref} ${r1} ${r2} \
      | samtools sort -@ 4 -o ${sample_id}.sorted.bam -
    samtools index ${sample_id}.sorted.bam
    """
}

process MARKDUP {
    tag "${sample_id}"
    publishDir "${params.outdir}/dedup", mode: 'copy'
    cpus 8
    input:  tuple val(sample_id), path(bam), path(bai)
    output: tuple val(sample_id), path("*.dedup.bam"), path("*.dedup.bam.bai"), emit: bam
    script:
    """
    sambamba markdup -t ${task.cpus} ${bam} ${sample_id}.dedup.bam
    samtools index ${sample_id}.dedup.bam
    """
}

process HAPLOTYPE_CALLER {
    tag "${sample_id}"
    publishDir "${params.outdir}/gvcf", mode: 'copy'
    cpus 4
    memory '16 GB'
    input:
        tuple val(sample_id), path(bam), path(bai)
        path(ref)
    output: tuple val(sample_id), path("*.g.vcf.gz"), path("*.g.vcf.gz.tbi"), emit: gvcf
    script:
    """
    gatk HaplotypeCaller -R ${ref} -I ${bam} \
      -O ${sample_id}.g.vcf.gz -ERC GVCF \
      --native-pair-hmm-threads ${task.cpus}
    """
}

process JOINT_GENOTYPE {
    publishDir "${params.outdir}/genotyped", mode: 'copy'
    memory '32 GB'
    input:
        path(gvcfs)
        path(tbis)
        path(ref)
    output: path("all_samples.vcf.gz"), emit: vcf
    script:
    def gvcf_args = gvcfs.collect { "-V ${it}" }.join(' ')
    """
    gatk CombineGVCFs -R ${ref} ${gvcf_args} -O combined.g.vcf.gz
    gatk GenotypeGVCFs -R ${ref} -V combined.g.vcf.gz \
      -O all_samples.vcf.gz
    """
}

process VEP_ANNOTATE {
    publishDir "${params.outdir}/annotation", mode: 'copy'
    cpus 4
    input:  path(vcf)
    output: path("annotated.vcf")
    script:
    """
    vep --input_file ${vcf} --output_file annotated.vcf \
      --format vcf --vcf --fork ${task.cpus} \
      --cache --offline --assembly GRCh38 \
      --sift b --polyphen b --symbol --canonical --af_gnomade
    """
}

workflow {
    reads_ch = Channel.fromFilePairs(params.reads, checkIfExists: true)
    ref_ch   = Channel.value(file(params.ref))

    FASTP(reads_ch)
    BWA_MEM2(FASTP.out.trimmed, ref_ch)
    MARKDUP(BWA_MEM2.out.bam)
    HAPLOTYPE_CALLER(MARKDUP.out.bam, ref_ch)

    gvcfs = HAPLOTYPE_CALLER.out.gvcf.map { it[1] }.collect()
    tbis  = HAPLOTYPE_CALLER.out.gvcf.map { it[2] }.collect()

    JOINT_GENOTYPE(gvcfs, tbis, ref_ch)
    VEP_ANNOTATE(JOINT_GENOTYPE.out.vcf)
}

WGS Variant Calling

Overview

Pipeline Steps

Implementation

See Also