ATAC-seq
========

Overview
--------

Assay for Transposase-Accessible Chromatin using sequencing (ATAC-seq) maps
genome-wide chromatin accessibility. Tn5 transposase inserts sequencing adapters
into open chromatin regions, and the resulting fragments are sequenced. The
analysis pipeline trims reads, aligns to the reference genome, removes
duplicates and mitochondrial reads, shifts alignments to account for the Tn5
insertion offset, calls peaks representing accessible regions, and produces
signal tracks for visualisation in a genome browser.

Pipeline Steps
--------------

1. Trim adapters and low-quality bases with fastp.
2. Align reads to the reference genome using Bowtie2.
3. Filter out mitochondrial reads, low-quality alignments, and duplicates.
4. Apply the Tn5 offset shift (+4 bp/--5 bp) to aligned reads.
5. Call peaks with MACS2 in narrow-peak mode.
6. Generate normalised signal tracks (bigWig) with deepTools.
7. Produce a fragment-size distribution plot as a QC metric.
8. Aggregate QC metrics into a MultiQC report.

Implementation
--------------

.. tabs::

   .. tab:: Snakemake

      .. code-block:: python

         # Snakefile -- ATAC-seq Peak Calling Pipeline
         configfile: "config.yaml"

         SAMPLES   = config["samples"]
         REF       = config["bowtie2_index"]
         BLACKLIST = config["blacklist"]
         GENOME    = config.get("genome_size", "hs")

         rule all:
             input:
                 expand("results/peaks/{sample}_peaks.narrowPeak",
                        sample=SAMPLES),
                 expand("results/bigwig/{sample}.normalized.bw",
                        sample=SAMPLES),
                 "results/multiqc/multiqc_report.html"

         rule fastp_trim:
             input:
                 r1="data/{sample}_R1.fastq.gz",
                 r2="data/{sample}_R2.fastq.gz"
             output:
                 r1="results/trimmed/{sample}_R1.trimmed.fastq.gz",
                 r2="results/trimmed/{sample}_R2.trimmed.fastq.gz",
                 json="results/trimmed/{sample}_fastp.json"
             threads: 4
             conda: "envs/qc.yaml"
             shell:
                 """
                 fastp -i {input.r1} -I {input.r2} \
                   -o {output.r1} -O {output.r2} \
                   -j {output.json} \
                   --detect_adapter_for_pe --thread {threads}
                 """

         rule bowtie2_align:
             input:
                 r1="results/trimmed/{sample}_R1.trimmed.fastq.gz",
                 r2="results/trimmed/{sample}_R2.trimmed.fastq.gz",
                 index=REF
             output:
                 bam="results/aligned/{sample}.sorted.bam"
             threads: 16
             conda: "envs/align.yaml"
             shell:
                 """
                 bowtie2 -x {input.index} \
                   -1 {input.r1} -2 {input.r2} \
                   --very-sensitive -X 2000 --no-mixed --no-discordant \
                   --threads {threads} \
                   | samtools sort -@ 4 -o {output.bam} -
                 samtools index {output.bam}
                 """

         rule filter_and_dedup:
             input:
                 "results/aligned/{sample}.sorted.bam"
             output:
                 bam="results/filtered/{sample}.filtered.bam",
                 metrics="results/filtered/{sample}.dup_metrics.txt"
             threads: 8
             conda: "envs/align.yaml"
             shell:
                 """
                 # Remove mitochondrial, low-quality, and unmapped reads
                 samtools view -@ {threads} -h -q 30 -f 2 -F 1804 \
                   {input} | grep -v chrM \
                   | samtools sort -@ 4 -o results/filtered/{wildcards.sample}.tmp.bam -
                 # Mark and remove duplicates
                 sambamba markdup -t {threads} --remove-duplicates \
                   results/filtered/{wildcards.sample}.tmp.bam {output.bam} \
                   2> {output.metrics}
                 samtools index {output.bam}
                 rm -f results/filtered/{wildcards.sample}.tmp.bam
                 """

         rule tn5_shift:
             input:
                 "results/filtered/{sample}.filtered.bam"
             output:
                 "results/shifted/{sample}.shifted.bam"
             threads: 4
             conda: "envs/align.yaml"
             shell:
                 """
                 alignmentSieve -b {input} -o {output} \
                   --ATACshift --numberOfProcessors {threads}
                 samtools sort -@ {threads} -o {output} {output}
                 samtools index {output}
                 """

         rule macs2_callpeak:
             input:
                 "results/shifted/{sample}.shifted.bam"
             output:
                 peaks="results/peaks/{sample}_peaks.narrowPeak",
                 xls="results/peaks/{sample}_peaks.xls"
             params:
                 genome=GENOME,
                 name="{sample}"
             conda: "envs/peaks.yaml"
             shell:
                 """
                 macs2 callpeak -t {input} \
                   -f BAMPE -g {params.genome} \
                   -n {params.name} --outdir results/peaks/ \
                   --nomodel --shift -100 --extsize 200 \
                   --keep-dup all -q 0.05
                 """

         rule bigwig:
             input:
                 "results/shifted/{sample}.shifted.bam"
             output:
                 "results/bigwig/{sample}.normalized.bw"
             threads: 8
             conda: "envs/peaks.yaml"
             shell:
                 """
                 bamCoverage -b {input} -o {output} \
                   --normalizeUsing RPGC \
                   --effectiveGenomeSize 2913022398 \
                   --binSize 10 --numberOfProcessors {threads} \
                   --blackListFileName {BLACKLIST}
                 """

         rule multiqc:
             input:
                 expand("results/trimmed/{sample}_fastp.json", sample=SAMPLES),
                 expand("results/filtered/{sample}.dup_metrics.txt",
                        sample=SAMPLES),
                 expand("results/peaks/{sample}_peaks.xls", sample=SAMPLES)
             output:
                 "results/multiqc/multiqc_report.html"
             conda: "envs/qc.yaml"
             shell:
                 "multiqc results/ -o results/multiqc/ --force"

   .. tab:: Nextflow

      .. code-block:: groovy

         #!/usr/bin/env nextflow
         nextflow.enable.dsl = 2

         params.reads       = "data/*_{R1,R2}.fastq.gz"
         params.bt2_index   = "ref/bt2_index"
         params.blacklist   = "ref/blacklist.bed"
         params.genome_size = "hs"
         params.outdir      = "results"

         process FASTP {
             tag "${sample_id}"
             publishDir "${params.outdir}/trimmed", mode: 'copy'
             cpus 4
             input:  tuple val(sample_id), path(reads)
             output:
                 tuple val(sample_id), path("*_trimmed.fastq.gz"), emit: trimmed
                 path("*.json"), emit: json
             script:
             def (r1, r2) = reads
             """
             fastp -i ${r1} -I ${r2} \
               -o ${sample_id}_R1_trimmed.fastq.gz \
               -O ${sample_id}_R2_trimmed.fastq.gz \
               -j ${sample_id}_fastp.json \
               --detect_adapter_for_pe --thread ${task.cpus}
             """
         }

         process BOWTIE2_ALIGN {
             tag "${sample_id}"
             publishDir "${params.outdir}/aligned", mode: 'copy'
             cpus 16
             input:
                 tuple val(sample_id), path(reads)
                 path(index)
             output:
                 tuple val(sample_id), path("*.sorted.bam"), path("*.sorted.bam.bai"), emit: bam
             script:
             def (r1, r2) = reads
             """
             bowtie2 -x ${index}/${index.name} \
               -1 ${r1} -2 ${r2} \
               --very-sensitive -X 2000 --no-mixed --no-discordant \
               --threads ${task.cpus} \
               | samtools sort -@ 4 -o ${sample_id}.sorted.bam -
             samtools index ${sample_id}.sorted.bam
             """
         }

         process FILTER_DEDUP {
             tag "${sample_id}"
             publishDir "${params.outdir}/filtered", mode: 'copy'
             cpus 8
             input:  tuple val(sample_id), path(bam), path(bai)
             output: tuple val(sample_id), path("*.filtered.bam"), path("*.filtered.bam.bai"), emit: bam
             script:
             """
             samtools view -@ ${task.cpus} -h -q 30 -f 2 -F 1804 \
               ${bam} | grep -v chrM \
               | samtools sort -@ 4 -o tmp.bam -
             sambamba markdup -t ${task.cpus} --remove-duplicates \
               tmp.bam ${sample_id}.filtered.bam
             samtools index ${sample_id}.filtered.bam
             rm -f tmp.bam
             """
         }

         process TN5_SHIFT {
             tag "${sample_id}"
             publishDir "${params.outdir}/shifted", mode: 'copy'
             cpus 4
             input:  tuple val(sample_id), path(bam), path(bai)
             output: tuple val(sample_id), path("*.shifted.bam"), path("*.shifted.bam.bai"), emit: bam
             script:
             """
             alignmentSieve -b ${bam} -o ${sample_id}.shifted.bam \
               --ATACshift --numberOfProcessors ${task.cpus}
             samtools sort -@ ${task.cpus} -o ${sample_id}.shifted.sorted.bam \
               ${sample_id}.shifted.bam
             mv ${sample_id}.shifted.sorted.bam ${sample_id}.shifted.bam
             samtools index ${sample_id}.shifted.bam
             """
         }

         process MACS2_CALLPEAK {
             tag "${sample_id}"
             publishDir "${params.outdir}/peaks", mode: 'copy'
             input:  tuple val(sample_id), path(bam), path(bai)
             output: path("${sample_id}_peaks.narrowPeak"), emit: peaks
             script:
             """
             macs2 callpeak -t ${bam} \
               -f BAMPE -g ${params.genome_size} \
               -n ${sample_id} --outdir . \
               --nomodel --shift -100 --extsize 200 \
               --keep-dup all -q 0.05
             """
         }

         process BIGWIG {
             tag "${sample_id}"
             publishDir "${params.outdir}/bigwig", mode: 'copy'
             cpus 8
             input:
                 tuple val(sample_id), path(bam), path(bai)
                 path(blacklist)
             output: path("*.normalized.bw"), emit: bw
             script:
             """
             bamCoverage -b ${bam} -o ${sample_id}.normalized.bw \
               --normalizeUsing RPGC \
               --effectiveGenomeSize 2913022398 \
               --binSize 10 --numberOfProcessors ${task.cpus} \
               --blackListFileName ${blacklist}
             """
         }

         process MULTIQC {
             publishDir "${params.outdir}/multiqc", mode: 'copy'
             input:  path('*')
             output: path("multiqc_report.html")
             script:
             """
             multiqc . -o . --force
             """
         }

         workflow {
             reads_ch     = Channel.fromFilePairs(params.reads, checkIfExists: true)
             bt2_index_ch = Channel.value(file(params.bt2_index))
             blacklist_ch = Channel.value(file(params.blacklist))

             FASTP(reads_ch)
             BOWTIE2_ALIGN(FASTP.out.trimmed, bt2_index_ch)
             FILTER_DEDUP(BOWTIE2_ALIGN.out.bam)
             TN5_SHIFT(FILTER_DEDUP.out.bam)
             MACS2_CALLPEAK(TN5_SHIFT.out.bam)
             BIGWIG(TN5_SHIFT.out.bam, blacklist_ch)

             ch_multiqc = FASTP.out.json.collect()
             MULTIQC(ch_multiqc)
         }

See Also
--------

* :doc:`snakemake` -- Snakemake workflow manager overview
* :doc:`nextflow` -- Nextflow DSL2 workflow manager overview
* :doc:`chipseq` -- ChIP-seq workflow (related chromatin profiling assay)
* :doc:`/tools/alignment/bowtie2` -- Bowtie2 aligner reference
* :doc:`/tools/epigenomics/index` -- Epigenomics tools