ChIP-seq
========

Overview
--------

Chromatin Immunoprecipitation followed by sequencing (ChIP-seq) identifies
genome-wide binding sites of transcription factors, histone modifications, and
other chromatin-associated proteins. After cross-linking, fragmentation, and
immunoprecipitation with a target-specific antibody, enriched DNA fragments are
sequenced. The analysis pipeline trims reads, aligns them to the reference
genome, removes duplicates, calls peaks relative to an input control using
MACS2, and generates normalised signal tracks for genome browser visualisation.

Pipeline Steps
--------------

1. Trim adapters and low-quality bases with fastp.
2. Align reads to the reference genome using Bowtie2.
3. Filter low-quality alignments and remove PCR duplicates.
4. Call peaks with MACS2 using matched input controls.
5. Filter peaks against a blacklist of problematic regions.
6. Generate normalised signal tracks (bigWig) with deepTools.
7. Compute quality metrics including FRiP (Fraction of Reads in Peaks).
8. Aggregate QC metrics into a MultiQC report.

Implementation
--------------

.. tabs::

   .. tab:: Snakemake

      .. code-block:: python

         # Snakefile -- ChIP-seq Peak Calling Pipeline
         configfile: "config.yaml"

         SAMPLES   = config["samples"]     # ChIP samples
         INPUTS    = config["inputs"]      # input controls per sample
         REF       = config["bowtie2_index"]
         BLACKLIST = config["blacklist"]
         GENOME    = config.get("genome_size", "hs")

         rule all:
             input:
                 expand("results/peaks/{sample}_peaks.narrowPeak",
                        sample=SAMPLES),
                 expand("results/bigwig/{sample}.normalized.bw",
                        sample=SAMPLES),
                 "results/multiqc/multiqc_report.html"

         rule fastp_trim:
             input:
                 r1="data/{sample}_R1.fastq.gz",
                 r2="data/{sample}_R2.fastq.gz"
             output:
                 r1="results/trimmed/{sample}_R1.trimmed.fastq.gz",
                 r2="results/trimmed/{sample}_R2.trimmed.fastq.gz",
                 json="results/trimmed/{sample}_fastp.json"
             threads: 4
             conda: "envs/qc.yaml"
             shell:
                 """
                 fastp -i {input.r1} -I {input.r2} \
                   -o {output.r1} -O {output.r2} \
                   -j {output.json} \
                   --detect_adapter_for_pe --thread {threads}
                 """

         rule bowtie2_align:
             input:
                 r1="results/trimmed/{sample}_R1.trimmed.fastq.gz",
                 r2="results/trimmed/{sample}_R2.trimmed.fastq.gz",
                 index=REF
             output:
                 bam="results/aligned/{sample}.sorted.bam"
             threads: 16
             conda: "envs/align.yaml"
             shell:
                 """
                 bowtie2 -x {input.index} \
                   -1 {input.r1} -2 {input.r2} \
                   --very-sensitive -X 1000 \
                   --threads {threads} \
                   | samtools sort -@ 4 -o {output.bam} -
                 samtools index {output.bam}
                 """

         rule filter_and_dedup:
             input:
                 "results/aligned/{sample}.sorted.bam"
             output:
                 bam="results/filtered/{sample}.filtered.bam",
                 metrics="results/filtered/{sample}.dup_metrics.txt"
             threads: 8
             conda: "envs/align.yaml"
             shell:
                 """
                 samtools view -@ {threads} -h -q 30 -f 2 -F 1804 \
                   {input} \
                   | samtools sort -@ 4 -o results/filtered/{wildcards.sample}.tmp.bam -
                 sambamba markdup -t {threads} --remove-duplicates \
                   results/filtered/{wildcards.sample}.tmp.bam {output.bam} \
                   2> {output.metrics}
                 samtools index {output.bam}
                 rm -f results/filtered/{wildcards.sample}.tmp.bam
                 """

         rule macs2_callpeak:
             input:
                 treatment="results/filtered/{sample}.filtered.bam",
                 control=lambda wc: f"results/filtered/{INPUTS[wc.sample]}.filtered.bam"
             output:
                 peaks="results/peaks/{sample}_peaks.narrowPeak",
                 xls="results/peaks/{sample}_peaks.xls"
             params:
                 genome=GENOME,
                 name="{sample}"
             conda: "envs/peaks.yaml"
             shell:
                 """
                 macs2 callpeak \
                   -t {input.treatment} -c {input.control} \
                   -f BAMPE -g {params.genome} \
                   -n {params.name} --outdir results/peaks/ \
                   -q 0.05
                 """

         rule filter_blacklist:
             input:
                 peaks="results/peaks/{sample}_peaks.narrowPeak",
                 blacklist=BLACKLIST
             output:
                 "results/peaks/{sample}_peaks.filtered.narrowPeak"
             conda: "envs/peaks.yaml"
             shell:
                 """
                 bedtools intersect -v \
                   -a {input.peaks} -b {input.blacklist} \
                   > {output}
                 """

         rule bigwig:
             input:
                 "results/filtered/{sample}.filtered.bam"
             output:
                 "results/bigwig/{sample}.normalized.bw"
             threads: 8
             conda: "envs/peaks.yaml"
             shell:
                 """
                 bamCoverage -b {input} -o {output} \
                   --normalizeUsing RPGC \
                   --effectiveGenomeSize 2913022398 \
                   --binSize 10 --numberOfProcessors {threads} \
                   --extendReads 200
                 """

         rule frip_score:
             input:
                 bam="results/filtered/{sample}.filtered.bam",
                 peaks="results/peaks/{sample}_peaks.filtered.narrowPeak"
             output:
                 "results/qc/{sample}_frip.txt"
             conda: "envs/peaks.yaml"
             shell:
                 """
                 total=$(samtools view -c {input.bam})
                 in_peaks=$(bedtools intersect -a {input.bam} -b {input.peaks} \
                   -u -ubam | samtools view -c)
                 echo -e "{wildcards.sample}\t${{in_peaks}}\t${{total}}" > {output}
                 """

         rule multiqc:
             input:
                 expand("results/trimmed/{sample}_fastp.json",
                        sample=list(SAMPLES) + list(INPUTS.values())),
                 expand("results/filtered/{sample}.dup_metrics.txt",
                        sample=list(SAMPLES) + list(INPUTS.values())),
                 expand("results/peaks/{sample}_peaks.xls", sample=SAMPLES)
             output:
                 "results/multiqc/multiqc_report.html"
             conda: "envs/qc.yaml"
             shell:
                 "multiqc results/ -o results/multiqc/ --force"

   .. tab:: Nextflow

      .. code-block:: groovy

         #!/usr/bin/env nextflow
         nextflow.enable.dsl = 2

         params.reads       = "data/*_{R1,R2}.fastq.gz"
         params.bt2_index   = "ref/bt2_index"
         params.blacklist   = "ref/blacklist.bed"
         params.genome_size = "hs"
         params.outdir      = "results"
         params.input_csv   = "samplesheet.csv"  // sample_id,fastq_1,fastq_2,control_id

         process FASTP {
             tag "${sample_id}"
             publishDir "${params.outdir}/trimmed", mode: 'copy'
             cpus 4
             input:  tuple val(sample_id), path(reads)
             output:
                 tuple val(sample_id), path("*_trimmed.fastq.gz"), emit: trimmed
                 path("*.json"), emit: json
             script:
             def (r1, r2) = reads
             """
             fastp -i ${r1} -I ${r2} \
               -o ${sample_id}_R1_trimmed.fastq.gz \
               -O ${sample_id}_R2_trimmed.fastq.gz \
               -j ${sample_id}_fastp.json \
               --detect_adapter_for_pe --thread ${task.cpus}
             """
         }

         process BOWTIE2_ALIGN {
             tag "${sample_id}"
             publishDir "${params.outdir}/aligned", mode: 'copy'
             cpus 16
             input:
                 tuple val(sample_id), path(reads)
                 path(index)
             output:
                 tuple val(sample_id), path("*.sorted.bam"), path("*.sorted.bam.bai"), emit: bam
             script:
             def (r1, r2) = reads
             """
             bowtie2 -x ${index}/${index.name} \
               -1 ${r1} -2 ${r2} \
               --very-sensitive -X 1000 \
               --threads ${task.cpus} \
               | samtools sort -@ 4 -o ${sample_id}.sorted.bam -
             samtools index ${sample_id}.sorted.bam
             """
         }

         process FILTER_DEDUP {
             tag "${sample_id}"
             publishDir "${params.outdir}/filtered", mode: 'copy'
             cpus 8
             input:  tuple val(sample_id), path(bam), path(bai)
             output: tuple val(sample_id), path("*.filtered.bam"), path("*.filtered.bam.bai"), emit: bam
             script:
             """
             samtools view -@ ${task.cpus} -h -q 30 -f 2 -F 1804 \
               ${bam} \
               | samtools sort -@ 4 -o tmp.bam -
             sambamba markdup -t ${task.cpus} --remove-duplicates \
               tmp.bam ${sample_id}.filtered.bam
             samtools index ${sample_id}.filtered.bam
             rm -f tmp.bam
             """
         }

         process MACS2_CALLPEAK {
             tag "${sample_id}"
             publishDir "${params.outdir}/peaks", mode: 'copy'
             input:
                 tuple val(sample_id), path(treatment_bam), path(treatment_bai)
                 tuple val(control_id), path(control_bam), path(control_bai)
             output:
                 tuple val(sample_id), path("${sample_id}_peaks.narrowPeak"), emit: peaks
             script:
             """
             macs2 callpeak \
               -t ${treatment_bam} -c ${control_bam} \
               -f BAMPE -g ${params.genome_size} \
               -n ${sample_id} --outdir . \
               -q 0.05
             """
         }

         process FILTER_BLACKLIST {
             tag "${sample_id}"
             publishDir "${params.outdir}/peaks", mode: 'copy'
             input:
                 tuple val(sample_id), path(peaks)
                 path(blacklist)
             output:
                 tuple val(sample_id), path("*_filtered.narrowPeak"), emit: peaks
             script:
             """
             bedtools intersect -v \
               -a ${peaks} -b ${blacklist} \
               > ${sample_id}_filtered.narrowPeak
             """
         }

         process BIGWIG {
             tag "${sample_id}"
             publishDir "${params.outdir}/bigwig", mode: 'copy'
             cpus 8
             input:  tuple val(sample_id), path(bam), path(bai)
             output: path("*.normalized.bw"), emit: bw
             script:
             """
             bamCoverage -b ${bam} -o ${sample_id}.normalized.bw \
               --normalizeUsing RPGC \
               --effectiveGenomeSize 2913022398 \
               --binSize 10 --numberOfProcessors ${task.cpus} \
               --extendReads 200
             """
         }

         process MULTIQC {
             publishDir "${params.outdir}/multiqc", mode: 'copy'
             input:  path('*')
             output: path("multiqc_report.html")
             script:
             """
             multiqc . -o . --force
             """
         }

         workflow {
             reads_ch     = Channel.fromFilePairs(params.reads, checkIfExists: true)
             bt2_index_ch = Channel.value(file(params.bt2_index))
             blacklist_ch = Channel.value(file(params.blacklist))

             // Parse samplesheet to get treatment-control pairs
             samplesheet = Channel.fromPath(params.input_csv)
                 .splitCsv(header: true)
                 .map { row -> [row.sample_id, row.control_id] }

             FASTP(reads_ch)
             BOWTIE2_ALIGN(FASTP.out.trimmed, bt2_index_ch)
             FILTER_DEDUP(BOWTIE2_ALIGN.out.bam)

             // Pair treatment with control BAMs
             treatment_ch = FILTER_DEDUP.out.bam
                 .join(samplesheet)
             control_bams = FILTER_DEDUP.out.bam
             paired_ch = treatment_ch
                 .map { sample_id, bam, bai, control_id -> [control_id, sample_id, bam, bai] }
                 .combine(control_bams, by: 0)
                 .map { control_id, sample_id, treat_bam, treat_bai, ctrl_bam, ctrl_bai ->
                     [[sample_id, treat_bam, treat_bai], [control_id, ctrl_bam, ctrl_bai]]
                 }

             MACS2_CALLPEAK(
                 paired_ch.map { it[0] },
                 paired_ch.map { it[1] }
             )
             FILTER_BLACKLIST(MACS2_CALLPEAK.out.peaks, blacklist_ch)
             BIGWIG(FILTER_DEDUP.out.bam)

             ch_multiqc = FASTP.out.json.collect()
             MULTIQC(ch_multiqc)
         }

See Also
--------

* :doc:`snakemake` -- Snakemake workflow manager overview
* :doc:`nextflow` -- Nextflow DSL2 workflow manager overview
* :doc:`atacseq` -- ATAC-seq workflow (related chromatin profiling assay)
* :doc:`/tools/alignment/bowtie2` -- Bowtie2 aligner reference
* :doc:`/tools/epigenomics/index` -- Epigenomics tools