ChIP-seq
Overview
Chromatin Immunoprecipitation followed by sequencing (ChIP-seq) identifies genome-wide binding sites of transcription factors, histone modifications, and other chromatin-associated proteins. After cross-linking, fragmentation, and immunoprecipitation with a target-specific antibody, enriched DNA fragments are sequenced. The analysis pipeline trims reads, aligns them to the reference genome, removes duplicates, calls peaks relative to an input control using MACS2, and generates normalised signal tracks for genome browser visualisation.
Pipeline Steps
Trim adapters and low-quality bases with fastp.
Align reads to the reference genome using Bowtie2.
Filter low-quality alignments and remove PCR duplicates.
Call peaks with MACS2 using matched input controls.
Filter peaks against a blacklist of problematic regions.
Generate normalised signal tracks (bigWig) with deepTools.
Compute quality metrics including FRiP (Fraction of Reads in Peaks).
Aggregate QC metrics into a MultiQC report.
Implementation
# Snakefile -- ChIP-seq Peak Calling Pipeline
configfile: "config.yaml"
SAMPLES = config["samples"] # ChIP samples
INPUTS = config["inputs"] # input controls per sample
REF = config["bowtie2_index"]
BLACKLIST = config["blacklist"]
GENOME = config.get("genome_size", "hs")
rule all:
input:
expand("results/peaks/{sample}_peaks.narrowPeak",
sample=SAMPLES),
expand("results/bigwig/{sample}.normalized.bw",
sample=SAMPLES),
"results/multiqc/multiqc_report.html"
rule fastp_trim:
input:
r1="data/{sample}_R1.fastq.gz",
r2="data/{sample}_R2.fastq.gz"
output:
r1="results/trimmed/{sample}_R1.trimmed.fastq.gz",
r2="results/trimmed/{sample}_R2.trimmed.fastq.gz",
json="results/trimmed/{sample}_fastp.json"
threads: 4
conda: "envs/qc.yaml"
shell:
"""
fastp -i {input.r1} -I {input.r2} \
-o {output.r1} -O {output.r2} \
-j {output.json} \
--detect_adapter_for_pe --thread {threads}
"""
rule bowtie2_align:
input:
r1="results/trimmed/{sample}_R1.trimmed.fastq.gz",
r2="results/trimmed/{sample}_R2.trimmed.fastq.gz",
index=REF
output:
bam="results/aligned/{sample}.sorted.bam"
threads: 16
conda: "envs/align.yaml"
shell:
"""
bowtie2 -x {input.index} \
-1 {input.r1} -2 {input.r2} \
--very-sensitive -X 1000 \
--threads {threads} \
| samtools sort -@ 4 -o {output.bam} -
samtools index {output.bam}
"""
rule filter_and_dedup:
input:
"results/aligned/{sample}.sorted.bam"
output:
bam="results/filtered/{sample}.filtered.bam",
metrics="results/filtered/{sample}.dup_metrics.txt"
threads: 8
conda: "envs/align.yaml"
shell:
"""
samtools view -@ {threads} -h -q 30 -f 2 -F 1804 \
{input} \
| samtools sort -@ 4 -o results/filtered/{wildcards.sample}.tmp.bam -
sambamba markdup -t {threads} --remove-duplicates \
results/filtered/{wildcards.sample}.tmp.bam {output.bam} \
2> {output.metrics}
samtools index {output.bam}
rm -f results/filtered/{wildcards.sample}.tmp.bam
"""
rule macs2_callpeak:
input:
treatment="results/filtered/{sample}.filtered.bam",
control=lambda wc: f"results/filtered/{INPUTS[wc.sample]}.filtered.bam"
output:
peaks="results/peaks/{sample}_peaks.narrowPeak",
xls="results/peaks/{sample}_peaks.xls"
params:
genome=GENOME,
name="{sample}"
conda: "envs/peaks.yaml"
shell:
"""
macs2 callpeak \
-t {input.treatment} -c {input.control} \
-f BAMPE -g {params.genome} \
-n {params.name} --outdir results/peaks/ \
-q 0.05
"""
rule filter_blacklist:
input:
peaks="results/peaks/{sample}_peaks.narrowPeak",
blacklist=BLACKLIST
output:
"results/peaks/{sample}_peaks.filtered.narrowPeak"
conda: "envs/peaks.yaml"
shell:
"""
bedtools intersect -v \
-a {input.peaks} -b {input.blacklist} \
> {output}
"""
rule bigwig:
input:
"results/filtered/{sample}.filtered.bam"
output:
"results/bigwig/{sample}.normalized.bw"
threads: 8
conda: "envs/peaks.yaml"
shell:
"""
bamCoverage -b {input} -o {output} \
--normalizeUsing RPGC \
--effectiveGenomeSize 2913022398 \
--binSize 10 --numberOfProcessors {threads} \
--extendReads 200
"""
rule frip_score:
input:
bam="results/filtered/{sample}.filtered.bam",
peaks="results/peaks/{sample}_peaks.filtered.narrowPeak"
output:
"results/qc/{sample}_frip.txt"
conda: "envs/peaks.yaml"
shell:
"""
total=$(samtools view -c {input.bam})
in_peaks=$(bedtools intersect -a {input.bam} -b {input.peaks} \
-u -ubam | samtools view -c)
echo -e "{wildcards.sample}\t${{in_peaks}}\t${{total}}" > {output}
"""
rule multiqc:
input:
expand("results/trimmed/{sample}_fastp.json",
sample=list(SAMPLES) + list(INPUTS.values())),
expand("results/filtered/{sample}.dup_metrics.txt",
sample=list(SAMPLES) + list(INPUTS.values())),
expand("results/peaks/{sample}_peaks.xls", sample=SAMPLES)
output:
"results/multiqc/multiqc_report.html"
conda: "envs/qc.yaml"
shell:
"multiqc results/ -o results/multiqc/ --force"
#!/usr/bin/env nextflow
nextflow.enable.dsl = 2
params.reads = "data/*_{R1,R2}.fastq.gz"
params.bt2_index = "ref/bt2_index"
params.blacklist = "ref/blacklist.bed"
params.genome_size = "hs"
params.outdir = "results"
params.input_csv = "samplesheet.csv" // sample_id,fastq_1,fastq_2,control_id
process FASTP {
tag "${sample_id}"
publishDir "${params.outdir}/trimmed", mode: 'copy'
cpus 4
input: tuple val(sample_id), path(reads)
output:
tuple val(sample_id), path("*_trimmed.fastq.gz"), emit: trimmed
path("*.json"), emit: json
script:
def (r1, r2) = reads
"""
fastp -i ${r1} -I ${r2} \
-o ${sample_id}_R1_trimmed.fastq.gz \
-O ${sample_id}_R2_trimmed.fastq.gz \
-j ${sample_id}_fastp.json \
--detect_adapter_for_pe --thread ${task.cpus}
"""
}
process BOWTIE2_ALIGN {
tag "${sample_id}"
publishDir "${params.outdir}/aligned", mode: 'copy'
cpus 16
input:
tuple val(sample_id), path(reads)
path(index)
output:
tuple val(sample_id), path("*.sorted.bam"), path("*.sorted.bam.bai"), emit: bam
script:
def (r1, r2) = reads
"""
bowtie2 -x ${index}/${index.name} \
-1 ${r1} -2 ${r2} \
--very-sensitive -X 1000 \
--threads ${task.cpus} \
| samtools sort -@ 4 -o ${sample_id}.sorted.bam -
samtools index ${sample_id}.sorted.bam
"""
}
process FILTER_DEDUP {
tag "${sample_id}"
publishDir "${params.outdir}/filtered", mode: 'copy'
cpus 8
input: tuple val(sample_id), path(bam), path(bai)
output: tuple val(sample_id), path("*.filtered.bam"), path("*.filtered.bam.bai"), emit: bam
script:
"""
samtools view -@ ${task.cpus} -h -q 30 -f 2 -F 1804 \
${bam} \
| samtools sort -@ 4 -o tmp.bam -
sambamba markdup -t ${task.cpus} --remove-duplicates \
tmp.bam ${sample_id}.filtered.bam
samtools index ${sample_id}.filtered.bam
rm -f tmp.bam
"""
}
process MACS2_CALLPEAK {
tag "${sample_id}"
publishDir "${params.outdir}/peaks", mode: 'copy'
input:
tuple val(sample_id), path(treatment_bam), path(treatment_bai)
tuple val(control_id), path(control_bam), path(control_bai)
output:
tuple val(sample_id), path("${sample_id}_peaks.narrowPeak"), emit: peaks
script:
"""
macs2 callpeak \
-t ${treatment_bam} -c ${control_bam} \
-f BAMPE -g ${params.genome_size} \
-n ${sample_id} --outdir . \
-q 0.05
"""
}
process FILTER_BLACKLIST {
tag "${sample_id}"
publishDir "${params.outdir}/peaks", mode: 'copy'
input:
tuple val(sample_id), path(peaks)
path(blacklist)
output:
tuple val(sample_id), path("*_filtered.narrowPeak"), emit: peaks
script:
"""
bedtools intersect -v \
-a ${peaks} -b ${blacklist} \
> ${sample_id}_filtered.narrowPeak
"""
}
process BIGWIG {
tag "${sample_id}"
publishDir "${params.outdir}/bigwig", mode: 'copy'
cpus 8
input: tuple val(sample_id), path(bam), path(bai)
output: path("*.normalized.bw"), emit: bw
script:
"""
bamCoverage -b ${bam} -o ${sample_id}.normalized.bw \
--normalizeUsing RPGC \
--effectiveGenomeSize 2913022398 \
--binSize 10 --numberOfProcessors ${task.cpus} \
--extendReads 200
"""
}
process MULTIQC {
publishDir "${params.outdir}/multiqc", mode: 'copy'
input: path('*')
output: path("multiqc_report.html")
script:
"""
multiqc . -o . --force
"""
}
workflow {
reads_ch = Channel.fromFilePairs(params.reads, checkIfExists: true)
bt2_index_ch = Channel.value(file(params.bt2_index))
blacklist_ch = Channel.value(file(params.blacklist))
// Parse samplesheet to get treatment-control pairs
samplesheet = Channel.fromPath(params.input_csv)
.splitCsv(header: true)
.map { row -> [row.sample_id, row.control_id] }
FASTP(reads_ch)
BOWTIE2_ALIGN(FASTP.out.trimmed, bt2_index_ch)
FILTER_DEDUP(BOWTIE2_ALIGN.out.bam)
// Pair treatment with control BAMs
treatment_ch = FILTER_DEDUP.out.bam
.join(samplesheet)
control_bams = FILTER_DEDUP.out.bam
paired_ch = treatment_ch
.map { sample_id, bam, bai, control_id -> [control_id, sample_id, bam, bai] }
.combine(control_bams, by: 0)
.map { control_id, sample_id, treat_bam, treat_bai, ctrl_bam, ctrl_bai ->
[[sample_id, treat_bam, treat_bai], [control_id, ctrl_bam, ctrl_bai]]
}
MACS2_CALLPEAK(
paired_ch.map { it[0] },
paired_ch.map { it[1] }
)
FILTER_BLACKLIST(MACS2_CALLPEAK.out.peaks, blacklist_ch)
BIGWIG(FILTER_DEDUP.out.bam)
ch_multiqc = FASTP.out.json.collect()
MULTIQC(ch_multiqc)
}
See Also
Snakemake – Snakemake workflow manager overview
Nextflow – Nextflow DSL2 workflow manager overview
ATAC-seq – ATAC-seq workflow (related chromatin profiling assay)
Bowtie2 – Bowtie2 aligner reference
Epigenomics – Epigenomics tools