ChIP-seq

Overview

Chromatin Immunoprecipitation followed by sequencing (ChIP-seq) identifies genome-wide binding sites of transcription factors, histone modifications, and other chromatin-associated proteins. After cross-linking, fragmentation, and immunoprecipitation with a target-specific antibody, enriched DNA fragments are sequenced. The analysis pipeline trims reads, aligns them to the reference genome, removes duplicates, calls peaks relative to an input control using MACS2, and generates normalised signal tracks for genome browser visualisation.

Pipeline Steps

Trim adapters and low-quality bases with fastp.
Align reads to the reference genome using Bowtie2.
Filter low-quality alignments and remove PCR duplicates.
Call peaks with MACS2 using matched input controls.
Filter peaks against a blacklist of problematic regions.
Generate normalised signal tracks (bigWig) with deepTools.
Compute quality metrics including FRiP (Fraction of Reads in Peaks).
Aggregate QC metrics into a MultiQC report.

Implementation

# Snakefile -- ChIP-seq Peak Calling Pipeline
configfile: "config.yaml"

SAMPLES   = config["samples"]     # ChIP samples
INPUTS    = config["inputs"]      # input controls per sample
REF       = config["bowtie2_index"]
BLACKLIST = config["blacklist"]
GENOME    = config.get("genome_size", "hs")

rule all:
    input:
        expand("results/peaks/{sample}_peaks.narrowPeak",
               sample=SAMPLES),
        expand("results/bigwig/{sample}.normalized.bw",
               sample=SAMPLES),
        "results/multiqc/multiqc_report.html"

rule fastp_trim:
    input:
        r1="data/{sample}_R1.fastq.gz",
        r2="data/{sample}_R2.fastq.gz"
    output:
        r1="results/trimmed/{sample}_R1.trimmed.fastq.gz",
        r2="results/trimmed/{sample}_R2.trimmed.fastq.gz",
        json="results/trimmed/{sample}_fastp.json"
    threads: 4
    conda: "envs/qc.yaml"
    shell:
        """
        fastp -i {input.r1} -I {input.r2} \
          -o {output.r1} -O {output.r2} \
          -j {output.json} \
          --detect_adapter_for_pe --thread {threads}
        """

rule bowtie2_align:
    input:
        r1="results/trimmed/{sample}_R1.trimmed.fastq.gz",
        r2="results/trimmed/{sample}_R2.trimmed.fastq.gz",
        index=REF
    output:
        bam="results/aligned/{sample}.sorted.bam"
    threads: 16
    conda: "envs/align.yaml"
    shell:
        """
        bowtie2 -x {input.index} \
          -1 {input.r1} -2 {input.r2} \
          --very-sensitive -X 1000 \
          --threads {threads} \
          | samtools sort -@ 4 -o {output.bam} -
        samtools index {output.bam}
        """

rule filter_and_dedup:
    input:
        "results/aligned/{sample}.sorted.bam"
    output:
        bam="results/filtered/{sample}.filtered.bam",
        metrics="results/filtered/{sample}.dup_metrics.txt"
    threads: 8
    conda: "envs/align.yaml"
    shell:
        """
        samtools view -@ {threads} -h -q 30 -f 2 -F 1804 \
          {input} \
          | samtools sort -@ 4 -o results/filtered/{wildcards.sample}.tmp.bam -
        sambamba markdup -t {threads} --remove-duplicates \
          results/filtered/{wildcards.sample}.tmp.bam {output.bam} \
          2> {output.metrics}
        samtools index {output.bam}
        rm -f results/filtered/{wildcards.sample}.tmp.bam
        """

rule macs2_callpeak:
    input:
        treatment="results/filtered/{sample}.filtered.bam",
        control=lambda wc: f"results/filtered/{INPUTS[wc.sample]}.filtered.bam"
    output:
        peaks="results/peaks/{sample}_peaks.narrowPeak",
        xls="results/peaks/{sample}_peaks.xls"
    params:
        genome=GENOME,
        name="{sample}"
    conda: "envs/peaks.yaml"
    shell:
        """
        macs2 callpeak \
          -t {input.treatment} -c {input.control} \
          -f BAMPE -g {params.genome} \
          -n {params.name} --outdir results/peaks/ \
          -q 0.05
        """

rule filter_blacklist:
    input:
        peaks="results/peaks/{sample}_peaks.narrowPeak",
        blacklist=BLACKLIST
    output:
        "results/peaks/{sample}_peaks.filtered.narrowPeak"
    conda: "envs/peaks.yaml"
    shell:
        """
        bedtools intersect -v \
          -a {input.peaks} -b {input.blacklist} \
          > {output}
        """

rule bigwig:
    input:
        "results/filtered/{sample}.filtered.bam"
    output:
        "results/bigwig/{sample}.normalized.bw"
    threads: 8
    conda: "envs/peaks.yaml"
    shell:
        """
        bamCoverage -b {input} -o {output} \
          --normalizeUsing RPGC \
          --effectiveGenomeSize 2913022398 \
          --binSize 10 --numberOfProcessors {threads} \
          --extendReads 200
        """

rule frip_score:
    input:
        bam="results/filtered/{sample}.filtered.bam",
        peaks="results/peaks/{sample}_peaks.filtered.narrowPeak"
    output:
        "results/qc/{sample}_frip.txt"
    conda: "envs/peaks.yaml"
    shell:
        """
        total=$(samtools view -c {input.bam})
        in_peaks=$(bedtools intersect -a {input.bam} -b {input.peaks} \
          -u -ubam | samtools view -c)
        echo -e "{wildcards.sample}\t${{in_peaks}}\t${{total}}" > {output}
        """

rule multiqc:
    input:
        expand("results/trimmed/{sample}_fastp.json",
               sample=list(SAMPLES) + list(INPUTS.values())),
        expand("results/filtered/{sample}.dup_metrics.txt",
               sample=list(SAMPLES) + list(INPUTS.values())),
        expand("results/peaks/{sample}_peaks.xls", sample=SAMPLES)
    output:
        "results/multiqc/multiqc_report.html"
    conda: "envs/qc.yaml"
    shell:
        "multiqc results/ -o results/multiqc/ --force"

#!/usr/bin/env nextflow
nextflow.enable.dsl = 2

params.reads       = "data/*_{R1,R2}.fastq.gz"
params.bt2_index   = "ref/bt2_index"
params.blacklist   = "ref/blacklist.bed"
params.genome_size = "hs"
params.outdir      = "results"
params.input_csv   = "samplesheet.csv"  // sample_id,fastq_1,fastq_2,control_id

process FASTP {
    tag "${sample_id}"
    publishDir "${params.outdir}/trimmed", mode: 'copy'
    cpus 4
    input:  tuple val(sample_id), path(reads)
    output:
        tuple val(sample_id), path("*_trimmed.fastq.gz"), emit: trimmed
        path("*.json"), emit: json
    script:
    def (r1, r2) = reads
    """
    fastp -i ${r1} -I ${r2} \
      -o ${sample_id}_R1_trimmed.fastq.gz \
      -O ${sample_id}_R2_trimmed.fastq.gz \
      -j ${sample_id}_fastp.json \
      --detect_adapter_for_pe --thread ${task.cpus}
    """
}

process BOWTIE2_ALIGN {
    tag "${sample_id}"
    publishDir "${params.outdir}/aligned", mode: 'copy'
    cpus 16
    input:
        tuple val(sample_id), path(reads)
        path(index)
    output:
        tuple val(sample_id), path("*.sorted.bam"), path("*.sorted.bam.bai"), emit: bam
    script:
    def (r1, r2) = reads
    """
    bowtie2 -x ${index}/${index.name} \
      -1 ${r1} -2 ${r2} \
      --very-sensitive -X 1000 \
      --threads ${task.cpus} \
      | samtools sort -@ 4 -o ${sample_id}.sorted.bam -
    samtools index ${sample_id}.sorted.bam
    """
}

process FILTER_DEDUP {
    tag "${sample_id}"
    publishDir "${params.outdir}/filtered", mode: 'copy'
    cpus 8
    input:  tuple val(sample_id), path(bam), path(bai)
    output: tuple val(sample_id), path("*.filtered.bam"), path("*.filtered.bam.bai"), emit: bam
    script:
    """
    samtools view -@ ${task.cpus} -h -q 30 -f 2 -F 1804 \
      ${bam} \
      | samtools sort -@ 4 -o tmp.bam -
    sambamba markdup -t ${task.cpus} --remove-duplicates \
      tmp.bam ${sample_id}.filtered.bam
    samtools index ${sample_id}.filtered.bam
    rm -f tmp.bam
    """
}

process MACS2_CALLPEAK {
    tag "${sample_id}"
    publishDir "${params.outdir}/peaks", mode: 'copy'
    input:
        tuple val(sample_id), path(treatment_bam), path(treatment_bai)
        tuple val(control_id), path(control_bam), path(control_bai)
    output:
        tuple val(sample_id), path("${sample_id}_peaks.narrowPeak"), emit: peaks
    script:
    """
    macs2 callpeak \
      -t ${treatment_bam} -c ${control_bam} \
      -f BAMPE -g ${params.genome_size} \
      -n ${sample_id} --outdir . \
      -q 0.05
    """
}

process FILTER_BLACKLIST {
    tag "${sample_id}"
    publishDir "${params.outdir}/peaks", mode: 'copy'
    input:
        tuple val(sample_id), path(peaks)
        path(blacklist)
    output:
        tuple val(sample_id), path("*_filtered.narrowPeak"), emit: peaks
    script:
    """
    bedtools intersect -v \
      -a ${peaks} -b ${blacklist} \
      > ${sample_id}_filtered.narrowPeak
    """
}

process BIGWIG {
    tag "${sample_id}"
    publishDir "${params.outdir}/bigwig", mode: 'copy'
    cpus 8
    input:  tuple val(sample_id), path(bam), path(bai)
    output: path("*.normalized.bw"), emit: bw
    script:
    """
    bamCoverage -b ${bam} -o ${sample_id}.normalized.bw \
      --normalizeUsing RPGC \
      --effectiveGenomeSize 2913022398 \
      --binSize 10 --numberOfProcessors ${task.cpus} \
      --extendReads 200
    """
}

process MULTIQC {
    publishDir "${params.outdir}/multiqc", mode: 'copy'
    input:  path('*')
    output: path("multiqc_report.html")
    script:
    """
    multiqc . -o . --force
    """
}

workflow {
    reads_ch     = Channel.fromFilePairs(params.reads, checkIfExists: true)
    bt2_index_ch = Channel.value(file(params.bt2_index))
    blacklist_ch = Channel.value(file(params.blacklist))

    // Parse samplesheet to get treatment-control pairs
    samplesheet = Channel.fromPath(params.input_csv)
        .splitCsv(header: true)
        .map { row -> [row.sample_id, row.control_id] }

    FASTP(reads_ch)
    BOWTIE2_ALIGN(FASTP.out.trimmed, bt2_index_ch)
    FILTER_DEDUP(BOWTIE2_ALIGN.out.bam)

    // Pair treatment with control BAMs
    treatment_ch = FILTER_DEDUP.out.bam
        .join(samplesheet)
    control_bams = FILTER_DEDUP.out.bam
    paired_ch = treatment_ch
        .map { sample_id, bam, bai, control_id -> [control_id, sample_id, bam, bai] }
        .combine(control_bams, by: 0)
        .map { control_id, sample_id, treat_bam, treat_bai, ctrl_bam, ctrl_bai ->
            [[sample_id, treat_bam, treat_bai], [control_id, ctrl_bam, ctrl_bai]]
        }

    MACS2_CALLPEAK(
        paired_ch.map { it[0] },
        paired_ch.map { it[1] }
    )
    FILTER_BLACKLIST(MACS2_CALLPEAK.out.peaks, blacklist_ch)
    BIGWIG(FILTER_DEDUP.out.bam)

    ch_multiqc = FASTP.out.json.collect()
    MULTIQC(ch_multiqc)
}

ChIP-seq

Overview

Pipeline Steps

Implementation

See Also