# Source: pyflow-ChIPseq/config.yaml (crazyhottommy/pyflow-ChIPseq, master branch)
# =====================================================================
# pyflow-ChIPseq Configuration File (Modernized 2025)
# =====================================================================

# ---------------------------------------------------------------------
# User Information
# ---------------------------------------------------------------------
# Email for notifications (optional - used by cluster systems).
# Placeholder address: replace it with your own. Keep the value quoted so
# it is always read as one string (an unquoted value starting with '['
# is parsed as a YAML list, not as text).
email: 'user@example.com'

# ---------------------------------------------------------------------
# Pipeline Mode
# ---------------------------------------------------------------------
# Input type: true  = start from FASTQ files
#             false = start from pre-aligned BAM files
from_fastq: true

# Sequencing layout: true = paired-end, false = single-end
paired_end: false

# Read-length class: true for reads >70bp, false for reads <70bp.
# Selects the aligner: bwa mem (long reads) or bwa aln (short reads).
long_reads: true

# ---------------------------------------------------------------------
# Sample Configuration
# ---------------------------------------------------------------------
# Control sample name (e.g., "Input", "IgG")
control: Input

# Samples JSON file, as generated by sample2json.py
SAMPLES_JSON: ./samples.json

# Legacy samples file; the modernized version does not use it
SAMPLES: ./SRR.txt

# ---------------------------------------------------------------------
# Reference Genome
# ---------------------------------------------------------------------
# Path to reference genome FASTA file
# Example for mouse mm10: /path/to/genomes/mm10/mm10.fa
# Example for human hg38: /path/to/genomes/hg38/hg38.fa
ref_fa: ./data/hg38.fa

# Genome size for MACS3 (narrow peaks)
# Options: 'hs' (human), 'mm' (mouse), 'ce' (C. elegans), 'dm' (Drosophila)
# Or specify genome size in bp (e.g., 2.7e9 for human)
# Must describe the same organism as ref_fa (human hg38 here); keep it in
# step with rose_g and chromHmm_g as well when switching genomes.
macs_g: hs

# Genome size for MACS3 (broad peaks) - usually same as macs_g
macs2_g: hs

# ---------------------------------------------------------------------
# Peak Calling Parameters
# ---------------------------------------------------------------------
# NOTE(review): the keys in this section are named "*_pvalue" while the
# descriptions below speak of q-values (FDR). Which MACS3 cutoff option
# actually receives these numbers is not visible from this file - confirm
# against the workflow rules before relying on either reading.

# Q-value (minimum FDR) cutoff for MACS3 narrow peak calling
# Default: 0.05 (use smaller values like 0.01 for more stringent calling)
macs_pvalue: 0.05

# Q-value cutoff for MACS3 broad peak calling
macs2_pvalue: 0.05

# Broad peak cutoff (for calling peaks in broad histone marks like H3K27me3)
# Looser than the two cutoffs above (0.1 vs 0.05).
macs2_pvalue_broad: 0.1

# ---------------------------------------------------------------------
# Downsampling
# ---------------------------------------------------------------------
# When true, samples are downsampled so that read depth is normalized
# across samples
downsample: true

# Read count to aim for after downsampling (default: 50 million).
# A sample with fewer reads than this keeps all of its reads.
target_reads: 50000000

# ---------------------------------------------------------------------
# Computational Resources
# ---------------------------------------------------------------------
# Maximum threads for alignment (BWA).
# Set based on your available CPUs (e.g., 8 for an 8-core system).
# Note: Actual threads used will be min(this value, --cores value), so
# raising this above the --cores limit has no effect.
max_threads_align: 8

# Maximum threads for other multi-threaded tools.
# Presumably capped by --cores in the same way - TODO confirm against
# the workflow rules.
max_threads_other: 4

# ---------------------------------------------------------------------
# Quality Control Options
# ---------------------------------------------------------------------
# Toggle phantompeakqualtools, the ChIP-seq quality assessment step that
# provides fragment length estimation and quality metrics.
# Switch to false if R/Bioconductor dependency issues come up.
run_phantompeakqual: true

# ---------------------------------------------------------------------
# ROSE2 Super-Enhancer Calling (Optional)
# ---------------------------------------------------------------------
# ROSE2 is installed via conda - no path configuration needed.
# Genome assembly for ROSE2 (use uppercase: HG38, HG19, MM10, MM9).
# Should name the same assembly as the reference FASTA set in ref_fa.
# See: https://github.com/crazyhottommy/ROSE2
rose_g: HG38

# ---------------------------------------------------------------------
# ChromHMM Chromatin State Modeling (Optional)
# ---------------------------------------------------------------------
# Switch for the ChromHMM analysis (true = enabled, false = disabled)
chromHMM: false

# Bin size in bp that ChromHMM uses to discretize the genome
binsize: 200

# How many chromatin states the model should learn
state: 15

# ChromHMM genome assembly (must match the chromosome sizes file)
chromHmm_g: hg38

# Histone marks that go into the ChromHMM model, given as a single
# space-delimited string; names must match the factor names in your metadata.
# Example: "H3K4me1 H3K4me3 H3K27ac H3K27me3 H3K9me3"
histone_for_chromHMM: 'K4me1 K4me3 K27ac K27me3'

# ---------------------------------------------------------------------
# Legacy Settings (deprecated in modernized version)
# ---------------------------------------------------------------------
# Superseded by Snakemake profiles: cluster.json is no longer used.
# For cluster configuration, see the profiles/slurm/ directory.
CLUSTER_JSON: ./cluster.json