Skip to content

Commit ce47836

Browse files
authored
fix: consider masked sites for dN and dS calculation (#38)
* mask sites before dN and dS calculation * fix import * move and rename rule * update Dockerfile with bedtools environment * filter problematic sites * merge rules to build BED * update Dockerfile * update showtext dpi equals the value in individual plots * avoid showtext dependency ggplot falls back to the default font if Noto Sans is unavailable. showtext started causing a cryptic segfault
1 parent fa5f624 commit ce47836

9 files changed

Lines changed: 108 additions & 27 deletions

File tree

Dockerfile

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
FROM condaforge/miniforge3:latest
22
LABEL io.github.snakemake.containerized="true"
3-
LABEL io.github.snakemake.conda_env_hash="ed26b44bdff1bc9f7a99d3830ae812bf262f78427b95a45fa1ac1ce255c5f054"
3+
LABEL io.github.snakemake.conda_env_hash="fb2a04b98efa75d8b830a16722a8717eb91a3c66659e3c78b90b0cee3a50db69"
44

55
# Step 2: Retrieve conda environments
66

@@ -117,12 +117,12 @@ COPY workflow/envs/quarto_render.yaml /conda-envs/96f3c1cec4b3ce5d72f708992272e9
117117

118118
# Conda environment:
119119
# source: workflow/envs/renv.yaml
120-
# prefix: /conda-envs/8ad6cdcf265d30289788da99d5bf9fff
120+
# prefix: /conda-envs/fe892aca096e6b2883923c8755f9ac77
121121
# channels:
122122
# - conda-forge
123123
# - bioconda
124124
# dependencies:
125-
# - r-base=4.3.2
125+
# - r-base=4.3.3
126126
# - r-tidyverse==2.0.0
127127
# - r-ggrepel==0.9.3
128128
# - r-ggpubr==0.6.0
@@ -133,10 +133,9 @@ COPY workflow/envs/quarto_render.yaml /conda-envs/96f3c1cec4b3ce5d72f708992272e9
133133
# - r-data.table==1.14.8
134134
# - r-future.apply==1.11.0
135135
# - r-scales==1.3.0
136-
# - r-showtext==0.9_6
137136
# - r-logger==0.2.2
138-
RUN mkdir -p /conda-envs/8ad6cdcf265d30289788da99d5bf9fff
139-
COPY workflow/envs/renv.yaml /conda-envs/8ad6cdcf265d30289788da99d5bf9fff/environment.yaml
137+
RUN mkdir -p /conda-envs/fe892aca096e6b2883923c8755f9ac77
138+
COPY workflow/envs/renv.yaml /conda-envs/fe892aca096e6b2883923c8755f9ac77/environment.yaml
140139

141140
# Conda environment:
142141
# source: workflow/envs/snpeff.yaml
@@ -149,6 +148,18 @@ COPY workflow/envs/renv.yaml /conda-envs/8ad6cdcf265d30289788da99d5bf9fff/enviro
149148
RUN mkdir -p /conda-envs/0adafb79cb1bec58ef4c77bf4cca4f95
150149
COPY workflow/envs/snpeff.yaml /conda-envs/0adafb79cb1bec58ef4c77bf4cca4f95/environment.yaml
151150

151+
# Conda environment:
152+
# source: workflow/envs/tools.yaml
153+
# prefix: /conda-envs/1f283441022c3c9d97669994a3c5e8bb
154+
# channels:
155+
# - conda-forge
156+
# - bioconda
157+
# dependencies:
158+
# - bedtools==2.31.1
159+
# - bcftools==1.23
160+
RUN mkdir -p /conda-envs/1f283441022c3c9d97669994a3c5e8bb
161+
COPY workflow/envs/tools.yaml /conda-envs/1f283441022c3c9d97669994a3c5e8bb/environment.yaml
162+
152163
# Conda environment:
153164
# source: workflow/envs/var_calling.yaml
154165
# prefix: /conda-envs/81e46c677a6cc0618c93963d57d17d3f
@@ -173,8 +184,9 @@ RUN conda env create --prefix /conda-envs/9c24a867826615972cc288081976e7fc --fil
173184
conda env create --prefix /conda-envs/04a3347f94ddf7e21c34bc49e5246076 --file /conda-envs/04a3347f94ddf7e21c34bc49e5246076/environment.yaml && \
174185
conda env create --prefix /conda-envs/fb978640cd765c8a63bbcdc01f3a206b --file /conda-envs/fb978640cd765c8a63bbcdc01f3a206b/environment.yaml && \
175186
conda env create --prefix /conda-envs/96f3c1cec4b3ce5d72f708992272e9c1 --file /conda-envs/96f3c1cec4b3ce5d72f708992272e9c1/environment.yaml && \
176-
conda env create --prefix /conda-envs/8ad6cdcf265d30289788da99d5bf9fff --file /conda-envs/8ad6cdcf265d30289788da99d5bf9fff/environment.yaml && \
187+
conda env create --prefix /conda-envs/fe892aca096e6b2883923c8755f9ac77 --file /conda-envs/fe892aca096e6b2883923c8755f9ac77/environment.yaml && \
177188
conda env create --prefix /conda-envs/0adafb79cb1bec58ef4c77bf4cca4f95 --file /conda-envs/0adafb79cb1bec58ef4c77bf4cca4f95/environment.yaml && \
189+
conda env create --prefix /conda-envs/1f283441022c3c9d97669994a3c5e8bb --file /conda-envs/1f283441022c3c9d97669994a3c5e8bb/environment.yaml && \
178190
conda env create --prefix /conda-envs/81e46c677a6cc0618c93963d57d17d3f --file /conda-envs/81e46c677a6cc0618c93963d57d17d3f/environment.yaml && \
179191
conda clean --all -y
180192

config/design_plots.R

Lines changed: 4 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,14 @@
1-
# Jordi Sevilla
2-
31
library(ggplot2)
4-
library(showtext)
5-
6-
# Ajustes ####
7-
showtext_auto(enable = FALSE)
8-
showtext_opts(dpi = 200)
9-
10-
# Tema
11-
font_add_google("Noto Sans", "Noto Sans")
12-
showtext_auto()
132

3+
# Theme
144
theme_set(theme_minimal())
15-
165
theme_update(
17-
text = element_text(size = 16, family = "Noto Sans"),
18-
axis.title = element_text(size = 16),
6+
text = element_text(size = 10, family = "Noto Sans"),
7+
axis.title = element_text(size = 12),
198
axis.line = element_line(
209
linewidth = 0.5,
2110
colour = "grey40",
22-
linetype = 1,
23-
arrow = arrow(length = unit(3, "mm"))
11+
linetype = 1
2412
),
2513
panel.grid = element_line(linewidth = 0.17, color = "lightgray")
2614
)

workflow/envs/bedtools.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
channels:
2+
- conda-forge
3+
- bioconda
4+
dependencies:
5+
- bedtools==2.31.1

workflow/envs/renv.yaml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ channels:
22
- conda-forge
33
- bioconda
44
dependencies:
5-
- r-base=4.3.2
5+
- r-base=4.3.3
66
- r-tidyverse==2.0.0
77
- r-ggrepel==0.9.3
88
- r-ggpubr==0.6.0
@@ -13,5 +13,4 @@ dependencies:
1313
- r-data.table==1.14.8
1414
- r-future.apply==1.11.0
1515
- r-scales==1.3.0
16-
- r-showtext==0.9_6
1716
- r-logger==0.2.2

workflow/envs/tools.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
channels:
2+
- conda-forge
3+
- bioconda
4+
dependencies:
5+
- bedtools==2.31.1
6+
- bcftools==1.23

workflow/rules/evolution.smk

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ rule n_s_sites:
2121
gb_qualifier_display = "gene",
2222
input:
2323
fasta = OUTDIR/f"{OUTPUT_NAME}.ancestor.fasta",
24+
masked = OUTDIR / "sites_masked.bed",
2425
gb = OUTDIR/"reference.cds.gb",
2526
genetic_code = Path(config["GENETIC_CODE_JSON"]).resolve(),
2627
output:
@@ -35,6 +36,7 @@ rule calculate_dnds:
3536
conda: "../envs/renv.yaml"
3637
input:
3738
n_s_sites = OUTDIR/f"{OUTPUT_NAME}.ancestor.n_s.sites.csv",
39+
masked = OUTDIR / "sites_masked.bed",
3840
variants = OUTDIR/f"{OUTPUT_NAME}.variants.tsv",
3941
metadata = config["METADATA"]
4042
output:

workflow/rules/sites.smk

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,21 @@
1+
rule problematic_vcf_to_bed:
2+
conda:
3+
"../envs/tools.yaml"
4+
params:
5+
filters = ["mask"],
6+
input:
7+
vcf = lambda wildcards: select_problematic_vcf(),
8+
output:
9+
vcf = temp(OUTDIR / "sites_masked.vcf"),
10+
bed = temp(OUTDIR / "sites_masked.bed"),
11+
log:
12+
LOGDIR / "problematic_vcf_to_bed" / "log.txt",
13+
shell:
14+
"FILTER_STR=$(echo \"{params.filters}\" | tr ' ' ',') && "
15+
"bcftools view -f \"$FILTER_STR\" {input.vcf} >{output.vcf} 2>{log} && "
16+
"bedtools merge -i {output.vcf} >{output.bed} 2>{log}"
17+
18+
119
rule bcftools_mpileup_all_sites:
220
threads: 1
321
conda: "../envs/var_calling.yaml"

workflow/scripts/calculate_dnds.R

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ sink(log, type = "output")
88
library(dplyr)
99
library(readr)
1010
library(tidyr)
11+
library(purrr)
1112
library(logger)
1213

1314
log_threshold(INFO)
@@ -37,6 +38,21 @@ metadata <- read_delim(snakemake@input[["metadata"]]) %>%
3738
select(ID, interval) %>%
3839
rename(SAMPLE = ID)
3940

41+
log_info("Reading masked sites BED")
42+
masked <- read_tsv(
43+
snakemake@input$masked,
44+
col_names = c("chrom", "start", "end"),
45+
col_types = "cii",
46+
comment = "#"
47+
) %>%
48+
mutate(POS = map2(start + 1, end, seq.int)) %>%
49+
unnest(POS) %>%
50+
pull(POS)
51+
52+
log_info("Filtering variants on masked sites")
53+
variants <- variants %>%
54+
filter(!POS %in% masked, !is.na(SYNONYMOUS))
55+
4056
log_debug("Adding metadata to variants table")
4157
variants <- left_join(variants, metadata)
4258

workflow/scripts/n_s_sites_from_fasta.py

Lines changed: 37 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,18 +8,50 @@
88
import pandas as pd
99
from Bio import SeqIO
1010
from Bio.SeqRecord import SeqRecord
11+
from Bio.SeqFeature import SeqFeature
1112
from Bio.Seq import Seq
1213

1314

1415
NTS = ("A", "C", "G", "T")
16+
MASK_CHR = "N"
1517

1618

1719
def read_monofasta(path: str) -> SeqRecord:
1820
return SeqIO.read(path, format="fasta")
1921

2022

23+
def read_masked_positions(bed_path: str) -> set:
24+
masked = set()
25+
with open(bed_path) as fh:
26+
for line in fh:
27+
if line.startswith("#") or not line.strip():
28+
continue
29+
parts = line.strip().split("\t")
30+
start, end = int(parts[1]), int(parts[2])
31+
masked.update(range(start, end))
32+
return masked
33+
34+
35+
def mask_feature(feature: SeqFeature, record: SeqRecord, masked_positions: set) -> SeqRecord:
36+
extracted = feature.extract(record)
37+
seq_chars = list(str(extracted.seq).upper())
38+
# Handle reverse strand and compound locations
39+
genomic_positions = []
40+
for part in feature.location.parts:
41+
positions = list(range(int(part.start), int(part.end)))
42+
if part.strand == -1:
43+
positions.reverse()
44+
genomic_positions.extend(positions)
45+
# Mask sites on sequence
46+
for i, pos in enumerate(genomic_positions):
47+
if seq_chars[i] not in NTS or pos in masked_positions:
48+
seq_chars[i] = MASK_CHR
49+
extracted.seq = Seq("".join(seq_chars))
50+
return extracted
51+
52+
2153
def split_into_codons(seq: Seq) -> list:
22-
"""Split the complete CDS feature in to a list of codons"""
54+
"""Split the complete CDS feature in to a list of codons (if ACGT only)"""
2355
return [
2456
seq[i:i + 3] for i in range(0, len(seq)-2, 3) if all(char in NTS for char in seq[i:i + 3])
2557
]
@@ -79,6 +111,9 @@ def main():
79111
with open(snakemake.input.genetic_code) as f:
80112
genetic_code = json.load(f)
81113

114+
logging.info("Reading masked sites BED")
115+
masked = read_masked_positions(snakemake.input.masked)
116+
82117
logging.info("Reading GenBank file")
83118
gb = SeqIO.read(snakemake.input.gb, format="gb")
84119

@@ -95,7 +130,7 @@ def main():
95130
logging.warning(f"Identifier '{identifier}' is already among coding records and will not be replaced by feature at {feature.location}")
96131
else:
97132
logging.debug(f"Adding feature")
98-
coding_records[identifier] = feature.extract(record)
133+
coding_records[identifier] = mask_feature(feature, record, masked)
99134

100135
logging.info(f"Splitting {len(coding_records)} records into codons")
101136
codons = get_feature_codons(coding_records)

0 commit comments

Comments
 (0)