Skip to content

Commit 5545e27

Browse files
committed
Add long-read barcode assignment via pbmm2
1 parent 451a049 commit 5545e27

4 files changed

Lines changed: 181 additions & 0 deletions

File tree

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
version: "0.6"
2+
3+
assignments:
4+
exampleLongRead:
5+
bc_length: 15
6+
long_read_input: "path/to/reads.bam"
7+
design_file: "path/to/design.fa"
8+
alignment_tool:
9+
split_number: 1
10+
tool: "pbmm2"
11+
configs:
12+
preset: "SUBREAD"
13+
min_concordance: 100.0
14+
pattern: "GCAAAGTGAACACATCGCTAAGCGAAAGCTAAG"
15+
design_check:
16+
fast: true
17+
sequence_collisions: false
18+
strand_sensitive:
19+
enable: false
20+
configs:
21+
default:
22+
min_support: 1
23+
fraction: 0.0

workflow/envs/pbmm2_pysam.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
channels:
2+
- conda-forge
3+
- bioconda
4+
dependencies:
5+
- pbmm2
6+
- pysam
7+
- biopython
8+
- python>=3.10
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
rule assignment_mapping_pbmm2_index:
2+
"""
3+
Create pbmm2 index from design reference.
4+
"""
5+
conda:
6+
getCondaEnv("pbmm2_pysam.yaml")
7+
input:
8+
ref="results/assignment/{assignment}/reference/reference.fa",
9+
check="results/assignment/{assignment}/design_check.done",
10+
output:
11+
"results/assignment/{assignment}/reference/reference.fa.mmi",
12+
log:
13+
temp("results/logs/assignment/mapping_pbmm2_index.{assignment}.log"),
14+
shell:
15+
"""
16+
pbmm2 index {input.ref} {output} &> {log}
17+
"""
18+
19+
20+
rule assignment_mapping_pbmm2_align:
21+
"""
22+
Align long reads (BAM or FASTA) to reference using pbmm2.
23+
"""
24+
conda:
25+
getCondaEnv("pbmm2_pysam.yaml")
26+
threads: 8
27+
input:
28+
reads=lambda wc: config["assignments"][wc.assignment]["long_read_input"],
29+
index="results/assignment/{assignment}/reference/reference.fa.mmi",
30+
output:
31+
temp("results/assignment/{assignment}/pbmm2/aligned.bam"),
32+
params:
33+
preset=lambda wc: config["assignments"][wc.assignment]["alignment_tool"]["configs"].get("preset", "SUBREAD"),
34+
min_concordance=lambda wc: config["assignments"][wc.assignment]["alignment_tool"]["configs"].get("min_concordance", 100.0),
35+
log:
36+
temp("results/logs/assignment/mapping_pbmm2_align.{assignment}.log"),
37+
shell:
38+
"""
39+
pbmm2 align {input.index} {input.reads} {output} \
40+
--preset {params.preset} \
41+
--sort \
42+
--best-n 1 \
43+
--min-concordance-perc {params.min_concordance} \
44+
-j {threads} &> {log}
45+
"""
46+
47+
48+
rule assignment_mapping_pbmm2_getBCs:
49+
"""
50+
Extract barcodes from aligned long reads. Produces the standard
51+
barcode TSV for downstream collection and filtering.
52+
"""
53+
conda:
54+
getCondaEnv("pbmm2_pysam.yaml")
55+
input:
56+
bam="results/assignment/{assignment}/pbmm2/aligned.bam",
57+
script=getScript("assignment/longread_extract.py"),
58+
output:
59+
temp("results/assignment/{assignment}/BCs/barcodes.pbmm2.0.tsv"),
60+
params:
61+
pattern=lambda wc: config["assignments"][wc.assignment]["alignment_tool"]["configs"].get(
62+
"pattern", "GCAAAGTGAACACATCGCTAAGCGAAAGCTAAG"
63+
),
64+
bc_length=lambda wc: config["assignments"][wc.assignment]["bc_length"],
65+
log:
66+
temp("results/logs/assignment/mapping_pbmm2_getBCs.{assignment}.log"),
67+
shell:
68+
"""
69+
export LC_ALL=C
70+
python {input.script} \
71+
--bam {input.bam} \
72+
--pattern {params.pattern} \
73+
--bc-length {params.bc_length} \
74+
--output /dev/stdout 2> {log} | \
75+
sort -k1,1 -k2,2 -k3,3 -S 7G > {output}
76+
"""
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
"""Extract barcodes from aligned long-read BAM.
2+
3+
Outputs a 3-column TSV (barcode, reference, metadata) matching the
4+
MPRAsnakeflow assignment format used by assignment_collectBCs and
5+
assignment_filter.
6+
"""
7+
8+
import argparse
9+
import logging
10+
import sys
11+
12+
import pysam
13+
14+
logging.basicConfig(
15+
level=logging.INFO,
16+
format="%(asctime)s [%(levelname)s] %(message)s",
17+
handlers=[logging.StreamHandler(sys.stderr)],
18+
)
19+
logger = logging.getLogger(__name__)
20+
21+
22+
def extract_barcode(seq, pattern, bc_length):
23+
"""Return barcode downstream of pattern, or None."""
24+
idx = seq.find(pattern)
25+
if idx == -1:
26+
return None
27+
start = idx + len(pattern)
28+
if start + bc_length > len(seq):
29+
return None
30+
return seq[start : start + bc_length]
31+
32+
33+
def main():
34+
parser = argparse.ArgumentParser(description="Extract barcodes from aligned long-read BAM")
35+
parser.add_argument("--bam", required=True, help="Aligned BAM from pbmm2")
36+
parser.add_argument("--pattern", required=True, help="Fixed sequence upstream of barcode")
37+
parser.add_argument("--bc-length", type=int, required=True, help="Barcode length")
38+
parser.add_argument("--output", required=True, help="Output TSV")
39+
args = parser.parse_args()
40+
41+
stats = {"total": 0, "mapped": 0, "unmapped": 0, "found": 0, "missing": 0}
42+
43+
out_fh = sys.stdout if args.output == "/dev/stdout" else open(args.output, "w")
44+
with pysam.AlignmentFile(args.bam, "rb") as bam, out_fh as out:
45+
for read in bam.fetch():
46+
stats["total"] += 1
47+
48+
if read.is_unmapped:
49+
stats["unmapped"] += 1
50+
continue
51+
52+
stats["mapped"] += 1
53+
seq = read.query_sequence
54+
if seq is None:
55+
continue
56+
57+
bc = extract_barcode(seq, args.pattern, args.bc_length)
58+
if bc:
59+
stats["found"] += 1
60+
cigar = read.cigarstring or "NA"
61+
mapq = read.mapping_quality
62+
out.write(f"{bc}\t{read.reference_name}\t{cigar};{mapq}\n")
63+
else:
64+
stats["missing"] += 1
65+
66+
t, m = stats["total"], stats["mapped"]
67+
logger.info("Total reads: %s", f"{t:,}")
68+
logger.info(" Mapped: %s (%.1f%%)", f"{m:,}", m / t * 100 if t else 0)
69+
logger.info(" Pattern found: %s (%.1f%%)", f"{stats['found']:,}", stats["found"] / m * 100 if m else 0)
70+
logger.info(" Pattern missing: %s", f"{stats['missing']:,}")
71+
72+
73+
if __name__ == "__main__":
74+
main()

0 commit comments

Comments
 (0)