Skip to content

Commit ec3c346

Browse files
committed
fix and test for files with no gene id in attributes
1 parent 805c152 commit ec3c346

9 files changed

Lines changed: 86 additions & 34 deletions

File tree

.gitlab-ci.yml

Lines changed: 31 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ docker-run:
4545
- MATRIX_NAME: [
4646
"fusions", "differential_expression", "isoforms",
4747
"only_differential_expression", "differential_expression_gff3",
48-
"ncbi_gzip", "denovo"
48+
"ncbi_gzip", "denovo", "ncbi_no_gene_id", "ensembl_with_versions"
4949
]
5050
rules:
5151
# NOTE As we're overriding the rules block for the included docker-run
@@ -109,15 +109,41 @@ docker-run:
109109
build_minimap_index,get_transcriptome,merge_gff_bundles,run_gffcompare,build_minimap_index,split_bam
110110
- if: $MATRIX_NAME == "ncbi_gzip"
111111
variables:
112-
NF_BEFORE_SCRIPT: wget -O differential_expression.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-isoforms/differential_expression.tar.gz && tar -xzvf differential_expression.tar.gz
112+
NF_BEFORE_SCRIPT: wget -O differential_expression_ncbi.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-isoforms/differential_expression_ncbi.tar.gz && tar -xzvf differential_expression_ncbi.tar.gz
113113
NF_WORKFLOW_OPTS: "-executor.\\$$local.memory 16GB \
114-
--fastq differential_expression/differential_expression_fastq \
114+
--fastq differential_expression_ncbi/differential_expression_fastq \
115115
--transcriptome-source precomputed \
116116
--de_analysis \
117-
--ref_genome differential_expression/GRCh38.p14.NCBI_test.fna.gz \
118-
--ref_annotation differential_expression/GRCh38.p14_NCBI_test.gtf.gz \
117+
--ref_genome differential_expression_ncbi/GRCh38.p14.NCBI_test.fna.gz \
118+
--ref_annotation differential_expression_ncbi/GRCh38.p14_NCBI_test.gtf.gz \
119119
--direct_rna --minimap_index_opts '-w 25' \
120120
--transcriptome_assembly false --sample_sheet test_data/sample_sheet.csv"
121121
NF_IGNORE_PROCESSES: >
122122
preprocess_reads,merge_transcriptomes,assemble_transcripts,
123123
build_minimap_index,get_transcriptome,merge_gff_bundles,run_gffcompare,build_minimap_index,split_bam
124+
- if: $MATRIX_NAME == "ncbi_no_gene_id"
125+
variables:
126+
NF_BEFORE_SCRIPT: wget -O differential_expression_ncbi.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-isoforms/differential_expression_ncbi.tar.gz && tar -xzvf differential_expression_ncbi.tar.gz
127+
NF_WORKFLOW_OPTS: "-executor.\\$$local.memory 16GB \
128+
--fastq differential_expression_ncbi/differential_expression_fastq \
129+
--transcriptome-source precomputed --de_analysis \
130+
--ref_genome differential_expression_ncbi/GCF_000001405.40_GRCh38.p14_genomic.fna.gz \
131+
--ref_annotation differential_expression_ncbi/GCF_000001405.40_GRCh38.p14_genomic.gff.gz \
132+
--direct_rna --ref_transcriptome differential_expression_ncbi/GCF_000001405.40_GRCh38.p14_rna.fna.gz \
133+
--transcriptome_assembly false --sample_sheet test_data/sample_sheet.csv"
134+
NF_IGNORE_PROCESSES: >
135+
preprocess_reads,merge_transcriptomes,assemble_transcripts,
136+
build_minimap_index,get_transcriptome,merge_gff_bundles,run_gffcompare,build_minimap_index,split_bam
137+
- if: $MATRIX_NAME == "ensembl_with_versions"
138+
variables:
139+
NF_BEFORE_SCRIPT: wget -O differential_expression.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-isoforms/differential_expression.tar.gz && tar -xzvf differential_expression.tar.gz
140+
NF_WORKFLOW_OPTS: "-executor.\\$$local.memory 16GB \
141+
--fastq differential_expression/differential_expression_fastq \
142+
--transcriptome-source precomputed --de_analysis \
143+
--ref_genome differential_expression/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz \
144+
--ref_annotation differential_expression/Homo_sapiens.GRCh38.109.gtf.gz \
145+
--direct_rna --ref_transcriptome differential_expression/Homo_sapiens.GRCh38.cdna.all.fa.gz \
146+
--transcriptome_assembly false --sample_sheet test_data/sample_sheet.csv"
147+
NF_IGNORE_PROCESSES: >
148+
preprocess_reads,merge_transcriptomes,assemble_transcripts,
149+
build_minimap_index,get_transcriptome,merge_gff_bundles,run_gffcompare,build_minimap_index,split_bam

CHANGELOG.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,15 @@ All notable changes to this project will be documented in this file.
44
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
55
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
66

7-
## [unreleased]
7+
## [v0.2.1]
88
### Changed
99
- Any sample aliases that contain spaces will be replaced with underscores.
10+
- Updated documentation to explain we only support Ensembl, NCBI and ENCODE annotation file types.
1011

1112
### Fixed
1213
- Documentation parameter examples corrected.
14+
- Handling for annotation files that use gene as gene_id attribute.
15+
- Handling for Ensembl annotation files.
1316

1417
## [v0.2.0]
1518
### Changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ Differential gene expression is sensitive to the input data quantity and quality
8282
- Directory containing cDNA/direct RNA reads. Or a directory containing subdirectories each with reads from different samples
8383
(in fastq/fastq.gz format)
8484
- Reference genome in fasta format (required for reference-based assembly).
85-
- Optional reference annotation in GFF2/3 format (extensions allowed are .gtf(.gz), .gff(.gz), .gff3(.gz)) (required for differential expression analysis `--de_analysis`).
85+
- Optional reference annotation in GFF2/3 format (extensions allowed are .gtf(.gz), .gff(.gz), .gff3(.gz)) (required for differential expression analysis `--de_analysis`). Only annotation files from [Encode](https://www.encodeproject.org), [Ensembl](https://www.ensembl.org/index.html) and [NCBI](https://www.ncbi.nlm.nih.gov/) are supported.
8686
- For fusion detection, JAFFAL reference files (see Quickstart)
8787

8888

bin/de_analysis.R

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ min_samps_feature_expr <- args[3]
99
min_gene_expr <- args[4]
1010
min_feature_expr <- args[5]
1111
annotation_type <- args[6]
12+
strip_version <- args[7]
1213

1314
cat("Loading counts, conditions and parameters.\n")
1415
cts <- as.matrix(read.csv("merged/all_counts.tsv", sep="\t", row.names="Reference", stringsAsFactors=FALSE))
@@ -27,15 +28,10 @@ txdf <- select(txdb, keys(txdb,"GENEID"), "TXNAME", "GENEID")
2728
tab <- table(txdf$GENEID)
2829
txdf$ntx<- tab[match(txdf$GENEID, names(tab))]
2930

30-
strip_version<-function(x) {
31-
tmp<-data.frame(strsplit(x,".", fixed=TRUE), stringsAsFactors=FALSE)
32-
tmp<-as.vector(tmp[1,])
33-
colnames(tmp) <- c()
34-
rownames(tmp) <- c()
35-
return(tmp)
36-
}
3731

38-
#rownames(cts) <- strip_version(rownames(cts))
32+
if (strip_version == "true"){
33+
rownames(cts) <- lapply(rownames(cts), sub, pattern = "\\.\\d+$", replacement = "")
34+
}
3935

4036
cts <- cts[rownames(cts) %in% txdf$TXNAME, ] # FIXME: filter for transcripts which are in the annotation. Why they are not all there?
4137

bin/workflow_glue/de_plots.py

Lines changed: 28 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -173,8 +173,9 @@ def dexseq_section(dexseq_file, section, id_dic):
173173
section.markdown(dexseq_caption)
174174
dexseq_results = pd.read_csv(dexseq_file, sep='\t')
175175
dexseq_results.index.name = "gene_id:trancript_id"
176+
# Replace gene id with more useful gene name where possible
176177
dexseq_results.index = dexseq_results.index.map(
177-
lambda x: id_dic[x.split(':')[0]] + ':' + str(x.split(':')[1]))
178+
lambda x: str(id_dic.get(x.split(':')[0])) + ':' + str(x.split(':')[1]))
178179
dexseq_pvals = dexseq_results.sort_values(by='pvalue', ascending=True)
179180
section.table(dexseq_results.loc[dexseq_pvals.index], index=True)
180181
section.markdown("""
@@ -220,8 +221,10 @@ def dexseq_section(dexseq_file, section, id_dic):
220221
def dtu_section(dtu_file, section, gt_dic, ge_dic):
221222
"""Plot dtu section."""
222223
dtu_results = pd.read_csv(dtu_file, sep='\t')
223-
dtu_results["gene_name"] = dtu_results["txID"].apply(lambda x: gt_dic[x])
224-
dtu_results["geneID"] = dtu_results["geneID"].apply(lambda x: ge_dic[x])
224+
dtu_results["gene_name"] = dtu_results["txID"].apply(
225+
lambda x: gt_dic.get(x))
226+
dtu_results["geneID"] = dtu_results["geneID"].apply(
227+
lambda x: ge_dic.get(x))
225228
dtu_pvals = dtu_results.sort_values(by='gene', ascending=True)
226229
dtu_caption = '''Table showing gene and transcript identifiers
227230
and their FDR corrected probabilities
@@ -248,8 +251,8 @@ def dge_section(dge_file, section, ids_dic):
248251
and the false discovery corrected p-value (FDR). This table has not been
249252
filtered for genes that satisfy statistical or magnitudinal thresholds"""
250253
section.markdown(dge_caption)
251-
dge_results.index = dge_results.index.map(lambda x: ids_dic[x])
252-
dge_pvals.index = dge_pvals.index.map(lambda x: ids_dic[x])
254+
dge_results.index = dge_results.index.map(lambda x: ids_dic.get(x))
255+
dge_pvals.index = dge_pvals.index.map(lambda x: ids_dic.get(x))
253256
section.table(dge_results.loc[dge_pvals.index], index=True)
254257
dge = pd.read_csv(dge_file, sep="\t")
255258
section.markdown("""
@@ -317,19 +320,31 @@ def get_feature(row, feature):
317320
for i in fn:
318321
if i.startswith("#"):
319322
continue
320-
try:
321-
gene_name = get_feature(i, 'gene_name')
322-
except IndexError:
323+
# Different gtf/gff formats contain different attributes
324+
# and different formating (eg. gene_name="xyz" or gene_name "xyz")
325+
if 'gene_name' in i:
326+
gene_name = get_feature(i, "gene_name")
327+
elif 'gene_id' in i:
323328
gene_name = get_feature(i, 'gene_id')
324-
try:
329+
elif 'gene' in i:
330+
gene_name = get_feature(i, "gene")
331+
else:
332+
continue
333+
334+
if 'ref_gene_id' in i:
325335
gene_reference = get_feature(i, 'ref_gene_id')
326-
except IndexError:
336+
elif 'gene_id' in i:
327337
gene_reference = get_feature(i, 'gene_id')
328-
try:
338+
else:
339+
gene_reference = gene_name
340+
if 'transcript_id' in i:
329341
transcript_id = get_feature(i, 'transcript_id')
330-
except IndexError:
342+
else:
331343
transcript_id = "unknown"
332-
gene_id = get_feature(i, 'gene_id')
344+
if 'gene_id' in i:
345+
gene_id = get_feature(i, 'gene_id')
346+
else:
347+
gene_id = gene_name
333348
gene_txid[transcript_id] = gene_name
334349
gene_geid[gene_id] = gene_reference
335350
return gene_txid, gene_geid

docs/intro.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,5 +70,5 @@ Differential gene expression is sensitive to the input data quantity and quality
7070
- Directory containing cDNA/direct RNA reads. Or a directory containing subdirectories each with reads from different samples
7171
(in fastq/fastq.gz format)
7272
- Reference genome in fasta format (required for reference-based assembly).
73-
- Optional reference annotation in GFF2/3 format (extensions allowed are .gtf(.gz), .gff(.gz), .gff3(.gz)) (required for differential expression analysis `--de_analysis`).
73+
- Optional reference annotation in GFF2/3 format (extensions allowed are .gtf(.gz), .gff(.gz), .gff3(.gz)) (required for differential expression analysis `--de_analysis`). Only annotation files from [Encode](https://www.encodeproject.org), [Ensembl](https://www.ensembl.org/index.html) and [NCBI](https://www.ncbi.nlm.nih.gov/) are supported.
7474
- For fusion detection, JAFFAL reference files (see Quickstart)

nextflow.config

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ manifest {
115115
description = 'Transcriptome analysis including gene fusions, differential expression as well as assembly and annotation of cDNA and direct RNA sequencing data.'
116116
mainScript = 'main.nf'
117117
nextflowVersion = '>=22.10.8'
118-
version = 'v0.2.0'
118+
version = 'v0.2.1'
119119
}
120120

121121
executor {

nextflow_schema.json

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

subworkflows/differential_expression.nf

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ process mergeTPM {
5353
process deAnalysis {
5454
label "isoforms"
5555
errorStrategy "retry"
56-
maxRetries 1
56+
maxRetries 3
5757
input:
5858
path sample_sheet
5959
path merged_tsv
@@ -68,16 +68,28 @@ process deAnalysis {
6868
script:
6969
// Just try both annotation file type because a .gff extension may be gff2(gtf) or gff3
7070
String annotation_type = "gtf"
71+
String strip_version = "false"
7172
if (task.attempt == 2){
7273
annotation_type = "gff3"
74+
strip_version = "false"
75+
log.info("Retry deAnalysis with gff format setting.")
7376
}
77+
else if (task.attempt == 3){
78+
annotation_type = "gff3"
79+
strip_version = "true"
80+
log.info("Retry deAnalysis with gff format setting and version removal.")
81+
}
82+
else if (task.attempt == 4){
83+
strip_version = "true"
84+
log.info("Retry deAnalysis with gtf format setting and version removal.")
85+
}
86+
7487
"""
7588
mkdir merged
7689
mkdir de_analysis
7790
mv $merged_tsv merged/all_counts.tsv
7891
mv $sample_sheet de_analysis/coldata.tsv
79-
de_analysis.R annotation.gtf $params.min_samps_gene_expr $params.min_samps_feature_expr $params.min_gene_expr $params.min_feature_expr $annotation_type
80-
92+
de_analysis.R annotation.gtf $params.min_samps_gene_expr $params.min_samps_feature_expr $params.min_gene_expr $params.min_feature_expr $annotation_type $strip_version
8193
"""
8294
}
8395

0 commit comments

Comments
 (0)