fix and test for files with no gene id in attributes

sarahjeeeze · sarahjeeeze · commit ec3c346fd8ea · 2023-07-26T18:14:00.000Z
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
@@ -45,7 +45,7 @@ docker-run:
             - MATRIX_NAME: [
                 "fusions", "differential_expression", "isoforms",
                 "only_differential_expression", "differential_expression_gff3",
-                "ncbi_gzip", "denovo"
+                "ncbi_gzip", "denovo", "ncbi_no_gene_id", "ensembl_with_versions"
             ]
     rules:
         # NOTE As we're overriding the rules block for the included docker-run
@@ -109,15 +109,41 @@ docker-run:
                 build_minimap_index,get_transcriptome,merge_gff_bundles,run_gffcompare,build_minimap_index,split_bam
         - if: $MATRIX_NAME == "ncbi_gzip"
           variables:
-              NF_BEFORE_SCRIPT: wget -O differential_expression.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-isoforms/differential_expression.tar.gz && tar -xzvf differential_expression.tar.gz
+              NF_BEFORE_SCRIPT: wget -O differential_expression_ncbi.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-isoforms/differential_expression_ncbi.tar.gz && tar -xzvf differential_expression_ncbi.tar.gz
               NF_WORKFLOW_OPTS: "-executor.\\$$local.memory 16GB \
-                  --fastq  differential_expression/differential_expression_fastq \
+                  --fastq  differential_expression_ncbi/differential_expression_fastq \
                   --transcriptome-source precomputed \
                   --de_analysis \
-                  --ref_genome differential_expression/GRCh38.p14.NCBI_test.fna.gz \
-                --ref_annotation differential_expression/GRCh38.p14_NCBI_test.gtf.gz \
+                  --ref_genome differential_expression_ncbi/GRCh38.p14.NCBI_test.fna.gz \
+                --ref_annotation differential_expression_ncbi/GRCh38.p14_NCBI_test.gtf.gz \
                 --direct_rna --minimap_index_opts '-w 25' \
                --transcriptome_assembly false --sample_sheet test_data/sample_sheet.csv"
               NF_IGNORE_PROCESSES: >
                 preprocess_reads,merge_transcriptomes,assemble_transcripts,
                 build_minimap_index,get_transcriptome,merge_gff_bundles,run_gffcompare,build_minimap_index,split_bam           
+        - if: $MATRIX_NAME == "ncbi_no_gene_id"
+          variables:
+              NF_BEFORE_SCRIPT: wget -O differential_expression_ncbi.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-isoforms/differential_expression_ncbi.tar.gz && tar -xzvf differential_expression_ncbi.tar.gz
+              NF_WORKFLOW_OPTS: "-executor.\\$$local.memory 16GB \
+                --fastq differential_expression_ncbi/differential_expression_fastq \
+                --transcriptome-source precomputed --de_analysis \
+                --ref_genome differential_expression_ncbi/GCF_000001405.40_GRCh38.p14_genomic.fna.gz \
+                --ref_annotation differential_expression_ncbi/GCF_000001405.40_GRCh38.p14_genomic.gff.gz \
+                --direct_rna --ref_transcriptome differential_expression_ncbi/GCF_000001405.40_GRCh38.p14_rna.fna.gz \
+                --transcriptome_assembly false --sample_sheet test_data/sample_sheet.csv"
+              NF_IGNORE_PROCESSES: >
+                preprocess_reads,merge_transcriptomes,assemble_transcripts,
+                build_minimap_index,get_transcriptome,merge_gff_bundles,run_gffcompare,build_minimap_index,split_bam           
+        - if: $MATRIX_NAME == "ensembl_with_versions"
+          variables:
+              NF_BEFORE_SCRIPT: wget -O differential_expression.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-isoforms/differential_expression.tar.gz && tar -xzvf differential_expression.tar.gz
+              NF_WORKFLOW_OPTS: "-executor.\\$$local.memory 16GB \
+                --fastq differential_expression/differential_expression_fastq \
+                --transcriptome-source precomputed --de_analysis \
+                --ref_genome differential_expression/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz \
+                --ref_annotation differential_expression/Homo_sapiens.GRCh38.109.gtf.gz \
+                --direct_rna --ref_transcriptome differential_expression/Homo_sapiens.GRCh38.cdna.all.fa.gz \
+                --transcriptome_assembly false --sample_sheet test_data/sample_sheet.csv"
+              NF_IGNORE_PROCESSES: >
+                preprocess_reads,merge_transcriptomes,assemble_transcripts,
+                build_minimap_index,get_transcriptome,merge_gff_bundles,run_gffcompare,build_minimap_index,split_bam           
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,12 +4,15 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## [unreleased]
+## [v0.2.1]
 ### Changed
 - Any sample aliases that contain spaces will be replaced with underscores.
+- Updated documentation to explain we only support Ensembl, NCBI and ENCODE annotation file types. 
 
 ### Fixed
 - Documentation parameter examples corrected.
+- Handling for annotation files that use gene as gene_id attribute.
+- Handling for Ensembl annotation files.
 
 ## [v0.2.0]
 ### Changed
diff --git a/README.md b/README.md
@@ -82,7 +82,7 @@ Differential gene expression is sensitive to the input data quantity and quality
 - Directory containing cDNA/direct RNA reads. Or a directory containing subdirectories each with reads from different samples
   (in fastq/fastq.gz format)
 - Reference genome in fasta format (required for reference-based assembly).
-- Optional reference annotation in GFF2/3 format (extensions allowed are .gtf(.gz), .gff(.gz), .gff3(.gz)) (required for differential expression analysis `--de_analysis`).
+- Optional reference annotation in GFF2/3 format (extensions allowed are .gtf(.gz), .gff(.gz), .gff3(.gz)) (required for differential expression analysis `--de_analysis`). Only annotation files from [ENCODE](https://www.encodeproject.org), [Ensembl](https://www.ensembl.org/index.html) and [NCBI](https://www.ncbi.nlm.nih.gov/) are supported.
 - For fusion detection, JAFFAL reference files (see Quickstart) 
 
 
diff --git a/bin/de_analysis.R b/bin/de_analysis.R
@@ -9,6 +9,7 @@ min_samps_feature_expr <- args[3]
 min_gene_expr <- args[4] 
 min_feature_expr <- args[5]
 annotation_type <- args[6]
+strip_version <- args[7]
 
 cat("Loading counts, conditions and parameters.\n")
 cts <- as.matrix(read.csv("merged/all_counts.tsv", sep="\t", row.names="Reference", stringsAsFactors=FALSE))
@@ -27,15 +28,10 @@ txdf <- select(txdb, keys(txdb,"GENEID"), "TXNAME", "GENEID")
 tab <- table(txdf$GENEID)
 txdf$ntx<- tab[match(txdf$GENEID, names(tab))]
 
-strip_version<-function(x) {
-    tmp<-data.frame(strsplit(x,".", fixed=TRUE), stringsAsFactors=FALSE)
-    tmp<-as.vector(tmp[1,])
-    colnames(tmp) <- c()
-    rownames(tmp) <- c()
-    return(tmp)
-}
 
-#rownames(cts) <- strip_version(rownames(cts))
+if (strip_version == "true"){
+  rownames(cts) <- lapply(rownames(cts),  sub, pattern = "\\.\\d+$", replacement = "")
+}
 
 cts <- cts[rownames(cts) %in% txdf$TXNAME, ] # FIXME: filter for transcripts which are in the annotation. Why they are not all there? 
 
diff --git a/bin/workflow_glue/de_plots.py b/bin/workflow_glue/de_plots.py
@@ -173,8 +173,9 @@ def dexseq_section(dexseq_file, section, id_dic):
     section.markdown(dexseq_caption)
     dexseq_results = pd.read_csv(dexseq_file, sep='\t')
     dexseq_results.index.name = "gene_id:trancript_id"
+    # Replace gene id with more useful gene name where possible
     dexseq_results.index = dexseq_results.index.map(
-        lambda x: id_dic[x.split(':')[0]] + ':' + str(x.split(':')[1]))
+        lambda x: str(id_dic.get(x.split(':')[0])) + ':' + str(x.split(':')[1]))
     dexseq_pvals = dexseq_results.sort_values(by='pvalue', ascending=True)
     section.table(dexseq_results.loc[dexseq_pvals.index], index=True)
     section.markdown("""
@@ -220,8 +221,10 @@ def dexseq_section(dexseq_file, section, id_dic):
 def dtu_section(dtu_file, section, gt_dic, ge_dic):
     """Plot dtu section."""
     dtu_results = pd.read_csv(dtu_file, sep='\t')
-    dtu_results["gene_name"] = dtu_results["txID"].apply(lambda x: gt_dic[x])
-    dtu_results["geneID"] = dtu_results["geneID"].apply(lambda x: ge_dic[x])
+    dtu_results["gene_name"] = dtu_results["txID"].apply(
+        lambda x: gt_dic.get(x))
+    dtu_results["geneID"] = dtu_results["geneID"].apply(
+        lambda x: ge_dic.get(x))
     dtu_pvals = dtu_results.sort_values(by='gene', ascending=True)
     dtu_caption = '''Table showing gene and transcript identifiers
     and their FDR corrected probabilities
@@ -248,8 +251,8 @@ def dge_section(dge_file, section, ids_dic):
 and the false discovery corrected p-value (FDR). This table has not been
 filtered for genes that satisfy statistical or magnitudinal thresholds"""
     section.markdown(dge_caption)
-    dge_results.index = dge_results.index.map(lambda x: ids_dic[x])
-    dge_pvals.index = dge_pvals.index.map(lambda x: ids_dic[x])
+    dge_results.index = dge_results.index.map(lambda x: ids_dic.get(x))
+    dge_pvals.index = dge_pvals.index.map(lambda x: ids_dic.get(x))
     section.table(dge_results.loc[dge_pvals.index], index=True)
     dge = pd.read_csv(dge_file, sep="\t")
     section.markdown("""
@@ -317,19 +320,31 @@ def get_feature(row, feature):
     for i in fn:
         if i.startswith("#"):
             continue
-        try:
-            gene_name = get_feature(i, 'gene_name')
-        except IndexError:
+        # Different gtf/gff formats contain different attributes
+        # and different formatting (e.g. gene_name="xyz" or gene_name "xyz")
+        if 'gene_name' in i:
+            gene_name = get_feature(i, "gene_name")
+        elif 'gene_id' in i:
             gene_name = get_feature(i, 'gene_id')
-        try:
+        elif 'gene' in i:
+            gene_name = get_feature(i, "gene")
+        else:
+            continue
+
+        if 'ref_gene_id' in i:
             gene_reference = get_feature(i, 'ref_gene_id')
-        except IndexError:
+        elif 'gene_id' in i:
             gene_reference = get_feature(i, 'gene_id')
-        try:
+        else:
+            gene_reference = gene_name
+        if 'transcript_id' in i:
             transcript_id = get_feature(i, 'transcript_id')
-        except IndexError:
+        else:
             transcript_id = "unknown"
-        gene_id = get_feature(i, 'gene_id')
+        if 'gene_id' in i:
+            gene_id = get_feature(i, 'gene_id')
+        else:
+            gene_id = gene_name
         gene_txid[transcript_id] = gene_name
         gene_geid[gene_id] = gene_reference
     return gene_txid, gene_geid
diff --git a/docs/intro.md b/docs/intro.md
@@ -70,5 +70,5 @@ Differential gene expression is sensitive to the input data quantity and quality
 - Directory containing cDNA/direct RNA reads. Or a directory containing subdirectories each with reads from different samples
   (in fastq/fastq.gz format)
 - Reference genome in fasta format (required for reference-based assembly).
-- Optional reference annotation in GFF2/3 format (extensions allowed are .gtf(.gz), .gff(.gz), .gff3(.gz)) (required for differential expression analysis `--de_analysis`).
+- Optional reference annotation in GFF2/3 format (extensions allowed are .gtf(.gz), .gff(.gz), .gff3(.gz)) (required for differential expression analysis `--de_analysis`). Only annotation files from [ENCODE](https://www.encodeproject.org), [Ensembl](https://www.ensembl.org/index.html) and [NCBI](https://www.ncbi.nlm.nih.gov/) are supported.
 - For fusion detection, JAFFAL reference files (see Quickstart) 
diff --git a/nextflow.config b/nextflow.config
@@ -115,7 +115,7 @@ manifest {
     description     = 'Transcriptome analysis including gene fusions, differential expression as well as assembly and annotation of cDNA and direct RNA sequencing data.'
     mainScript      = 'main.nf'
     nextflowVersion = '>=22.10.8'
-    version         = 'v0.2.0'
+    version         = 'v0.2.1'
 }
 
 executor {
diff --git a/nextflow_schema.json b/nextflow_schema.json
diff --git a/subworkflows/differential_expression.nf b/subworkflows/differential_expression.nf
@@ -53,7 +53,7 @@ process mergeTPM {
 process deAnalysis {
     label "isoforms"
     errorStrategy "retry"
-    maxRetries 1
+    maxRetries 3
     input:
         path sample_sheet
         path merged_tsv 
@@ -68,16 +68,28 @@ process deAnalysis {
     script:
     // Just try both annotation file type because a .gff extension may be gff2(gtf) or gff3
     String annotation_type = "gtf"
+    String strip_version  = "false"
     if (task.attempt == 2){
         annotation_type = "gff3"
+        strip_version  = "false"
+        log.info("Retry deAnalysis with gff format setting.")
     }
+    else if (task.attempt == 3){
+        annotation_type = "gff3"
+        strip_version  = "true"
+        log.info("Retry deAnalysis with gff format setting and version removal.")
+    }
+    else if (task.attempt == 4){
+        strip_version  = "true"
+        log.info("Retry deAnalysis with gtf format setting and version removal.")
+    }
+
     """
     mkdir merged
     mkdir de_analysis
     mv $merged_tsv merged/all_counts.tsv
     mv $sample_sheet de_analysis/coldata.tsv
-    de_analysis.R annotation.gtf $params.min_samps_gene_expr $params.min_samps_feature_expr $params.min_gene_expr $params.min_feature_expr $annotation_type
-   
+    de_analysis.R annotation.gtf $params.min_samps_gene_expr $params.min_samps_feature_expr $params.min_gene_expr $params.min_feature_expr $annotation_type $strip_version
     """
 }
 

Original file line number	Diff line number	Diff line change
`@@ -115,7 +115,7 @@ manifest {`
`115`	`115`	`description = 'Transcriptome analysis including gene fusions, differential expression as well as assembly and annotation of cDNA and direct RNA sequencing data.'`
`116`	`116`	`mainScript = 'main.nf'`
`117`	`117`	`nextflowVersion = '>=22.10.8'`
`118`		`- version = 'v0.2.0'`
	`118`	`+ version = 'v0.2.1'`
`119`	`119`	`}`
`120`	`120`
`121`	`121`	`executor {`