Skip to content

Commit 9d0c835

Browse files
committed
Merge branch 'CW-3468_incorrect_ids' into 'dev'
Resolve CW-3468 "Incorrect ids" Closes CW-3468 See merge request epi2melabs/workflows/wf-transcriptomes!157
2 parents 4f67105 + bb9acf5 commit 9d0c835

9 files changed

Lines changed: 79 additions & 128 deletions

File tree

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ repos:
88
always_run: true
99
pass_filenames: false
1010
additional_dependencies:
11-
- epi2melabs>=0.0.51
11+
- epi2melabs>=0.0.52
1212
- id: build_models
1313
name: build_models
1414
entry: datamodel-codegen --strict-nullable --base-class workflow_glue.results_schema_helpers.BaseModel --use-schema-description --disable-timestamp --input results_schema.yml --input-file-type openapi --output bin/workflow_glue/results_schema.py

CHANGELOG.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ All notable changes to this project will be documented in this file.
44
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
55
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
66

7-
## [unreleased]
7+
## [v1.1.0]
88
### Changed
99
- Improve documentation around filtering of transcripts done before DTU analysis.
1010
- Renamed files:

README.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,13 @@ input_reads.fastq ─── input_directory ─── input_directory
122122
| analyse_unclassified | boolean | Analyse unclassified reads from input directory. By default the workflow will not process reads in the unclassified directory. | If selected and if the input is a multiplex directory the workflow will also process the unclassified directory. | False |
123123

124124

125+
### Output Options
126+
127+
| Nextflow parameter name | Type | Description | Help | Default |
128+
|--------------------------|------|-------------|------|---------|
129+
| out_dir | string | Directory for output of all user-facing files. | | output |
130+
131+
125132
### Sample Options
126133

127134
| Nextflow parameter name | Type | Description | Help | Default |

bin/workflow_glue/report.py

Lines changed: 24 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -51,23 +51,17 @@ def argparser():
5151
"--gff_annotation", required=False, nargs='+',
5252
help="transcriptome annotation gff file")
5353
parser.add_argument(
54-
"--gffcompare_dir", required=False, default=None, nargs='*',
54+
"--gffcompare_dir", required=False, default=None,
5555
help="gffcompare outout dir")
5656
parser.add_argument(
5757
"--pychop_report", required=False, default=None,
5858
help="TSV summary file of pychopper statistics")
59-
parser.add_argument(
60-
"--sample_ids", required=True, nargs='+',
61-
help="List of sample ids")
6259
parser.add_argument(
6360
"--isoform_table", required=False, type=Path,
6461
help="Path to directory of TSV files with isoform summaries")
6562
parser.add_argument(
6663
"--isoform_table_nrows", required=False, type=int, default=5000,
6764
help="Maximum rows to display in isoforms table")
68-
parser.add_argument(
69-
"--cluster_qc_dirs", required=False, type=str, default=None, nargs='*',
70-
help="Directory with various cluster quality csvs")
7165
parser.add_argument(
7266
"--jaffal_csv", required=False, type=str, default=None,
7367
help="Path to JAFFAL results csv")
@@ -345,7 +339,7 @@ def grouped_bar(df, title="", tilted_xlabs=False):
345339
return p
346340

347341

348-
def gff_compare_plots(report, gffcompare_outdirs, sample_ids):
342+
def gff_compare_plots(report, gffcompare_outdirs):
349343
"""Create various sections and plots in a WfReport.
350344
351345
:param report: aplanat WFReport
@@ -383,7 +377,10 @@ def gff_compare_plots(report, gffcompare_outdirs, sample_ids):
383377

384378
tabs = []
385379
gff_fails = False
386-
for id_, dir_ in zip(sample_ids, gffcompare_outdirs):
380+
sample_ids = []
381+
for dir_ in gffcompare_outdirs:
382+
sample_id = dir_.name
383+
sample_ids.append(sample_id) # Get sample ids from the folder name
387384
stats, _, miss, novel, total = \
388385
parse_gffcmp_stats(dir_ / 'str_merged.stats')
389386

@@ -396,7 +393,7 @@ def gff_compare_plots(report, gffcompare_outdirs, sample_ids):
396393
tabs.append(Panel(
397394
child=gridplot(
398395
[bar_totals, bar_performance, bar_missed, bar_novel],
399-
ncols=2, width=350, height=260), title=id_))
396+
ncols=2, width=350, height=260), title=sample_id))
400397
else:
401398
gff_fails = True
402399

@@ -445,7 +442,7 @@ def gff_compare_plots(report, gffcompare_outdirs, sample_ids):
445442

446443
track_files = [x / 'str_merged.tracking' for x in gffcompare_outdirs]
447444

448-
df_tracking = load_sample_data(
445+
df_tracking = load_data_add_sample_id(
449446
track_files, sample_ids,
450447
read_func=lambda x: pd.read_csv(
451448
x, sep="\t", header=None,
@@ -531,7 +528,7 @@ def plot_isoforms_per_tpm_bin(
531528
sys.stderr("Cannot find .tmap files in {}".format(gffcompare_outdirs))
532529
return
533530

534-
df_tmap = load_sample_data(tmap_files, sample_ids)
531+
df_tmap = load_data_add_sample_id(tmap_files, sample_ids)
535532

536533
for id_, df in df_tmap.groupby('sample_id'):
537534

@@ -596,59 +593,6 @@ def pychopper_plots(report, pychop_report):
596593
section.plot(grid)
597594

598595

599-
def cluster_quality(cluster_qc_dir, report, sample_ids):
600-
"""Make cluster quality section."""
601-
section = report.add_section()
602-
section.markdown('''
603-
### De novo clustering quality
604-
605-
This section shows plots relating to the clustering quality performed
606-
by isONclust2. The full length reads are mapped to a reference genome
607-
to create a ground truth of reads mapped to clusters. This is then compared
608-
to the de novo-generated clusters, and the following statistics are
609-
generated.
610-
611-
* [Homogeneity](https://scikit-learn.org/stable/modules/generated/
612-
sklearn.metrics.homogeneity_score.html): Penalises over-clustering.
613-
614-
* [Completeness](https://scikit-learn.org/stable/modules/generated/
615-
sklearn.metrics.completeness_score.html): Penalises under-clustering.
616-
617-
* [V-measure](https://clusteringjl.readthedocs.io/en/latest/vmeasure.html):
618-
The harmonic mean of the homogeneity and completeness
619-
620-
* [Adjusted Rand Index](https://scikit-learn.org/stable/modules/generated/
621-
sklearn.metrics.adjusted_rand_score.html): Intuitively, measures the
622-
percentage of read pairs correctly clustered, normalized so that a perfect
623-
clustering = 1 and a random cluster assignment achieves = 0
624-
625-
* NonSingleton: Clusters with multiple reads
626-
* Singleton: Clusters consisting of a single read (These do not contribute
627-
to the final transcript calling - I need to check this!)
628-
629-
''')
630-
631-
tabs = []
632-
for id_, cluster_dir in zip(sample_ids, cluster_qc_dir):
633-
plots = []
634-
for fn in ['v_ari_com_hom.csv', 'sing_nonsing.csv']:
635-
# Skip the next two plots for now
636-
# 'class_sizes1.csv', 'class_sizes2.csv']:
637-
df = pd.read_csv(Path(cluster_dir) / fn)
638-
bar = bars.simple_bar(
639-
df.Statistic.values.tolist(), df.Value.values.tolist(),
640-
colors=Colors.cerulean
641-
)
642-
bar.xaxis.major_label_orientation = math.pi / 2.8
643-
plots.append(bar)
644-
tabs.append(Panel(
645-
child=gridplot(plots, ncols=4,
646-
width=300, height=300), title=id_))
647-
648-
cover_panel = Tabs(tabs=tabs)
649-
section.plot(cover_panel)
650-
651-
652596
def transcript_table(report, isoform_table, max_rows):
653597
"""Create searchable table of transcripts.
654598
@@ -685,7 +629,7 @@ def transcript_table(report, isoform_table, max_rows):
685629
section.table(df, index=False)
686630

687631

688-
def transcriptome_summary(report, gffs, sample_ids):
632+
def transcriptome_summary(report, gffs):
689633
"""
690634
Plot transcriptome summaries.
691635
@@ -694,7 +638,6 @@ def transcriptome_summary(report, gffs, sample_ids):
694638
695639
:param report: aplanat WFReport
696640
:param gffs: list of paths to gff transcriptome annotations
697-
:param sample_ids: list of sample ids
698641
"""
699642
# test.db gets written to the git repo.
700643
section = report.add_section()
@@ -703,7 +646,8 @@ def transcriptome_summary(report, gffs, sample_ids):
703646
''')
704647

705648
tabs = []
706-
for id_, gff in zip(sample_ids, gffs):
649+
for gff in gffs:
650+
sample_id = Path(gff).name
707651

708652
plots = []
709653

@@ -747,7 +691,7 @@ def transcriptome_summary(report, gffs, sample_ids):
747691
plots.append(bar_isos)
748692

749693
box = bars.boxplot_series(
750-
[id_] * len(transcript_lens), transcript_lens,
694+
[sample_id] * len(transcript_lens), transcript_lens,
751695
width=70, ylim=(min(transcript_lens), max(transcript_lens)),
752696
title='transcript lengths')
753697
plots.append(box)
@@ -780,14 +724,14 @@ def transcriptome_summary(report, gffs, sample_ids):
780724

781725
tabs.append(Panel(
782726
child=gridplot(plots, ncols=4,
783-
width=300, height=300), title=id_))
727+
width=300, height=300), title=sample_id))
784728

785729
cover_panel = Tabs(tabs=tabs)
786730
section.plot(cover_panel)
787731

788732

789-
def load_sample_data(files, sample_ids, read_func=None):
790-
"""Load CSVs into dataframe, and assign sample_id column."""
733+
def load_data_add_sample_id(files, sample_ids, read_func=None):
734+
"""Load CSVs, concatenate into a single dataframe, and assign sample_id column."""
791735
df_ = pd.DataFrame()
792736
if not files:
793737
return None
@@ -890,18 +834,18 @@ def de_section(report):
890834

891835
def main(args):
892836
"""Run the entry point."""
893-
sample_ids = args.sample_ids
894-
sample_ids.sort()
895-
896837
report = WFReport(
897838
"Transcript isoform report", "wf-transcriptomes",
898839
revision=args.revision, commit=args.commit)
899840

900-
# QC
901841
seq_stats_tabs(report, args.stats)
902842

903843
if args.alignment_stats is not None:
904-
df_aln_stats = load_sample_data(args.alignment_stats, sample_ids)
844+
stats_dfs = []
845+
for stats_file in args.alignment_stats:
846+
df = pd.read_csv(stats_file, sep='\t+')
847+
stats_dfs.append(df)
848+
aln_stats_df = pd.concat(stats_dfs)
905849
section = report.add_section()
906850
section.markdown('''
907851
### Read mapping summary
@@ -910,28 +854,23 @@ def main(args):
910854
[seqkit](https://bioinf.shenwei.me/seqkit/)
911855
`seqkit bam -s`''')
912856

913-
section.table(df_aln_stats)
857+
section.table(aln_stats_df)
914858

915859
if args.pychop_report is not None:
916860
pychopper_plots(report, args.pychop_report)
917861

918862
# Results
919863
if args.gff_annotation is not None:
920-
transcriptome_summary(
921-
report, args.gff_annotation, sample_ids)
864+
transcriptome_summary(report, args.gff_annotation)
922865

923866
if args.gffcompare_dir is not None:
924867
gff_compare_plots(
925868
report,
926-
[Path(x) for x in args.gffcompare_dir],
927-
sample_ids)
869+
[x for x in Path(args.gffcompare_dir).iterdir()])
928870

929871
if args.isoform_table is not None:
930872
transcript_table(report, args.isoform_table, args.isoform_table_nrows)
931873

932-
if args.cluster_qc_dirs is not None:
933-
cluster_quality(args.cluster_qc_dirs, report, sample_ids)
934-
935874
if args.de_report:
936875
de_section(report)
937876

docs/06_input_parameters.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,13 @@
1111
| analyse_unclassified | boolean | Analyse unclassified reads from input directory. By default the workflow will not process reads in the unclassified directory. | If selected and if the input is a multiplex directory the workflow will also process the unclassified directory. | False |
1212

1313

14+
### Output Options
15+
16+
| Nextflow parameter name | Type | Description | Help | Default |
17+
|--------------------------|------|-------------|------|---------|
18+
| out_dir | string | Directory for output of all user-facing files. | | output |
19+
20+
1421
### Sample Options
1522

1623
| Nextflow parameter name | Type | Description | Help | Default |

0 commit comments

Comments
 (0)