@@ -51,23 +51,17 @@ def argparser():
5151 "--gff_annotation" , required = False , nargs = '+' ,
5252 help = "transcriptome annotation gff file" )
5353 parser .add_argument (
54- "--gffcompare_dir" , required = False , default = None , nargs = '*' ,
54+ "--gffcompare_dir" , required = False , default = None ,
5555 help = "gffcompare outout dir" )
5656 parser .add_argument (
5757 "--pychop_report" , required = False , default = None ,
5858 help = "TSV summary file of pychopper statistics" )
59- parser .add_argument (
60- "--sample_ids" , required = True , nargs = '+' ,
61- help = "List of sample ids" )
6259 parser .add_argument (
6360 "--isoform_table" , required = False , type = Path ,
6461 help = "Path to directory of TSV files with isoform summaries" )
6562 parser .add_argument (
6663 "--isoform_table_nrows" , required = False , type = int , default = 5000 ,
6764 help = "Maximum rows to display in isoforms table" )
68- parser .add_argument (
69- "--cluster_qc_dirs" , required = False , type = str , default = None , nargs = '*' ,
70- help = "Directory with various cluster quality csvs" )
7165 parser .add_argument (
7266 "--jaffal_csv" , required = False , type = str , default = None ,
7367 help = "Path to JAFFAL results csv" )
@@ -345,7 +339,7 @@ def grouped_bar(df, title="", tilted_xlabs=False):
345339 return p
346340
347341
348- def gff_compare_plots (report , gffcompare_outdirs , sample_ids ):
342+ def gff_compare_plots (report , gffcompare_outdirs ):
349343 """Create various sections and plots in a WfReport.
350344
351345 :param report: aplanat WFReport
@@ -383,7 +377,10 @@ def gff_compare_plots(report, gffcompare_outdirs, sample_ids):
383377
384378 tabs = []
385379 gff_fails = False
386- for id_ , dir_ in zip (sample_ids , gffcompare_outdirs ):
380+ sample_ids = []
381+ for dir_ in gffcompare_outdirs :
382+ sample_id = dir_ .name
383+ sample_ids .append (sample_id ) # Get sample ids from the folder name
387384 stats , _ , miss , novel , total = \
388385 parse_gffcmp_stats (dir_ / 'str_merged.stats' )
389386
@@ -396,7 +393,7 @@ def gff_compare_plots(report, gffcompare_outdirs, sample_ids):
396393 tabs .append (Panel (
397394 child = gridplot (
398395 [bar_totals , bar_performance , bar_missed , bar_novel ],
399- ncols = 2 , width = 350 , height = 260 ), title = id_ ))
396+ ncols = 2 , width = 350 , height = 260 ), title = sample_id ))
400397 else :
401398 gff_fails = True
402399
@@ -445,7 +442,7 @@ def gff_compare_plots(report, gffcompare_outdirs, sample_ids):
445442
446443 track_files = [x / 'str_merged.tracking' for x in gffcompare_outdirs ]
447444
448- df_tracking = load_sample_data (
445+ df_tracking = load_data_add_sample_id (
449446 track_files , sample_ids ,
450447 read_func = lambda x : pd .read_csv (
451448 x , sep = "\t " , header = None ,
@@ -531,7 +528,7 @@ def plot_isoforms_per_tpm_bin(
531528 sys .stderr ("Cannot find .tmap files in {}" .format (gffcompare_outdirs ))
532529 return
533530
534- df_tmap = load_sample_data (tmap_files , sample_ids )
531+ df_tmap = load_data_add_sample_id (tmap_files , sample_ids )
535532
536533 for id_ , df in df_tmap .groupby ('sample_id' ):
537534
@@ -596,59 +593,6 @@ def pychopper_plots(report, pychop_report):
596593 section .plot (grid )
597594
598595
599- def cluster_quality (cluster_qc_dir , report , sample_ids ):
600- """Make cluster quality section."""
601- section = report .add_section ()
602- section .markdown ('''
603- ### De novo clustering quality
604-
605- This section shows plots relating to the clustering quality performed
606- by isONclust2. The full length reads are mapped to a reference genome
607- to create a ground truth of reads mapped to clusters. This is then compared
608- to the de novo-generated clusters, and the following statistics are
609- generated.
610-
611- * [Homogeneity](https://scikit-learn.org/stable/modules/generated/
612- sklearn.metrics.homogeneity_score.html): Penalises over-clustering.
613-
614- * [Completeness](https://scikit-learn.org/stable/modules/generated/
615- sklearn.metrics.completeness_score.html): Penalises under-clustering.
616-
617- * [V-measure](https://clusteringjl.readthedocs.io/en/latest/vmeasure.html):
618- The harmonic mean of the homogeneity and completeness
619-
620- * [Adjusted Rand Index](https://scikit-learn.org/stable/modules/generated/
621- sklearn.metrics.adjusted_rand_score.html): Intuitively, measures the
622- percentage of read pairs correctly clustered, normalized so that a perfect
623- clustering = 1 and a random cluster assignment achieves = 0
624-
625- * NonSingleton: Clusters with multiple reads
626- * Singleton: Clusters consisting of a single read (These do not contribute
627- to the final transcript calling - I need to check this!)
628-
629- ''' )
630-
631- tabs = []
632- for id_ , cluster_dir in zip (sample_ids , cluster_qc_dir ):
633- plots = []
634- for fn in ['v_ari_com_hom.csv' , 'sing_nonsing.csv' ]:
635- # Skip the next two plots for now
636- # 'class_sizes1.csv', 'class_sizes2.csv']:
637- df = pd .read_csv (Path (cluster_dir ) / fn )
638- bar = bars .simple_bar (
639- df .Statistic .values .tolist (), df .Value .values .tolist (),
640- colors = Colors .cerulean
641- )
642- bar .xaxis .major_label_orientation = math .pi / 2.8
643- plots .append (bar )
644- tabs .append (Panel (
645- child = gridplot (plots , ncols = 4 ,
646- width = 300 , height = 300 ), title = id_ ))
647-
648- cover_panel = Tabs (tabs = tabs )
649- section .plot (cover_panel )
650-
651-
652596def transcript_table (report , isoform_table , max_rows ):
653597 """Create searchable table of transcripts.
654598
@@ -685,7 +629,7 @@ def transcript_table(report, isoform_table, max_rows):
685629 section .table (df , index = False )
686630
687631
688- def transcriptome_summary (report , gffs , sample_ids ):
632+ def transcriptome_summary (report , gffs ):
689633 """
690634 Plot transcriptome summaries.
691635
@@ -694,7 +638,6 @@ def transcriptome_summary(report, gffs, sample_ids):
694638
695639 :param report: aplanat WFReport
696640 :param gffs: list of paths to gff transcriptome annotations
697- :param sample_ids: list of sample ids
698641 """
699642 # test.db gets written to the git repo.
700643 section = report .add_section ()
@@ -703,7 +646,8 @@ def transcriptome_summary(report, gffs, sample_ids):
703646 ''' )
704647
705648 tabs = []
706- for id_ , gff in zip (sample_ids , gffs ):
649+ for gff in gffs :
650+ sample_id = Path (gff ).name
707651
708652 plots = []
709653
@@ -747,7 +691,7 @@ def transcriptome_summary(report, gffs, sample_ids):
747691 plots .append (bar_isos )
748692
749693 box = bars .boxplot_series (
750- [id_ ] * len (transcript_lens ), transcript_lens ,
694+ [sample_id ] * len (transcript_lens ), transcript_lens ,
751695 width = 70 , ylim = (min (transcript_lens ), max (transcript_lens )),
752696 title = 'transcript lengths' )
753697 plots .append (box )
@@ -780,14 +724,14 @@ def transcriptome_summary(report, gffs, sample_ids):
780724
781725 tabs .append (Panel (
782726 child = gridplot (plots , ncols = 4 ,
783- width = 300 , height = 300 ), title = id_ ))
727+ width = 300 , height = 300 ), title = sample_id ))
784728
785729 cover_panel = Tabs (tabs = tabs )
786730 section .plot (cover_panel )
787731
788732
789- def load_sample_data (files , sample_ids , read_func = None ):
790- """Load CSVs into dataframe, and assign sample_id column."""
733+ def load_data_add_sample_id (files , sample_ids , read_func = None ):
734+ """Load CSVs, concatenate them into a single dataframe, and assign a sample_id column."""
791735 df_ = pd .DataFrame ()
792736 if not files :
793737 return None
@@ -890,18 +834,18 @@ def de_section(report):
890834
891835def main (args ):
892836 """Run the entry point."""
893- sample_ids = args .sample_ids
894- sample_ids .sort ()
895-
896837 report = WFReport (
897838 "Transcript isoform report" , "wf-transcriptomes" ,
898839 revision = args .revision , commit = args .commit )
899840
900- # QC
901841 seq_stats_tabs (report , args .stats )
902842
903843 if args .alignment_stats is not None :
904- df_aln_stats = load_sample_data (args .alignment_stats , sample_ids )
844+ stats_dfs = []
845+ for stats_file in args .alignment_stats :
846+ df = pd .read_csv (stats_file , sep = '\t +' )
847+ stats_dfs .append (df )
848+ aln_stats_df = pd .concat (stats_dfs )
905849 section = report .add_section ()
906850 section .markdown ('''
907851 ### Read mapping summary
@@ -910,28 +854,23 @@ def main(args):
910854 [seqkit](https://bioinf.shenwei.me/seqkit/)
911855 `seqkit bam -s`''' )
912856
913- section .table (df_aln_stats )
857+ section .table (aln_stats_df )
914858
915859 if args .pychop_report is not None :
916860 pychopper_plots (report , args .pychop_report )
917861
918862 # Results
919863 if args .gff_annotation is not None :
920- transcriptome_summary (
921- report , args .gff_annotation , sample_ids )
864+ transcriptome_summary (report , args .gff_annotation )
922865
923866 if args .gffcompare_dir is not None :
924867 gff_compare_plots (
925868 report ,
926- [Path (x ) for x in args .gffcompare_dir ],
927- sample_ids )
869+ [x for x in Path (args .gffcompare_dir ).iterdir ()])
928870
929871 if args .isoform_table is not None :
930872 transcript_table (report , args .isoform_table , args .isoform_table_nrows )
931873
932- if args .cluster_qc_dirs is not None :
933- cluster_quality (args .cluster_qc_dirs , report , sample_ids )
934-
935874 if args .de_report :
936875 de_section (report )
937876
0 commit comments