feat fix_caseid and naming of drop output

genomic-medicine-sweden · Mar 1, 2024 · 35d34a3 · 35d34a3
1 parent 922d548
commit 35d34a3
Show file tree

Hide file tree

Showing 4 changed files with 86 additions and 62 deletions.
diff --git a/bin/drop_filter_results.py b/bin/drop_filter_results.py
@@ -5,12 +5,14 @@
 import pyreadr
 from pandas import DataFrame, concat, read_csv
 
-SCRIPT_VERSION = "v1.1"
+SCRIPT_VERSION = "v1.2"
 GENE_PANEL_HEADER = ["chromosome", "gene_start", "gene_stop", "hgnc_id", "hgnc_symbol"]
 GENE_PANEL_COLUMNS_TO_KEEP = ["hgnc_symbol", "hgnc_id"]
 
 
-def get_top_hits(sample_id: str, df_results_family_aberrant_expression: DataFrame) -> DataFrame:
+def get_top_hits(
+    sample_id: str, df_results_family_aberrant_expression: DataFrame
+) -> DataFrame:
     """
     Filter results to get only those with the id provided.
     If there are >= 20 hits it will output all of them.
@@ -25,7 +27,9 @@ def get_top_hits(sample_id: str, df_results_family_aberrant_expression: DataFram
     return df_id[:20]
 
 
-def annotate_with_drop_gene_name(df_family_results: DataFrame, out_drop_gene_name: str) -> DataFrame:
+def annotate_with_drop_gene_name(
+    df_family_results: DataFrame, out_drop_gene_name: str
+) -> DataFrame:
     """Annotate results from DROP with hgnc symbols."""
     df_genes: DataFrame = read_csv(out_drop_gene_name)
     common_columns = list(set(df_genes.columns) & set(df_family_results.columns))
@@ -36,27 +40,34 @@ def annotate_with_drop_gene_name(df_family_results: DataFrame, out_drop_gene_nam
             df_family_results, left_on="hgncSymbol", right_on="hgncSymbol"
         )
     else:
-        df_merged = df_genes.merge(df_family_results, left_on="geneID", right_on="geneID")
+        df_merged = df_genes.merge(
+            df_family_results, left_on="geneID", right_on="geneID"
+        )
     return df_merged
 
 
 def filter_by_gene_panel(
-    df_family_top_hits: DataFrame, gene_panel: str, module_name: str, case_id: str, output_file_subfix: str
+    df_family_top_hits: DataFrame, gene_panel: str, file_name_research: str
 ) -> DataFrame:
     """Filter out from results any gene that is not present in the provided gene panel."""
-    if case_id != "":
-        case_id += "_"
     if gene_panel != "None":
         df_panel: DataFrame = read_csv(
-            gene_panel, sep="\t", names=GENE_PANEL_HEADER, header=None, comment="#", index_col=False
+            gene_panel,
+            sep="\t",
+            names=GENE_PANEL_HEADER,
+            header=None,
+            comment="#",
+            index_col=False,
         )
-        df_family_top_hits = df_family_top_hits.loc[df_family_top_hits["hgncSymbol"].isin(df_panel["hgnc_symbol"])]
+        df_family_top_hits = df_family_top_hits.loc[
+            df_family_top_hits["hgncSymbol"].isin(df_panel["hgnc_symbol"])
+        ]
         df_clinical: DataFrame = df_panel[GENE_PANEL_COLUMNS_TO_KEEP].merge(
             df_family_top_hits, left_on="hgnc_symbol", right_on="hgncSymbol"
         )
         df_clinical = df_clinical.drop(columns=["hgnc_symbol"])
-        file_name = f"{case_id}{module_name}_{output_file_subfix}.tsv"
-        df_clinical.to_csv(file_name, sep="\t", index=False, header=True)
+        file_name_clinical = file_name_research.replace("research.tsv", "clinical.tsv")
+        df_clinical.to_csv(file_name_clinical, sep="\t", index=False, header=True)
 
 
 def filter_outrider_results(
@@ -65,7 +76,7 @@ def filter_outrider_results(
     out_drop_aberrant_expression_rds: str,
     out_drop_gene_name: str,
     case_id: str,
-    output_file_subfix_ae: str,
+    output_name_fragment_ae: str,
 ):
     """
     Filter results to get only those from the sample(s) provided.
@@ -79,26 +90,36 @@ def filter_outrider_results(
     rds_aberrant_expression = pyreadr.read_r(out_drop_aberrant_expression_rds)
     df_results_aberrante_expression: DataFrame = rds_aberrant_expression[None]
     # Keep only samples provided to tomte
-    df_results_family_aberrant_expression: DataFrame = df_results_aberrante_expression.loc[
-        df_results_aberrante_expression["sampleID"].isin(samples)
-    ]
+    df_results_family_aberrant_expression: DataFrame = (
+        df_results_aberrante_expression.loc[
+            df_results_aberrante_expression["sampleID"].isin(samples)
+        ]
+    )
     df_family_aberrant_expression_top_hits = DataFrame()
     for id in samples:
-        df_sample_aberrant_expression_top_hits = get_top_hits(id, df_results_family_aberrant_expression)
+        df_sample_aberrant_expression_top_hits = get_top_hits(
+            id, df_results_family_aberrant_expression
+        )
         df_family_aberrant_expression_top_hits = concat(
-            [df_family_aberrant_expression_top_hits, df_sample_aberrant_expression_top_hits],
+            [
+                df_family_aberrant_expression_top_hits,
+                df_sample_aberrant_expression_top_hits,
+            ],
             ignore_index=True,
             sort=False,
         )
-        df_family_aberrant_expression_top_hits = df_family_aberrant_expression_top_hits.drop(columns=["index"])
+        df_family_aberrant_expression_top_hits = (
+            df_family_aberrant_expression_top_hits.drop(columns=["index"])
+        )
     df_family_annotated_aberrant_expression_top_hits = annotate_with_drop_gene_name(
         df_family_aberrant_expression_top_hits, out_drop_gene_name
     )
+    file_name_research = f"{case_id}{output_name_fragment_ae}_research.tsv"
     df_family_annotated_aberrant_expression_top_hits.to_csv(
-        "OUTRIDER_provided_samples_top_hits.tsv", sep="\t", index=False, header=True
+        file_name_research, sep="\t", index=False, header=True
     )
     filter_by_gene_panel(
-        df_family_annotated_aberrant_expression_top_hits, gene_panel, "OUTRIDER", case_id, output_file_subfix_ae
+        df_family_annotated_aberrant_expression_top_hits, gene_panel, file_name_research
     )
 
 
@@ -108,26 +129,31 @@ def filter_fraser_result(
     out_drop_aberrant_splicing_tsv: str,
     out_drop_gene_name: str,
     case_id: str,
-    output_file_subfix_as: str,
+    output_name_fragment_as: str,
 ):
     """
     Filter results to get only those from the sample(s) provided.
     Two tsvs will be outputed:
         - One filtered to keep gense in the gene panel (if provided).
         - Another that is unfilterd.
     """
-    df_results_aberrant_splicing: DataFrame = read_csv(out_drop_aberrant_splicing_tsv, sep="\t")
+    df_results_aberrant_splicing: DataFrame = read_csv(
+        out_drop_aberrant_splicing_tsv, sep="\t"
+    )
     # Keep only samples provided to tomte
     df_results_family_aberrant_splicing: DataFrame = df_results_aberrant_splicing.loc[
         df_results_aberrant_splicing["sampleID"].isin(samples)
     ]
     df_results_family_aberrant_splicing = annotate_with_drop_gene_name(
         df_results_family_aberrant_splicing, out_drop_gene_name
     )
+    file_name_research = f"{case_id}{output_name_fragment_as}_research.tsv"
     df_results_family_aberrant_splicing.to_csv(
-        "FRASER_provided_samples_top_hits.tsv", sep="\t", index=False, header=True
+        file_name_research, sep="\t", index=False, header=True
+    )
+    filter_by_gene_panel(
+        df_results_family_aberrant_splicing, gene_panel, file_name_research
     )
-    filter_by_gene_panel(df_results_family_aberrant_splicing, gene_panel, "FRASER", case_id, output_file_subfix_as)
 
 
 def parse_args(argv=None):
@@ -172,10 +198,10 @@ def parse_args(argv=None):
         required=False,
     )
     parser.add_argument(
-        "--output_file_subfix_ae",
+        "--output_name_fragment_ae",
         type=str,
-        default="provided_samples_top_hits_filtered",
-        help="Subfix of Aberrant Expression output file",
+        default="OUTRIDER_provided_samples_top_hits",
+        help="Central fragment of Aberrant Expression output file, case will be added at the beginning and clinical/research at end",
         required=False,
     )
     parser.add_argument(
@@ -186,10 +212,10 @@ def parse_args(argv=None):
         required=False,
     )
     parser.add_argument(
-        "--output_file_subfix_as",
+        "--output_name_fragment_as",
         type=str,
-        default="provided_samples_top_hits_filtered",
-        help="Subfix of Aberrant Splicing output file",
+        default="FRASER_provided_samples_top_hits",
+        help="Central fragment of Aberrant Splicing output file, case will be added at the beginning and clinical/research at end",
         required=False,
     )
     parser.add_argument(
@@ -203,14 +229,16 @@ def parse_args(argv=None):
 def main():
     """Coordinate argument parsing and program execution."""
     args = parse_args()
+    if args.case_id != "":
+        args.case_id += "_"
     if args.drop_ae_rds != "None":
         filter_outrider_results(
             samples=args.samples,
             gene_panel=args.gene_panel,
             out_drop_aberrant_expression_rds=args.drop_ae_rds,
             out_drop_gene_name=args.out_drop_gene_name,
             case_id=args.case_id,
-            output_file_subfix_ae=args.output_file_subfix_ae,
+            output_name_fragment_ae=args.output_name_fragment_ae,
         )
     if args.out_drop_as_tsv != "None":
         filter_fraser_result(
@@ -219,7 +247,7 @@ def main():
             out_drop_aberrant_splicing_tsv=args.out_drop_as_tsv,
             out_drop_gene_name=args.out_drop_gene_name,
             case_id=args.case_id,
-            output_file_subfix_as=args.output_file_subfix_as,
+            output_name_fragment_as=args.output_name_fragment_as,
         )
 
 

diff --git a/modules/local/drop_filter_results.nf b/modules/local/drop_filter_results.nf
@@ -10,25 +10,24 @@ process DROP_FILTER_RESULTS {
     container "docker.io/clinicalgenomics/drop:1.3.3"
 
     input:
-    val(samples)
     val(case_info)
     path gene_panel_clinical_filter
     path out_drop_ae_rds_in
     path out_drop_gene_name_in
     path out_drop_as_tsv_in
 
     output:
-    path('OUTRIDER_provided_samples_top_hits.tsv')         , optional: true, emit: ae_out_unfiltered
-    path('OUTRIDER_provided_samples_top_hits_filtered.tsv'), optional: true, emit: ae_out_filtered
-    path('FRASER_provided_samples_top_hits.tsv')           , optional: true, emit: as_out_unfiltered
-    path('FRASER_provided_samples_top_hits_filtered.tsv')  , optional: true, emit: as_out_filtered
-    path "versions.yml"                                    , emit: versions
+    path('*OUTRIDER_provided_samples_top_hits_research.tsv') , optional: true, emit: ae_out_research
+    path('*OUTRIDER_provided_samples_top_hits_clinical.tsv') , optional: true, emit: ae_out_clinical
+    path('*FRASER_provided_samples_top_hits_research.tsv')   , optional: true, emit: as_out_research
+    path('*FRASER_provided_samples_top_hits_clinical.tsv')   , optional: true, emit: as_out_clinical
+    path "versions.yml"                                      , emit: versions
 
     when:
     task.ext.when == null || task.ext.when
 
     script:
-    def ids = "${samples.id}".replace("[","").replace("]","").replace(",","")
+    def ids = "${case_info.probands}".replace("[","").replace("]","").replace(",","")
     def case_id = "${case_info.id}".replace("[","").replace("]","").replace(",","")
     def gene_panel_filter = gene_panel_clinical_filter ? "--gene_panel ${gene_panel_clinical_filter}" : ''
     def drop_ae_rds = out_drop_ae_rds_in ? "--drop_ae_rds ${out_drop_ae_rds_in}" : ''
@@ -43,8 +42,6 @@ process DROP_FILTER_RESULTS {
         $out_drop_gene_name \\
         $out_drop_as_tsv \\
         --case $case_id \\
-        --output_file_subfix_as "provided_samples_top_hits_filtered" \\
-        --output_file_subfix_ae "provided_samples_top_hits_filtered"
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
@@ -54,10 +51,10 @@ process DROP_FILTER_RESULTS {
 
     stub:
     """
-    touch OUTRIDER_provided_samples_top_hits.tsv
-    touch OUTRIDER_provided_samples_top_hits_filtered.tsv
-    touch FRASER_provided_samples_top_hits.tsv
-    touch FRASER_provided_samples_top_hits_filtered.tsv
+    touch OUTRIDER_provided_samples_top_hits_research.tsv
+    touch OUTRIDER_provided_samples_top_hits_clinical.tsv
+    touch FRASER_provided_samples_top_hits_research.tsv
+    touch FRASER_provided_samples_top_hits_clinical.tsv
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":

diff --git a/subworkflows/local/analyse_transcripts.nf b/subworkflows/local/analyse_transcripts.nf
@@ -86,7 +86,6 @@ workflow ANALYSE_TRANSCRIPTS {
         ch_out_drop_gene_name    = params.switch_drop_ae ? ch_out_drop_gene_name_ae : ch_out_drop_gene_name_as
 
         DROP_FILTER_RESULTS(
-            star_samples,
             case_info,
             ch_gene_panel_clinical_filter,
             ch_out_drop_ae_rds.ifEmpty([]),
@@ -116,19 +115,19 @@ workflow ANALYSE_TRANSCRIPTS {
         ch_versions = ch_versions.mix(GFFCOMPARE.out.versions.first())
 
     emit:
-        transcript_gtf        = STRINGTIE_STRINGTIE.out.transcript_gtf     // channel: [ val(meta), [ path(transctript_gtf)] ]
-        abundance             = STRINGTIE_STRINGTIE.out.abundance          // channel: [ val(meta), [ path(abundance) ] ]
-        coverage_gtf          = STRINGTIE_STRINGTIE.out.coverage_gtf       // channel: [ val(meta), [ path(coverage_gtf) ] ]
-        annotated_gtf         = GFFCOMPARE.out.annotated_gtf               // channel: [ val(meta), [ path(annotated_gtf) ] ]
-        stats_gtf             = GFFCOMPARE.out.stats                       // channel: [ val(meta), [ path(stats) ] ]
-        annotation_drop       = DROP_SAMPLE_ANNOT.out.drop_annot           // channel: [ path(sample_annotation.tsv) ]
-        config_drop_ae        = DROP_CONFIG_RUN_AE.out.config_drop         // channel: [ path(confg_file.yml) ]
-        drop_ae_out           = DROP_CONFIG_RUN_AE.out.drop_ae_out         // channel: [ path(drop_output_AE) ]
-        config_drop_as        = DROP_CONFIG_RUN_AS.out.config_drop         // channel: [ path(confg_file.yml) ]
-        drop_as_out           = DROP_CONFIG_RUN_AS.out.drop_as_out         // channel: [ path(drop_output_AS) ]
-        drop_filter_ae_res    = DROP_FILTER_RESULTS.out.ae_out_filtered    // channel: [ path(drop_AE_filtered.tsv) ]
-        drop_unfilter_ae_res  = DROP_FILTER_RESULTS.out.ae_out_unfiltered  // channel: [ path(drop_AE_unfiltered.tsv) ]
-        drop_filter_as_res    = DROP_FILTER_RESULTS.out.as_out_filtered    // channel: [ path(drop_AS_filtered.tsv) ]
-        drop_unfilter_as_res  = DROP_FILTER_RESULTS.out.as_out_unfiltered  // channel: [ path(drop_AS_unfiltered.tsv) ]
-        versions              = ch_versions                                // channel: [ path(versions.yml) ]
+        transcript_gtf        = STRINGTIE_STRINGTIE.out.transcript_gtf   // channel: [ val(meta), [ path(transctript_gtf)] ]
+        abundance             = STRINGTIE_STRINGTIE.out.abundance        // channel: [ val(meta), [ path(abundance) ] ]
+        coverage_gtf          = STRINGTIE_STRINGTIE.out.coverage_gtf     // channel: [ val(meta), [ path(coverage_gtf) ] ]
+        annotated_gtf         = GFFCOMPARE.out.annotated_gtf             // channel: [ val(meta), [ path(annotated_gtf) ] ]
+        stats_gtf             = GFFCOMPARE.out.stats                     // channel: [ val(meta), [ path(stats) ] ]
+        annotation_drop       = DROP_SAMPLE_ANNOT.out.drop_annot         // channel: [ path(sample_annotation.tsv) ]
+        config_drop_ae        = DROP_CONFIG_RUN_AE.out.config_drop       // channel: [ path(confg_file.yml) ]
+        drop_ae_out           = DROP_CONFIG_RUN_AE.out.drop_ae_out       // channel: [ path(drop_output_AE) ]
+        config_drop_as        = DROP_CONFIG_RUN_AS.out.config_drop       // channel: [ path(confg_file.yml) ]
+        drop_as_out           = DROP_CONFIG_RUN_AS.out.drop_as_out       // channel: [ path(drop_output_AS) ]
+        drop_ae_out_clinical  = DROP_FILTER_RESULTS.out.ae_out_clinical  // channel: [ path(drop_AE_clinical.tsv) ]
+        drop_ae_out_research  = DROP_FILTER_RESULTS.out.ae_out_research  // channel: [ path(drop_AE_research.tsv) ]
+        drop_as_out_clinical  = DROP_FILTER_RESULTS.out.as_out_clinical  // channel: [ path(drop_AS_clinical.tsv) ]
+        drop_as_out_research  = DROP_FILTER_RESULTS.out.as_out_research  // channel: [ path(drop_AS_research.tsv) ]
+        versions              = ch_versions                              // channel: [ path(versions.yml) ]
 }
diff --git a/workflows/tomte.nf b/workflows/tomte.nf
@@ -245,7 +245,7 @@ def create_case_channel(List rows) {
         probands.add(item.sample)
     }
 
-    case_info.probands = probands
+    case_info.probands = probands.unique()
     case_info.id = rows[0].case
 
     return case_info