From 35d34a3fa661bc501e0ce6a7c129b7b0c092d3d9 Mon Sep 17 00:00:00 2001
From: "lucia.pena.perez@scilifelab.se"
Date: Fri, 1 Mar 2024 09:20:31 +0100
Subject: [PATCH 1/2] feat fix_caseid and naming of drop output

---
 bin/drop_filter_results.py                | 92 +++++++++++++++--------
 modules/local/drop_filter_results.nf      | 23 +++---
 subworkflows/local/analyse_transcripts.nf | 31 ++++----
 workflows/tomte.nf                        |  2 +-
 4 files changed, 86 insertions(+), 62 deletions(-)

diff --git a/bin/drop_filter_results.py b/bin/drop_filter_results.py
index 75290d6a..d6ffbd45 100755
--- a/bin/drop_filter_results.py
+++ b/bin/drop_filter_results.py
@@ -5,12 +5,14 @@
 import pyreadr
 from pandas import DataFrame, concat, read_csv

-SCRIPT_VERSION = "v1.1"
+SCRIPT_VERSION = "v1.2"
 GENE_PANEL_HEADER = ["chromosome", "gene_start", "gene_stop", "hgnc_id", "hgnc_symbol"]
 GENE_PANEL_COLUMNS_TO_KEEP = ["hgnc_symbol", "hgnc_id"]


-def get_top_hits(sample_id: str, df_results_family_aberrant_expression: DataFrame) -> DataFrame:
+def get_top_hits(
+    sample_id: str, df_results_family_aberrant_expression: DataFrame
+) -> DataFrame:
     """
     Filter results to get only those with the id provided.
     If there are >= 20 hits it will output all of them.
@@ -25,7 +27,9 @@
     return df_id[:20]


-def annotate_with_drop_gene_name(df_family_results: DataFrame, out_drop_gene_name: str) -> DataFrame:
+def annotate_with_drop_gene_name(
+    df_family_results: DataFrame, out_drop_gene_name: str
+) -> DataFrame:
     """Annotate results from DROP with hgnc symbols."""
     df_genes: DataFrame = read_csv(out_drop_gene_name)
     common_columns = list(set(df_genes.columns) & set(df_family_results.columns))
@@ -36,27 +40,34 @@
             df_family_results, left_on="hgncSymbol", right_on="hgncSymbol"
         )
     else:
-        df_merged = df_genes.merge(df_family_results, left_on="geneID", right_on="geneID")
+        df_merged = df_genes.merge(
+            df_family_results, left_on="geneID", right_on="geneID"
+        )
     return df_merged


 def filter_by_gene_panel(
-    df_family_top_hits: DataFrame, gene_panel: str, module_name: str, case_id: str, output_file_subfix: str
+    df_family_top_hits: DataFrame, gene_panel: str, file_name_research: str
 ) -> DataFrame:
     """Filter out from results any gene that is not present in the provided gene panel."""
-    if case_id != "":
-        case_id += "_"
     if gene_panel != "None":
         df_panel: DataFrame = read_csv(
-            gene_panel, sep="\t", names=GENE_PANEL_HEADER, header=None, comment="#", index_col=False
+            gene_panel,
+            sep="\t",
+            names=GENE_PANEL_HEADER,
+            header=None,
+            comment="#",
+            index_col=False,
         )
-        df_family_top_hits = df_family_top_hits.loc[df_family_top_hits["hgncSymbol"].isin(df_panel["hgnc_symbol"])]
+        df_family_top_hits = df_family_top_hits.loc[
+            df_family_top_hits["hgncSymbol"].isin(df_panel["hgnc_symbol"])
+        ]
         df_clinical: DataFrame = df_panel[GENE_PANEL_COLUMNS_TO_KEEP].merge(
             df_family_top_hits, left_on="hgnc_symbol", right_on="hgncSymbol"
         )
         df_clinical = df_clinical.drop(columns=["hgnc_symbol"])
-        file_name = f"{case_id}{module_name}_{output_file_subfix}.tsv"
-        df_clinical.to_csv(file_name, sep="\t", index=False, header=True)
+        file_name_clinical = file_name_research.replace("research.tsv", "clinical.tsv")
+        df_clinical.to_csv(file_name_clinical, sep="\t", index=False, header=True)


 def filter_outrider_results(
@@ -65,7 +76,7 @@
     out_drop_aberrant_expression_rds: str,
     out_drop_gene_name: str,
     case_id: str,
-    output_file_subfix_ae: str,
+    output_name_fragment_ae: str,
 ):
     """
     Filter results to get only those from the sample(s) provided.
@@ -79,26 +90,36 @@
     rds_aberrant_expression = pyreadr.read_r(out_drop_aberrant_expression_rds)
     df_results_aberrante_expression: DataFrame = rds_aberrant_expression[None]
     # Keep only samples provided to tomte
-    df_results_family_aberrant_expression: DataFrame = df_results_aberrante_expression.loc[
-        df_results_aberrante_expression["sampleID"].isin(samples)
-    ]
+    df_results_family_aberrant_expression: DataFrame = (
+        df_results_aberrante_expression.loc[
+            df_results_aberrante_expression["sampleID"].isin(samples)
+        ]
+    )
     df_family_aberrant_expression_top_hits = DataFrame()
     for id in samples:
-        df_sample_aberrant_expression_top_hits = get_top_hits(id, df_results_family_aberrant_expression)
+        df_sample_aberrant_expression_top_hits = get_top_hits(
+            id, df_results_family_aberrant_expression
+        )
         df_family_aberrant_expression_top_hits = concat(
-            [df_family_aberrant_expression_top_hits, df_sample_aberrant_expression_top_hits],
+            [
+                df_family_aberrant_expression_top_hits,
+                df_sample_aberrant_expression_top_hits,
+            ],
             ignore_index=True,
             sort=False,
         )
-    df_family_aberrant_expression_top_hits = df_family_aberrant_expression_top_hits.drop(columns=["index"])
+    df_family_aberrant_expression_top_hits = (
+        df_family_aberrant_expression_top_hits.drop(columns=["index"])
+    )
     df_family_annotated_aberrant_expression_top_hits = annotate_with_drop_gene_name(
         df_family_aberrant_expression_top_hits, out_drop_gene_name
     )
+    file_name_research = f"{case_id}{output_name_fragment_ae}_research.tsv"
     df_family_annotated_aberrant_expression_top_hits.to_csv(
-        "OUTRIDER_provided_samples_top_hits.tsv", sep="\t", index=False, header=True
+        file_name_research, sep="\t", index=False, header=True
     )
     filter_by_gene_panel(
-        df_family_annotated_aberrant_expression_top_hits, gene_panel, "OUTRIDER", case_id, output_file_subfix_ae
+        df_family_annotated_aberrant_expression_top_hits, gene_panel, file_name_research
     )


@@ -108,7 +129,7 @@
     out_drop_aberrant_splicing_tsv: str,
     out_drop_gene_name: str,
     case_id: str,
-    output_file_subfix_as: str,
+    output_name_fragment_as: str,
 ):
     """
     Filter results to get only those from the sample(s) provided.
@@ -116,7 +137,9 @@
     - One filtered to keep gense in the gene panel (if provided).
    - Another that is unfilterd.
     """
-    df_results_aberrant_splicing: DataFrame = read_csv(out_drop_aberrant_splicing_tsv, sep="\t")
+    df_results_aberrant_splicing: DataFrame = read_csv(
+        out_drop_aberrant_splicing_tsv, sep="\t"
+    )
     # Keep only samples provided to tomte
     df_results_family_aberrant_splicing: DataFrame = df_results_aberrant_splicing.loc[
         df_results_aberrant_splicing["sampleID"].isin(samples)
@@ -124,10 +147,13 @@
     df_results_family_aberrant_splicing = annotate_with_drop_gene_name(
         df_results_family_aberrant_splicing, out_drop_gene_name
     )
+    file_name_research = f"{case_id}{output_name_fragment_as}_research.tsv"
     df_results_family_aberrant_splicing.to_csv(
-        "FRASER_provided_samples_top_hits.tsv", sep="\t", index=False, header=True
+        file_name_research, sep="\t", index=False, header=True
+    )
+    filter_by_gene_panel(
+        df_results_family_aberrant_splicing, gene_panel, file_name_research
     )
-    filter_by_gene_panel(df_results_family_aberrant_splicing, gene_panel, "FRASER", case_id, output_file_subfix_as)


 def parse_args(argv=None):
@@ -172,10 +198,10 @@
         required=False,
     )
     parser.add_argument(
-        "--output_file_subfix_ae",
+        "--output_name_fragment_ae",
         type=str,
-        default="provided_samples_top_hits_filtered",
-        help="Subfix of Aberrant Expression output file",
+        default="OUTRIDER_provided_samples_top_hits",
+        help="Central fragment of Aberrant Expression output file, case will be added at the beginning and clinical/research at end",
         required=False,
     )
     parser.add_argument(
@@ -186,10 +212,10 @@
         required=False,
    )
     parser.add_argument(
-        "--output_file_subfix_as",
+        "--output_name_fragment_as",
         type=str,
-        default="provided_samples_top_hits_filtered",
-        help="Subfix of Aberrant Splicing output file",
+        default="FRASER_provided_samples_top_hits",
+        help="Central fragment of Aberrant Splicing output file, case will be added at the beginning and clinical/research at end",
         required=False,
     )
     parser.add_argument(
@@ -203,6 +229,8 @@
 def main():
     """Coordinate argument parsing and program execution."""
     args = parse_args()
+    if args.case_id != "":
+        args.case_id += "_"
     if args.drop_ae_rds != "None":
         filter_outrider_results(
             samples=args.samples,
@@ -210,7 +238,7 @@
             out_drop_aberrant_expression_rds=args.drop_ae_rds,
             out_drop_gene_name=args.out_drop_gene_name,
             case_id=args.case_id,
-            output_file_subfix_ae=args.output_file_subfix_ae,
+            output_name_fragment_ae=args.output_name_fragment_ae,
         )
     if args.out_drop_as_tsv != "None":
         filter_fraser_result(
@@ -219,7 +247,7 @@
             out_drop_aberrant_splicing_tsv=args.out_drop_as_tsv,
             out_drop_gene_name=args.out_drop_gene_name,
             case_id=args.case_id,
-            output_file_subfix_as=args.output_file_subfix_as,
+            output_name_fragment_as=args.output_name_fragment_as,
         )


diff --git a/modules/local/drop_filter_results.nf b/modules/local/drop_filter_results.nf
index deeedda0..a5a99d36 100644
--- a/modules/local/drop_filter_results.nf
+++ b/modules/local/drop_filter_results.nf
@@ -10,7 +10,6 @@ process DROP_FILTER_RESULTS {
     container "docker.io/clinicalgenomics/drop:1.3.3"

     input:
-    val(samples)
     val(case_info)
     path gene_panel_clinical_filter
     path out_drop_ae_rds_in
@@ -18,17 +17,17 @@
     path out_drop_gene_name_in
     path out_drop_as_tsv_in

     output:
-    path('OUTRIDER_provided_samples_top_hits.tsv')          , optional: true, emit: ae_out_unfiltered
-    path('OUTRIDER_provided_samples_top_hits_filtered.tsv') , optional: true, emit: ae_out_filtered
-    path('FRASER_provided_samples_top_hits.tsv')            , optional: true, emit: as_out_unfiltered
-    path('FRASER_provided_samples_top_hits_filtered.tsv')   , optional: true, emit: as_out_filtered
-    path "versions.yml"                                     , emit: versions
+    path('*OUTRIDER_provided_samples_top_hits_research.tsv') , optional: true, emit: ae_out_research
+    path('*OUTRIDER_provided_samples_top_hits_clinical.tsv') , optional: true, emit: ae_out_clinical
+    path('*FRASER_provided_samples_top_hits_research.tsv')   , optional: true, emit: as_out_research
+    path('*FRASER_provided_samples_top_hits_clinical.tsv')   , optional: true, emit: as_out_clinical
+    path "versions.yml"                                       , emit: versions

     when:
     task.ext.when == null || task.ext.when

     script:
-    def ids = "${samples.id}".replace("[","").replace("]","").replace(",","")
+    def ids = "${case_info.probands}".replace("[","").replace("]","").replace(",","")
     def case_id = "${case_info.id}".replace("[","").replace("]","").replace(",","")
     def gene_panel_filter = gene_panel_clinical_filter ? "--gene_panel ${gene_panel_clinical_filter}" : ''
     def drop_ae_rds = out_drop_ae_rds_in ? "--drop_ae_rds ${out_drop_ae_rds_in}" : ''
@@ -43,8 +42,6 @@
         $out_drop_gene_name \\
         $out_drop_as_tsv \\
         --case $case_id \\
-        --output_file_subfix_as "provided_samples_top_hits_filtered" \\
-        --output_file_subfix_ae "provided_samples_top_hits_filtered"

     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
@@ -54,10 +51,10 @@

     stub:
     """
-    touch OUTRIDER_provided_samples_top_hits.tsv
-    touch OUTRIDER_provided_samples_top_hits_filtered.tsv
-    touch FRASER_provided_samples_top_hits.tsv
-    touch FRASER_provided_samples_top_hits_filtered.tsv
+    touch OUTRIDER_provided_samples_top_hits_research.tsv
+    touch OUTRIDER_provided_samples_top_hits_clinical.tsv
+    touch FRASER_provided_samples_top_hits_research.tsv
+    touch FRASER_provided_samples_top_hits_clinical.tsv

     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
diff --git a/subworkflows/local/analyse_transcripts.nf b/subworkflows/local/analyse_transcripts.nf
index c6460e18..6035e4ca 100644
--- a/subworkflows/local/analyse_transcripts.nf
+++ b/subworkflows/local/analyse_transcripts.nf
@@ -86,7 +86,6 @@ workflow ANALYSE_TRANSCRIPTS {
         ch_out_drop_gene_name = params.switch_drop_ae ? ch_out_drop_gene_name_ae : ch_out_drop_gene_name_as

         DROP_FILTER_RESULTS(
-            star_samples,
            case_info,
            ch_gene_panel_clinical_filter,
            ch_out_drop_ae_rds.ifEmpty([]),
@@ -116,19 +115,19 @@
    ch_versions = ch_versions.mix(GFFCOMPARE.out.versions.first())

    emit:
-    transcript_gtf       = STRINGTIE_STRINGTIE.out.transcript_gtf    // channel: [ val(meta), [ path(transctript_gtf)] ]
-    abundance            = STRINGTIE_STRINGTIE.out.abundance         // channel: [ val(meta), [ path(abundance) ] ]
-    coverage_gtf         = STRINGTIE_STRINGTIE.out.coverage_gtf      // channel: [ val(meta), [ path(coverage_gtf) ] ]
-    annotated_gtf        = GFFCOMPARE.out.annotated_gtf              // channel: [ val(meta), [ path(annotated_gtf) ] ]
-    stats_gtf            = GFFCOMPARE.out.stats                      // channel: [ val(meta), [ path(stats) ] ]
-    annotation_drop      = DROP_SAMPLE_ANNOT.out.drop_annot          // channel: [ path(sample_annotation.tsv) ]
-    config_drop_ae       = DROP_CONFIG_RUN_AE.out.config_drop        // channel: [ path(confg_file.yml) ]
-    drop_ae_out          = DROP_CONFIG_RUN_AE.out.drop_ae_out        // channel: [ path(drop_output_AE) ]
-    config_drop_as       = DROP_CONFIG_RUN_AS.out.config_drop        // channel: [ path(confg_file.yml) ]
-    drop_as_out          = DROP_CONFIG_RUN_AS.out.drop_as_out        // channel: [ path(drop_output_AS) ]
-    drop_filter_ae_res   = DROP_FILTER_RESULTS.out.ae_out_filtered   // channel: [ path(drop_AE_filtered.tsv) ]
-    drop_unfilter_ae_res = DROP_FILTER_RESULTS.out.ae_out_unfiltered // channel: [ path(drop_AE_unfiltered.tsv) ]
-    drop_filter_as_res   = DROP_FILTER_RESULTS.out.as_out_filtered   // channel: [ path(drop_AS_filtered.tsv) ]
-    drop_unfilter_as_res = DROP_FILTER_RESULTS.out.as_out_unfiltered // channel: [ path(drop_AS_unfiltered.tsv) ]
-    versions             = ch_versions                               // channel: [ path(versions.yml) ]
+    transcript_gtf       = STRINGTIE_STRINGTIE.out.transcript_gtf  // channel: [ val(meta), [ path(transctript_gtf)] ]
+    abundance            = STRINGTIE_STRINGTIE.out.abundance       // channel: [ val(meta), [ path(abundance) ] ]
+    coverage_gtf         = STRINGTIE_STRINGTIE.out.coverage_gtf    // channel: [ val(meta), [ path(coverage_gtf) ] ]
+    annotated_gtf        = GFFCOMPARE.out.annotated_gtf            // channel: [ val(meta), [ path(annotated_gtf) ] ]
+    stats_gtf            = GFFCOMPARE.out.stats                    // channel: [ val(meta), [ path(stats) ] ]
+    annotation_drop      = DROP_SAMPLE_ANNOT.out.drop_annot        // channel: [ path(sample_annotation.tsv) ]
+    config_drop_ae       = DROP_CONFIG_RUN_AE.out.config_drop      // channel: [ path(confg_file.yml) ]
+    drop_ae_out          = DROP_CONFIG_RUN_AE.out.drop_ae_out      // channel: [ path(drop_output_AE) ]
+    config_drop_as       = DROP_CONFIG_RUN_AS.out.config_drop      // channel: [ path(confg_file.yml) ]
+    drop_as_out          = DROP_CONFIG_RUN_AS.out.drop_as_out      // channel: [ path(drop_output_AS) ]
+    drop_ae_out_clinical = DROP_FILTER_RESULTS.out.ae_out_clinical // channel: [ path(drop_AE_clinical.tsv) ]
+    drop_ae_out_research = DROP_FILTER_RESULTS.out.ae_out_research // channel: [ path(drop_AE_research.tsv) ]
+    drop_as_out_clinical = DROP_FILTER_RESULTS.out.as_out_clinical // channel: [ path(drop_AS_clinical.tsv) ]
+    drop_as_out_research = DROP_FILTER_RESULTS.out.as_out_research // channel: [ path(drop_AS_research.tsv) ]
+    versions             = ch_versions                             // channel: [ path(versions.yml) ]
 }
diff --git a/workflows/tomte.nf b/workflows/tomte.nf
index 669b9500..8d37c9d8 100644
--- a/workflows/tomte.nf
+++ b/workflows/tomte.nf
@@ -245,7 +245,7 @@ def create_case_channel(List rows) {
         probands.add(item.sample)
     }

-    case_info.probands = probands
+    case_info.probands = probands.unique()
     case_info.id = rows[0].case

     return case_info

From e7320e3f63f0d06774c1d717a9932c30543665dc Mon Sep 17 00:00:00 2001
From: "lucia.pena.perez@scilifelab.se"
Date: Fri, 1 Mar 2024 09:35:59 +0100
Subject: [PATCH 2/2] feat update CHANGELOG

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 14e8d356..50fa154a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -19,6 +19,7 @@ Initial release of genomic-medicine-sweden/tomte, created with the [nf-core](htt
 - Renamed the other switches (subsample_region_switch, downsample_switch, run_drop_ae_switch and run_drop_as_switch) so that they all start with switch\* (switch_subsample_region, switch_downsample, switch_drop_ae and switch_drop_as)
 - Separated modules.config into smaller configs
+- Patch tools update [#71](https://github.com/genomic-medicine-sweden/tomte/pull/71)

 ### `Dependencies`
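
A minimal sketch of the new output naming introduced by this patch set, assuming a hypothetical case ID "case_1" and the default --output_name_fragment_ae/--output_name_fragment_as values added above; it only mirrors the naming logic (main() appends an underscore to a non-empty --case_id, and filter_by_gene_panel() derives the clinical name from the research name), not the filtering itself:

    # Naming sketch only; "case_1" is a hypothetical case ID, the fragments are the new argparse defaults.
    case_id = "case_1"
    if case_id != "":
        case_id += "_"  # mirrors the new check in main()

    for fragment in (
        "OUTRIDER_provided_samples_top_hits",  # default --output_name_fragment_ae
        "FRASER_provided_samples_top_hits",    # default --output_name_fragment_as
    ):
        file_name_research = f"{case_id}{fragment}_research.tsv"
        # filter_by_gene_panel() derives the clinical name from the research name
        file_name_clinical = file_name_research.replace("research.tsv", "clinical.tsv")
        print(file_name_research)
        print(file_name_clinical)
    # Expected names:
    #   case_1_OUTRIDER_provided_samples_top_hits_research.tsv
    #   case_1_OUTRIDER_provided_samples_top_hits_clinical.tsv
    #   case_1_FRASER_provided_samples_top_hits_research.tsv
    #   case_1_FRASER_provided_samples_top_hits_clinical.tsv

These names are what the renamed *_research.tsv / *_clinical.tsv output globs of DROP_FILTER_RESULTS match; the clinical file is only written when a gene panel is passed via --gene_panel.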