Skip to content

Commit

Permalink
feat fix_caseid and naming of drop output
Browse files Browse the repository at this point in the history
  • Loading branch information
lucia.pena.perez@scilifelab.se committed Mar 1, 2024
1 parent 922d548 commit 35d34a3
Show file tree
Hide file tree
Showing 4 changed files with 86 additions and 62 deletions.
92 changes: 60 additions & 32 deletions bin/drop_filter_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,14 @@
import pyreadr
from pandas import DataFrame, concat, read_csv

SCRIPT_VERSION = "v1.1"
SCRIPT_VERSION = "v1.2"
GENE_PANEL_HEADER = ["chromosome", "gene_start", "gene_stop", "hgnc_id", "hgnc_symbol"]
GENE_PANEL_COLUMNS_TO_KEEP = ["hgnc_symbol", "hgnc_id"]


def get_top_hits(sample_id: str, df_results_family_aberrant_expression: DataFrame) -> DataFrame:
def get_top_hits(
sample_id: str, df_results_family_aberrant_expression: DataFrame
) -> DataFrame:
"""
Filter results to get only those with the id provided.
If there are >= 20 hits it will output all of them.
Expand All @@ -25,7 +27,9 @@ def get_top_hits(sample_id: str, df_results_family_aberrant_expression: DataFram
return df_id[:20]


def annotate_with_drop_gene_name(df_family_results: DataFrame, out_drop_gene_name: str) -> DataFrame:
def annotate_with_drop_gene_name(
df_family_results: DataFrame, out_drop_gene_name: str
) -> DataFrame:
"""Annotate results from DROP with hgnc symbols."""
df_genes: DataFrame = read_csv(out_drop_gene_name)
common_columns = list(set(df_genes.columns) & set(df_family_results.columns))
Expand All @@ -36,27 +40,34 @@ def annotate_with_drop_gene_name(df_family_results: DataFrame, out_drop_gene_nam
df_family_results, left_on="hgncSymbol", right_on="hgncSymbol"
)
else:
df_merged = df_genes.merge(df_family_results, left_on="geneID", right_on="geneID")
df_merged = df_genes.merge(
df_family_results, left_on="geneID", right_on="geneID"
)
return df_merged


def filter_by_gene_panel(
df_family_top_hits: DataFrame, gene_panel: str, module_name: str, case_id: str, output_file_subfix: str
df_family_top_hits: DataFrame, gene_panel: str, file_name_research: str
) -> DataFrame:
"""Filter out from results any gene that is not present in the provided gene panel."""
if case_id != "":
case_id += "_"
if gene_panel != "None":
df_panel: DataFrame = read_csv(
gene_panel, sep="\t", names=GENE_PANEL_HEADER, header=None, comment="#", index_col=False
gene_panel,
sep="\t",
names=GENE_PANEL_HEADER,
header=None,
comment="#",
index_col=False,
)
df_family_top_hits = df_family_top_hits.loc[df_family_top_hits["hgncSymbol"].isin(df_panel["hgnc_symbol"])]
df_family_top_hits = df_family_top_hits.loc[
df_family_top_hits["hgncSymbol"].isin(df_panel["hgnc_symbol"])
]
df_clinical: DataFrame = df_panel[GENE_PANEL_COLUMNS_TO_KEEP].merge(
df_family_top_hits, left_on="hgnc_symbol", right_on="hgncSymbol"
)
df_clinical = df_clinical.drop(columns=["hgnc_symbol"])
file_name = f"{case_id}{module_name}_{output_file_subfix}.tsv"
df_clinical.to_csv(file_name, sep="\t", index=False, header=True)
file_name_clinical = file_name_research.replace("research.tsv", "clinical.tsv")
df_clinical.to_csv(file_name_clinical, sep="\t", index=False, header=True)


def filter_outrider_results(
Expand All @@ -65,7 +76,7 @@ def filter_outrider_results(
out_drop_aberrant_expression_rds: str,
out_drop_gene_name: str,
case_id: str,
output_file_subfix_ae: str,
output_name_fragment_ae: str,
):
"""
Filter results to get only those from the sample(s) provided.
Expand All @@ -79,26 +90,36 @@ def filter_outrider_results(
rds_aberrant_expression = pyreadr.read_r(out_drop_aberrant_expression_rds)
df_results_aberrante_expression: DataFrame = rds_aberrant_expression[None]
# Keep only samples provided to tomte
df_results_family_aberrant_expression: DataFrame = df_results_aberrante_expression.loc[
df_results_aberrante_expression["sampleID"].isin(samples)
]
df_results_family_aberrant_expression: DataFrame = (
df_results_aberrante_expression.loc[
df_results_aberrante_expression["sampleID"].isin(samples)
]
)
df_family_aberrant_expression_top_hits = DataFrame()
for id in samples:
df_sample_aberrant_expression_top_hits = get_top_hits(id, df_results_family_aberrant_expression)
df_sample_aberrant_expression_top_hits = get_top_hits(
id, df_results_family_aberrant_expression
)
df_family_aberrant_expression_top_hits = concat(
[df_family_aberrant_expression_top_hits, df_sample_aberrant_expression_top_hits],
[
df_family_aberrant_expression_top_hits,
df_sample_aberrant_expression_top_hits,
],
ignore_index=True,
sort=False,
)
df_family_aberrant_expression_top_hits = df_family_aberrant_expression_top_hits.drop(columns=["index"])
df_family_aberrant_expression_top_hits = (
df_family_aberrant_expression_top_hits.drop(columns=["index"])
)
df_family_annotated_aberrant_expression_top_hits = annotate_with_drop_gene_name(
df_family_aberrant_expression_top_hits, out_drop_gene_name
)
file_name_research = f"{case_id}{output_name_fragment_ae}_research.tsv"
df_family_annotated_aberrant_expression_top_hits.to_csv(
"OUTRIDER_provided_samples_top_hits.tsv", sep="\t", index=False, header=True
file_name_research, sep="\t", index=False, header=True
)
filter_by_gene_panel(
df_family_annotated_aberrant_expression_top_hits, gene_panel, "OUTRIDER", case_id, output_file_subfix_ae
df_family_annotated_aberrant_expression_top_hits, gene_panel, file_name_research
)


Expand All @@ -108,26 +129,31 @@ def filter_fraser_result(
out_drop_aberrant_splicing_tsv: str,
out_drop_gene_name: str,
case_id: str,
output_file_subfix_as: str,
output_name_fragment_as: str,
):
"""
Filter results to get only those from the sample(s) provided.
Two tsvs will be outputed:
- One filtered to keep gense in the gene panel (if provided).
- Another that is unfilterd.
"""
df_results_aberrant_splicing: DataFrame = read_csv(out_drop_aberrant_splicing_tsv, sep="\t")
df_results_aberrant_splicing: DataFrame = read_csv(
out_drop_aberrant_splicing_tsv, sep="\t"
)
# Keep only samples provided to tomte
df_results_family_aberrant_splicing: DataFrame = df_results_aberrant_splicing.loc[
df_results_aberrant_splicing["sampleID"].isin(samples)
]
df_results_family_aberrant_splicing = annotate_with_drop_gene_name(
df_results_family_aberrant_splicing, out_drop_gene_name
)
file_name_research = f"{case_id}{output_name_fragment_as}_research.tsv"
df_results_family_aberrant_splicing.to_csv(
"FRASER_provided_samples_top_hits.tsv", sep="\t", index=False, header=True
file_name_research, sep="\t", index=False, header=True
)
filter_by_gene_panel(
df_results_family_aberrant_splicing, gene_panel, file_name_research
)
filter_by_gene_panel(df_results_family_aberrant_splicing, gene_panel, "FRASER", case_id, output_file_subfix_as)


def parse_args(argv=None):
Expand Down Expand Up @@ -172,10 +198,10 @@ def parse_args(argv=None):
required=False,
)
parser.add_argument(
"--output_file_subfix_ae",
"--output_name_fragment_ae",
type=str,
default="provided_samples_top_hits_filtered",
help="Subfix of Aberrant Expression output file",
default="OUTRIDER_provided_samples_top_hits",
help="Central fragment of Aberrant Expression output file, case will be added at the beginning and clinical/research at end",
required=False,
)
parser.add_argument(
Expand All @@ -186,10 +212,10 @@ def parse_args(argv=None):
required=False,
)
parser.add_argument(
"--output_file_subfix_as",
"--output_name_fragment_as",
type=str,
default="provided_samples_top_hits_filtered",
help="Subfix of Aberrant Splicing output file",
default="FRASER_provided_samples_top_hits",
help="Central fragment of Aberrant Splicing output file, case will be added at the beginning and clinical/research at end",
required=False,
)
parser.add_argument(
Expand All @@ -203,14 +229,16 @@ def parse_args(argv=None):
def main():
"""Coordinate argument parsing and program execution."""
args = parse_args()
if args.case_id != "":
args.case_id += "_"
if args.drop_ae_rds != "None":
filter_outrider_results(
samples=args.samples,
gene_panel=args.gene_panel,
out_drop_aberrant_expression_rds=args.drop_ae_rds,
out_drop_gene_name=args.out_drop_gene_name,
case_id=args.case_id,
output_file_subfix_ae=args.output_file_subfix_ae,
output_name_fragment_ae=args.output_name_fragment_ae,
)
if args.out_drop_as_tsv != "None":
filter_fraser_result(
Expand All @@ -219,7 +247,7 @@ def main():
out_drop_aberrant_splicing_tsv=args.out_drop_as_tsv,
out_drop_gene_name=args.out_drop_gene_name,
case_id=args.case_id,
output_file_subfix_as=args.output_file_subfix_as,
output_name_fragment_as=args.output_name_fragment_as,
)


Expand Down
23 changes: 10 additions & 13 deletions modules/local/drop_filter_results.nf
Original file line number Diff line number Diff line change
Expand Up @@ -10,25 +10,24 @@ process DROP_FILTER_RESULTS {
container "docker.io/clinicalgenomics/drop:1.3.3"

input:
val(samples)
val(case_info)
path gene_panel_clinical_filter
path out_drop_ae_rds_in
path out_drop_gene_name_in
path out_drop_as_tsv_in

output:
path('OUTRIDER_provided_samples_top_hits.tsv') , optional: true, emit: ae_out_unfiltered
path('OUTRIDER_provided_samples_top_hits_filtered.tsv'), optional: true, emit: ae_out_filtered
path('FRASER_provided_samples_top_hits.tsv') , optional: true, emit: as_out_unfiltered
path('FRASER_provided_samples_top_hits_filtered.tsv') , optional: true, emit: as_out_filtered
path "versions.yml" , emit: versions
path('*OUTRIDER_provided_samples_top_hits_research.tsv') , optional: true, emit: ae_out_research
path('*OUTRIDER_provided_samples_top_hits_clinical.tsv') , optional: true, emit: ae_out_clinical
path('*FRASER_provided_samples_top_hits_research.tsv') , optional: true, emit: as_out_research
path('*FRASER_provided_samples_top_hits_clinical.tsv') , optional: true, emit: as_out_clinical
path "versions.yml" , emit: versions

when:
task.ext.when == null || task.ext.when

script:
def ids = "${samples.id}".replace("[","").replace("]","").replace(",","")
def ids = "${case_info.probands}".replace("[","").replace("]","").replace(",","")
def case_id = "${case_info.id}".replace("[","").replace("]","").replace(",","")
def gene_panel_filter = gene_panel_clinical_filter ? "--gene_panel ${gene_panel_clinical_filter}" : ''
def drop_ae_rds = out_drop_ae_rds_in ? "--drop_ae_rds ${out_drop_ae_rds_in}" : ''
Expand All @@ -43,8 +42,6 @@ process DROP_FILTER_RESULTS {
$out_drop_gene_name \\
$out_drop_as_tsv \\
--case $case_id \\
--output_file_subfix_as "provided_samples_top_hits_filtered" \\
--output_file_subfix_ae "provided_samples_top_hits_filtered"
cat <<-END_VERSIONS > versions.yml
"${task.process}":
Expand All @@ -54,10 +51,10 @@ process DROP_FILTER_RESULTS {

stub:
"""
touch OUTRIDER_provided_samples_top_hits.tsv
touch OUTRIDER_provided_samples_top_hits_filtered.tsv
touch FRASER_provided_samples_top_hits.tsv
touch FRASER_provided_samples_top_hits_filtered.tsv
touch OUTRIDER_provided_samples_top_hits_research.tsv
touch OUTRIDER_provided_samples_top_hits_clinical.tsv
touch FRASER_provided_samples_top_hits_research.tsv
touch FRASER_provided_samples_top_hits_clinical.tsv
cat <<-END_VERSIONS > versions.yml
"${task.process}":
Expand Down
31 changes: 15 additions & 16 deletions subworkflows/local/analyse_transcripts.nf
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,6 @@ workflow ANALYSE_TRANSCRIPTS {
ch_out_drop_gene_name = params.switch_drop_ae ? ch_out_drop_gene_name_ae : ch_out_drop_gene_name_as

DROP_FILTER_RESULTS(
star_samples,
case_info,
ch_gene_panel_clinical_filter,
ch_out_drop_ae_rds.ifEmpty([]),
Expand Down Expand Up @@ -116,19 +115,19 @@ workflow ANALYSE_TRANSCRIPTS {
ch_versions = ch_versions.mix(GFFCOMPARE.out.versions.first())

emit:
transcript_gtf = STRINGTIE_STRINGTIE.out.transcript_gtf // channel: [ val(meta), [ path(transctript_gtf)] ]
abundance = STRINGTIE_STRINGTIE.out.abundance // channel: [ val(meta), [ path(abundance) ] ]
coverage_gtf = STRINGTIE_STRINGTIE.out.coverage_gtf // channel: [ val(meta), [ path(coverage_gtf) ] ]
annotated_gtf = GFFCOMPARE.out.annotated_gtf // channel: [ val(meta), [ path(annotated_gtf) ] ]
stats_gtf = GFFCOMPARE.out.stats // channel: [ val(meta), [ path(stats) ] ]
annotation_drop = DROP_SAMPLE_ANNOT.out.drop_annot // channel: [ path(sample_annotation.tsv) ]
config_drop_ae = DROP_CONFIG_RUN_AE.out.config_drop // channel: [ path(confg_file.yml) ]
drop_ae_out = DROP_CONFIG_RUN_AE.out.drop_ae_out // channel: [ path(drop_output_AE) ]
config_drop_as = DROP_CONFIG_RUN_AS.out.config_drop // channel: [ path(confg_file.yml) ]
drop_as_out = DROP_CONFIG_RUN_AS.out.drop_as_out // channel: [ path(drop_output_AS) ]
drop_filter_ae_res = DROP_FILTER_RESULTS.out.ae_out_filtered // channel: [ path(drop_AE_filtered.tsv) ]
drop_unfilter_ae_res = DROP_FILTER_RESULTS.out.ae_out_unfiltered // channel: [ path(drop_AE_unfiltered.tsv) ]
drop_filter_as_res = DROP_FILTER_RESULTS.out.as_out_filtered // channel: [ path(drop_AS_filtered.tsv) ]
drop_unfilter_as_res = DROP_FILTER_RESULTS.out.as_out_unfiltered // channel: [ path(drop_AS_unfiltered.tsv) ]
versions = ch_versions // channel: [ path(versions.yml) ]
transcript_gtf = STRINGTIE_STRINGTIE.out.transcript_gtf // channel: [ val(meta), [ path(transctript_gtf)] ]
abundance = STRINGTIE_STRINGTIE.out.abundance // channel: [ val(meta), [ path(abundance) ] ]
coverage_gtf = STRINGTIE_STRINGTIE.out.coverage_gtf // channel: [ val(meta), [ path(coverage_gtf) ] ]
annotated_gtf = GFFCOMPARE.out.annotated_gtf // channel: [ val(meta), [ path(annotated_gtf) ] ]
stats_gtf = GFFCOMPARE.out.stats // channel: [ val(meta), [ path(stats) ] ]
annotation_drop = DROP_SAMPLE_ANNOT.out.drop_annot // channel: [ path(sample_annotation.tsv) ]
config_drop_ae = DROP_CONFIG_RUN_AE.out.config_drop // channel: [ path(confg_file.yml) ]
drop_ae_out = DROP_CONFIG_RUN_AE.out.drop_ae_out // channel: [ path(drop_output_AE) ]
config_drop_as = DROP_CONFIG_RUN_AS.out.config_drop // channel: [ path(confg_file.yml) ]
drop_as_out = DROP_CONFIG_RUN_AS.out.drop_as_out // channel: [ path(drop_output_AS) ]
drop_ae_out_clinical = DROP_FILTER_RESULTS.out.ae_out_clinical // channel: [ path(drop_AE_clinical.tsv) ]
drop_ae_out_research = DROP_FILTER_RESULTS.out.ae_out_research // channel: [ path(drop_AE_research.tsv) ]
drop_as_out_clinical = DROP_FILTER_RESULTS.out.as_out_clinical // channel: [ path(drop_AS_clinical.tsv) ]
drop_as_out_research = DROP_FILTER_RESULTS.out.as_out_research // channel: [ path(drop_AS_research.tsv) ]
versions = ch_versions // channel: [ path(versions.yml) ]
}
2 changes: 1 addition & 1 deletion workflows/tomte.nf
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,7 @@ def create_case_channel(List rows) {
probands.add(item.sample)
}

case_info.probands = probands
case_info.probands = probands.unique()
case_info.id = rows[0].case

return case_info
Expand Down

0 comments on commit 35d34a3

Please sign in to comment.