Skip to content

Commit

Permalink
turn bash script into python
Browse files Browse the repository at this point in the history
  • Loading branch information
Lucpen committed Oct 16, 2024
1 parent 6ab7468 commit ea64e43
Show file tree
Hide file tree
Showing 2 changed files with 100 additions and 21 deletions.
91 changes: 91 additions & 0 deletions bin/drop_sample_annot_exported_counts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
#!/usr/bin/env python3

import argparse
import csv
from pandas import read_csv, DataFrame

# Version string printed by the --version command line flag.
SCRIPT_VERSION = "v1.0"


def modify_gene_counts_df(df: DataFrame, col_name: str, run: bool, value_in: str):
    """Overwrite a whole column of ``df`` with a single constant.

    Args:
        df: Data frame to modify; it is changed in place.
        col_name: Column to set (created if it does not exist yet).
        run: When truthy every row receives ``value_in``; otherwise every
            row receives the literal string "NA".
        value_in: Value written to the column when ``run`` is truthy.

    Returns:
        The same data frame object that was passed in.
    """
    fill_value = value_in if run else "NA"
    df[col_name] = fill_value
    return df


def modify_and_write_sample_annotation(
    sample_annot: str, ae_run: bool, as_run: bool, gtf: str
):
    """
    Modifies and writes Sample Annotation produced by DROP to make one
    that can be used as input for Tomte

    Args:
        sample_annot: Path to the tab-separated sample annotation to read.
        ae_run: Whether aberrant expression was run; controls whether
            GENE_COUNTS_FILE points at the exported gene counts or is "NA".
        as_run: Whether aberrant splicing was run; controls whether
            SPLICE_COUNTS_DIR points at the exported counts folder or is "NA".
        gtf: Gene annotation name used to fill missing GENE_ANNOTATION values.

    Side effects:
        Writes "exported_counts/sampleAnnotation.tsv" relative to the current
        working directory; the "exported_counts" folder must already exist.

    Raises:
        KeyError: If the input has no GENE_ANNOTATION column.
    """
    # Read every column as text so untouched cells are written back exactly
    # as they came in (no int -> float or TRUE -> True re-formatting).
    # "NA" cells are still parsed as missing values, which fillna relies on.
    df_samples: DataFrame = read_csv(sample_annot, sep="\t", dtype=str)
    df_samples["RNA_BAM_FILE"] = "NA"
    df_samples["GENE_ANNOTATION"] = df_samples["GENE_ANNOTATION"].fillna(gtf)
    df_samples = modify_gene_counts_df(
        df=df_samples,
        col_name="GENE_COUNTS_FILE",
        run=ae_run,
        value_in="exported_counts/geneCounts.tsv.gz",
    )
    df_samples = modify_gene_counts_df(
        df=df_samples,
        col_name="SPLICE_COUNTS_DIR",
        run=as_run,
        value_in="exported_counts",
    )
    # na_rep="NA" keeps missing values in the untouched columns as the same
    # "NA" placeholder this function writes itself, instead of empty fields.
    df_samples.to_csv(
        "exported_counts/sampleAnnotation.tsv",
        index=False,
        sep="\t",
        na_rep="NA",
    )


def parse_args(argv=None):
    """Define and immediately parse command line arguments.

    Args:
        argv: Optional list of argument strings. When None, argparse falls
            back to the process command line.

    Returns:
        argparse.Namespace with ``sample_annot`` (str), ``ae_run`` (bool),
        ``as_run`` (bool) and ``gtf`` (str).
    """

    def bool_flag(text: str) -> bool:
        """Convert a textual true/false value into a real boolean.

        ``type=bool`` must not be used for this: bool() of any non-empty
        string, including "false", is True.
        """
        lowered = text.strip().lower()
        if lowered in ("true", "t", "yes", "y", "1"):
            return True
        if lowered in ("false", "f", "no", "n", "0"):
            return False
        raise argparse.ArgumentTypeError(
            f"expected a boolean value (true/false), got {text!r}"
        )

    parser = argparse.ArgumentParser(
        formatter_class=argparse.MetavarTypeHelpFormatter,
        description="""Generate DROP sample annotation for exported db.""",
    )
    parser.add_argument(
        "--sample_annot",
        type=str,
        help="original sample annotation in export_counts folder",
        required=True,
    )
    # metavar is given explicitly so the help text keeps showing "bool"
    # rather than the name of the converter function.
    parser.add_argument(
        "--ae_run",
        type=bool_flag,
        metavar="bool",
        help="Was aberrant expression run?",
        required=True,
    )
    parser.add_argument(
        "--as_run",
        type=bool_flag,
        metavar="bool",
        help="Was aberrant splicing run?",
        required=True,
    )
    parser.add_argument(
        "--gtf",
        type=str,
        help="Specify gtf file name used to run",
        required=True,
    )
    parser.add_argument("--version", action="version", version=SCRIPT_VERSION)
    return parser.parse_args(argv)


def main():
    """Parse the command line and write the adjusted sample annotation."""
    cli = parse_args()
    modify_and_write_sample_annotation(
        sample_annot=cli.sample_annot,
        ae_run=cli.ae_run,
        as_run=cli.as_run,
        gtf=cli.gtf,
    )


# Run the program only when this file is executed as a script.
if __name__ == "__main__":
    main()
30 changes: 9 additions & 21 deletions modules/local/drop_put_together_exported_couts.nf
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ process DROP_PUT_TOGETHER_EXPORTED_COUNTS {
def gtf_no_extension = gtf.baseName

"""
#!/bin/bash
mkdir -p exported_counts
if [[ "$ae_run" == "true" ]];then
cp ${exported_counts_ae}/* exported_counts/.
Expand All @@ -37,29 +37,17 @@ process DROP_PUT_TOGETHER_EXPORTED_COUNTS {
cp ${exported_counts_as}/* exported_counts/.
fi
cd exported_counts/
awk -F'\t' 'NR==1 {for(i=1; i<=NF; i++) if(\$i=="RNA_BAM_FILE") rna_col=i} NR>1 {\$rna_col="NA"} 1' OFS='\t' sample_annotation.tsv > sampleAnnotation.tsv
awk -v gene_annot="${gtf_no_extension}" -F'\t' 'NR==1 {for (i=1; i<=NF; i++) if (\$i=="GENE_ANNOTATION") rna_col=i} NR>1 {if (\$rna_col == "NA") \$rna_col = gene_annot} 1' OFS='\t' sampleAnnotation.tsv > tmpfile && mv tmpfile sampleAnnotation.tsv
if [[ "$as_run" == "true" && "$ae_run" == "true" ]]; then
awk -F'\t' 'NR==1 {for(i=1; i<=NF; i++) if(\$i=="SPLICE_COUNTS_DIR") rna_col=i} NR>1 {\$rna_col="exported_counts"} 1' OFS='\t' sampleAnnotation.tsv > tmpfile && mv tmpfile sampleAnnotation.tsv
awk -F'\t' 'NR==1 {for(i=1; i<=NF; i++) if(\$i=="GENE_COUNTS_FILE") rna_col=i} NR>1 {\$rna_col="exported_counts/geneCounts.tsv.gz"} 1' OFS='\t' sampleAnnotation.tsv > tmpfile && mv tmpfile sampleAnnotation.tsv
elif [[ "$as_run" == "true" && "$ae_run" == "false" ]]; then
awk -F'\t' 'NR==1 {for(i=1; i<=NF; i++) if(\$i=="SPLICE_COUNTS_DIR") rna_col=i} NR>1 {\$rna_col="exported_counts"} 1' OFS='\t' sampleAnnotation.tsv > tmpfile && mv tmpfile sampleAnnotation.tsv
awk -F'\t' 'NR==1 {for(i=1; i<=NF; i++) if(\$i=="GENE_COUNTS_FILE") rna_col=i} NR>1 {\$rna_col="NA"} 1' OFS='\t' sampleAnnotation.tsv > tmpfile && mv tmpfile sampleAnnotation.tsv
elif [[ "$as_run" == "false" && "$ae_run" == "true" ]]; then
awk -F'\t' 'NR==1 {for(i=1; i<=NF; i++) if(\$i=="SPLICE_COUNTS_DIR") rna_col=i} NR>1 {\$rna_col="NA"} 1' OFS='\t' sampleAnnotation.tsv > tmpfile && mv tmpfile sampleAnnotation.tsv
awk -F'\t' 'NR==1 {for(i=1; i<=NF; i++) if(\$i=="GENE_COUNTS_FILE") rna_col=i} NR>1 {\$rna_col="exported_counts/geneCounts.tsv.gz"} 1' OFS='\t' sampleAnnotation.tsv > tmpfile && mv tmpfile sampleAnnotation.tsv
fi
rm sample_annotation.tsv
mv exported_counts/sample_annotation.tsv .
cd ..
$baseDir/bin/drop_sample_annot_exported_counts.py \\
--sample_annot "sample_annotation.tsv" \\
--ae_run $ae_run \\
--as_run $as_run \\
--gtf $gtf_no_extension
cat <<-END_VERSIONS > versions.yml
"${task.process}":
gawk: \$(awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//')
drop_sample_annotation_exported_counts: \$(\$baseDir/bin/drop_sample_annotation_exported_counts.py --version )
END_VERSIONS
"""
Expand All @@ -70,7 +58,7 @@ process DROP_PUT_TOGETHER_EXPORTED_COUNTS {
cat <<-END_VERSIONS > versions.yml
"${task.process}":
gawk: \$(awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//')
drop_sample_annotation_exported_counts: \$(\$baseDir/bin/drop_sample_annotation_exported_counts.py --version )
END_VERSIONS
"""
}

0 comments on commit ea64e43

Please sign in to comment.