turn bash script into python

genomic-medicine-sweden · Oct 16, 2024 · ea64e43 · ea64e43
1 parent 6ab7468
commit ea64e43
Show file tree

Hide file tree

Showing 2 changed files with 100 additions and 21 deletions.
diff --git a/bin/drop_sample_annot_exported_counts.py b/bin/drop_sample_annot_exported_counts.py
@@ -0,0 +1,91 @@
+#!/usr/bin/env python3
+
+import argparse
+import csv
+from pandas import read_csv, DataFrame
+
+# Version string printed by the --version command line flag.
+SCRIPT_VERSION = "v1.0"
+
+
+def modify_gene_counts_df(df: DataFrame, col_name: str, run: bool, value_in: str):
+    """Set every row of column col_name in df to a single value.
+
+    The value is value_in when run is true and the string "NA" otherwise.
+    The column is assigned on df itself, and that same df is returned.
+    """
+    df[col_name] = value_in if run else "NA"
+    return df
+
+
+def modify_and_write_sample_annotation(
+    sample_annot: str, ae_run: bool, as_run: bool, gtf: str
+):
+    """
+    Rewrite the Sample Annotation produced by DROP so that it can be
+    used as input for Tomte.
+
+    Reads the tab-separated file sample_annot, sets RNA_BAM_FILE to "NA",
+    fills missing GENE_ANNOTATION values with gtf, sets the gene count and
+    splice count columns depending on ae_run / as_run, and writes the
+    result to exported_counts/sampleAnnotation.tsv.
+    """
+    annotation: DataFrame = read_csv(sample_annot, sep="\t")
+    annotation["RNA_BAM_FILE"] = "NA"
+    annotation["GENE_ANNOTATION"] = annotation["GENE_ANNOTATION"].fillna(gtf)
+
+    # (column, was the analysis run?, value to store when it was run)
+    count_columns = (
+        ("GENE_COUNTS_FILE", ae_run, "exported_counts/geneCounts.tsv.gz"),
+        ("SPLICE_COUNTS_DIR", as_run, "exported_counts"),
+    )
+    for column, was_run, exported_path in count_columns:
+        annotation = modify_gene_counts_df(
+            df=annotation, col_name=column, run=was_run, value_in=exported_path
+        )
+
+    annotation.to_csv("exported_counts/sampleAnnotation.tsv", sep="\t", index=False)
+
+
+def _str_to_bool(value: str) -> bool:
+    """Convert a command line token such as "true" or "false" into a bool.
+
+    argparse's ``type=bool`` calls ``bool()`` on the raw string, so every
+    non-empty value (including "false") would be parsed as True. This
+    converter compares the text instead.
+
+    Raises:
+        argparse.ArgumentTypeError: if value is not a recognised boolean word.
+    """
+    normalised = value.strip().lower()
+    if normalised in ("true", "t", "yes", "y", "1"):
+        return True
+    if normalised in ("false", "f", "no", "n", "0"):
+        return False
+    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")
+
+
+def parse_args(argv=None):
+    """Define and immediately parse command line arguments.
+
+    Args:
+        argv: list of argument strings; when None, argparse reads sys.argv.
+
+    Returns:
+        The parsed namespace with sample_annot, ae_run, as_run and gtf.
+        ae_run and as_run are real booleans parsed from text such as
+        "true" / "false".
+    """
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.MetavarTypeHelpFormatter,
+        description="""Generate DROP sample annotation for exported db.""",
+    )
+    parser.add_argument(
+        "--sample_annot",
+        type=str,
+        help="original sample annotation in export_counts folder",
+        required=True,
+    )
+    # metavar is given explicitly because MetavarTypeHelpFormatter would
+    # otherwise show the converter's function name in the help text.
+    parser.add_argument(
+        "--ae_run",
+        type=_str_to_bool,
+        metavar="bool",
+        help="Was aberrant expression run?",
+        required=True,
+    )
+    parser.add_argument(
+        "--as_run",
+        type=_str_to_bool,
+        metavar="bool",
+        help="Was aberrant splicing run?",
+        required=True,
+    )
+    parser.add_argument(
+        "--gtf",
+        type=str,
+        help="Specify gtf file name used to run",
+        required=True,
+    )
+    parser.add_argument("--version", action="version", version=SCRIPT_VERSION)
+    return parser.parse_args(argv)
+
+
+def main():
+    """Parse the command line and write the modified sample annotation."""
+    cli = parse_args()
+    # Positional order matches the signature: sample_annot, ae_run, as_run, gtf.
+    modify_and_write_sample_annotation(
+        cli.sample_annot, cli.ae_run, cli.as_run, cli.gtf
+    )
+
+
+# Run main() only when the file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
diff --git a/modules/local/drop_put_together_exported_couts.nf b/modules/local/drop_put_together_exported_couts.nf
@@ -27,7 +27,7 @@ process DROP_PUT_TOGETHER_EXPORTED_COUNTS {
     def gtf_no_extension = gtf.baseName
 
     """
-    #!/bin/bash
+
     mkdir -p exported_counts
     if [[ "$ae_run" == "true" ]];then
         cp ${exported_counts_ae}/* exported_counts/.
@@ -37,29 +37,17 @@ process DROP_PUT_TOGETHER_EXPORTED_COUNTS {
         cp ${exported_counts_as}/* exported_counts/.
     fi
 
-    cd exported_counts/
-
-    awk -F'\t' 'NR==1 {for(i=1; i<=NF; i++) if(\$i=="RNA_BAM_FILE") rna_col=i} NR>1 {\$rna_col="NA"} 1' OFS='\t' sample_annotation.tsv > sampleAnnotation.tsv
-    awk -v gene_annot="${gtf_no_extension}" -F'\t' 'NR==1 {for (i=1; i<=NF; i++) if (\$i=="GENE_ANNOTATION") rna_col=i} NR>1 {if (\$rna_col == "NA") \$rna_col = gene_annot} 1' OFS='\t' sampleAnnotation.tsv > tmpfile && mv tmpfile sampleAnnotation.tsv
-
-    if [[ "$as_run" == "true" && "$ae_run" == "true" ]]; then
-        awk -F'\t' 'NR==1 {for(i=1; i<=NF; i++) if(\$i=="SPLICE_COUNTS_DIR") rna_col=i} NR>1 {\$rna_col="exported_counts"} 1' OFS='\t' sampleAnnotation.tsv > tmpfile && mv tmpfile sampleAnnotation.tsv
-        awk -F'\t' 'NR==1 {for(i=1; i<=NF; i++) if(\$i=="GENE_COUNTS_FILE") rna_col=i} NR>1 {\$rna_col="exported_counts/geneCounts.tsv.gz"} 1' OFS='\t' sampleAnnotation.tsv > tmpfile && mv tmpfile sampleAnnotation.tsv
-    elif [[ "$as_run" == "true" && "$ae_run" == "false" ]]; then
-        awk -F'\t' 'NR==1 {for(i=1; i<=NF; i++) if(\$i=="SPLICE_COUNTS_DIR") rna_col=i} NR>1 {\$rna_col="exported_counts"} 1' OFS='\t' sampleAnnotation.tsv > tmpfile && mv tmpfile sampleAnnotation.tsv
-        awk -F'\t' 'NR==1 {for(i=1; i<=NF; i++) if(\$i=="GENE_COUNTS_FILE") rna_col=i} NR>1 {\$rna_col="NA"} 1' OFS='\t' sampleAnnotation.tsv > tmpfile && mv tmpfile sampleAnnotation.tsv
-    elif [[ "$as_run" == "false" && "$ae_run" == "true" ]]; then
-        awk -F'\t' 'NR==1 {for(i=1; i<=NF; i++) if(\$i=="SPLICE_COUNTS_DIR") rna_col=i} NR>1 {\$rna_col="NA"} 1' OFS='\t' sampleAnnotation.tsv > tmpfile && mv tmpfile sampleAnnotation.tsv
-        awk -F'\t' 'NR==1 {for(i=1; i<=NF; i++) if(\$i=="GENE_COUNTS_FILE") rna_col=i} NR>1 {\$rna_col="exported_counts/geneCounts.tsv.gz"} 1' OFS='\t' sampleAnnotation.tsv > tmpfile && mv tmpfile sampleAnnotation.tsv
-    fi
-
-    rm sample_annotation.tsv
+    mv exported_counts/sample_annotation.tsv .
 
-    cd ..
+    $baseDir/bin/drop_sample_annot_exported_counts.py \\
+        --sample_annot "sample_annotation.tsv" \\
+        --ae_run $ae_run \\
+        --as_run $as_run \\
+        --gtf $gtf_no_extension
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
-        gawk: \$(awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//')
+        drop_sample_annotation_exported_counts: \$($baseDir/bin/drop_sample_annot_exported_counts.py --version )
     END_VERSIONS
 
     """
@@ -70,7 +58,7 @@ process DROP_PUT_TOGETHER_EXPORTED_COUNTS {
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
-        gawk: \$(awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//')
+        drop_sample_annotation_exported_counts: \$($baseDir/bin/drop_sample_annot_exported_counts.py --version )
     END_VERSIONS
     """
 }