From fe3cb62ebf64dd0a03cd5242fd9f3279e8ed6e44 Mon Sep 17 00:00:00 2001
From: caozhichongchong <caozhichongchong@gmail.com>
Date: Sun, 30 May 2021 20:52:21 -0400
Subject: [PATCH] update average genome size computing in metagenomes

---
 .DS_Store                       | Bin 8196 -> 10244 bytes
 README.md                       |  32 +++++++++++++++++++++-----------
 arg_ranker/__main__.py          |  21 +++++++++++++++++++--
 arg_ranker/bin/ARG_table.sum.py |  15 ++++++++++++++-
 4 files changed, 54 insertions(+), 14 deletions(-)
diff --git a/.DS_Store b/.DS_Store
index 8372e22e914816dd4bd31d92d44f3196fe3c5717..77d9bdee938bdc32d29f59b515040dc338bfd134 100644
GIT binary patch
delta 601
zcmZ8fF>ljA6n@Vk;NYabG;W$!kqELt3{a&-KoA2_096&Bn6|1?(Nfv*C0KQQuI)6m
zOBl+?5P4x?Lqd#EB$oaJegO*$0}DuOFmjGPpq}*fKE3zd_dUJ4llqBP0sw6E&1HZJ
zha!(ClkDiAI9j@l(pWo{Q3|u9bai!ItM>iC<NX{w1RVmPoc$CSm|#JN2L+Q;KZ<v6
z-nm|>)oz8SWH7Yahpu5WgFC%3T-)-4YV{imsdMS`7sSj(DVv*-NnW0n3k7A4%$FAS
zNWswC`(2Oq9rFRRnb)xUbdSjXOEy?*Ge`62p+yI+fW8VgtYhAqJ#%55nlF;lkc(^{
zxtye&A+36pI?ubFWi~q<p9Zb}@b=EsETFwLr^C2GWY}~Ai#h%l^?aUpZQY=D(`m3E
zV7+bK^K_?68%HgRj~I<LmD?D)dfRfk>iyq*ar|V|`n4DlP-DOsPcXCT=sl{|wYa9%
zA16%_uE0IM<}KKVV;I4E_ynKf3w(v|@Du(bqJR?Su!!@xfQxtu6MX?A1%=0AB9$au
zjvjQfl7EuMnUD}ggd4B|OHiAz;yWC_{*}(BVs1s^`BC&E#s_~=p<I5Z2=PCensA3|
SL=4^X+wj9&d2+k(L-+@LIFVrh

delta 123
zcmZn(XmOBWU|?W$DortDU;r^WfEYvza8E20o2aMA$gweCH$NlCWJY0$$w>nJ97bm5
zItpe+#*=>vly8<4_F>&TQKFMkkQt~42qd_Hgeyq<#=`H+llfHwIT#^^GE9!=nY#Ij
LSR*4)LmMLixj-4P

diff --git a/README.md b/README.md
index 7c85128..fba0ec2 100644
--- a/README.md
+++ b/README.md
@@ -6,15 +6,22 @@ arg_ranker evaluates the risk of ARGs in genomes and metagenomes
 
 ## Requirement
 * python 3
-* kraken2: `conda install -c bioconda kraken2`\
-download kraken2 database: `kraken2-build --standard --db $KRAKENDB` \
-where $krakenDB is your preferred database name/location\
-* diamond: `conda install -c bioconda diamond`\
+* diamond: `conda install -c bioconda diamond`
 * blast+: `conda install -c bioconda blast`
+* For metagenomes:
+    * kraken2: `conda install -c bioconda kraken2`
+        * to compute the abundance of ARGs as copy number of ARGs per bacterial cell (recommended)
+            * download the kraken2 standard database (50 GB of disk space): `kraken2-build --standard --db $KRAKENDB` \
+            where $KRAKENDB is your preferred database name/location
+            * MicrobeCensu: `git clone https://github.com/snayfach/MicrobeCensus && cd MicrobeCensus && python setup.py install` to estimate the average genome size for metagenomes.
+        * to compute the abundance of ARGs as copy number of ARGs per 16S
+            * download the kraken2 16S database (73.2 MB of disk space): `kraken2-build --db $DBNAME --special greengenes`
 
 ## How to use it
 * put all your genomes (.fa or .fasta) and metagenomes (.fq or .fastq) into one folder ($INPUT)
-* run `arg_ranker -i $INPUT --kkdb $KRAKENDB`
+* run `arg_ranker -i $INPUT` (genomes only)
+* run `arg_ranker -i $INPUT -kkdb $KRAKENDB` (genomes/metagenomes + kraken2 standard database)
+    * or run `arg_ranker -i $INPUT -kkdb $KRAKENDB -kkdbtype 16S` (kraken2 16S database)
 * run `sh arg_ranking/script_output/arg_ranker.sh`
 
 ## Output
@@ -25,11 +32,14 @@ where $krakenDB is your preferred database name/location\
     |WEE300_all-trimmed-decont_1.fastq|4.6E-02|0.0E+00|6.8E-02|7.5E-01|1.3E-01|5.4E-04|1.0-0.0-0.5-1.7-0.3|1.0|0.0|0.5|1.7|0.3|hospital_metagenome|
     |EsCo_genome.fasta|0.0E+00|0.0E+00|0.0E+00|1.0E+00|0.0E+00|2.0E+00|0.0-0.0-0.0-2.2-0.0|0.0|0.0|0.0|2.2|0.0|E.coli_genome|
 
-1. We compute the abundance of ARGs as the copy number of ARGs divided by the 16S copy number in a sample\
-For metagenomes, the copy number of ARGs and 16S were computed as the number of reads mapped to them divided by their gene length.\
-Rank_I_per - Unassessed_per: percentage of ARGs of a risk rank\
+1. Rank_I_per - Unassessed_per: percentage of ARGs of a risk rank\
 Total_abu: total abundance of all ARGs
-2. We compute the risk of ARGs as the average abundance of ARGs of a risk rank divided the average abundance of all ARGs\
+2. For genomes, we output the copy number of ARGs detected in each genome.
+3. For metagenomes, we compute the abundance of ARGs as the copy number of ARGs divided by the bacterial cell number or 16S copy number in the same metagenome.\
+If you downloaded the kraken2 standard database, we compute the copy number of ARGs divided by the bacterial cell number.\
+If you downloaded the kraken2 16S database, we compute the copy number of ARGs divided by the 16S copy number.\
+The copy number of ARGs, 16S, and bacterial cells were computed as the number of reads mapped to them divided by their gene/genome length.
+4. We compute the risk of ARGs as the average abundance of ARGs of a risk rank divided by the average abundance of all ARGs\
 Rank_I_risk - Unassessed_risk: the risk of ARGs of a risk rank\
 Rank_code: a code of ARG risk from Rank I to Unassessed
 
@@ -37,9 +47,9 @@ Rank_code: a code of ARG risk from Rank I to Unassessed
 The abundance, the gene family, and the antibiotic of resistance of ARGs detected in the input samples
 
 ## Test
-run `arg_ranker -i example --kkdb $KRAKENDB`\
+run `arg_ranker -i example -kkdb $KRAKENDB`\
 run `sh arg_ranking/script_output/arg_ranker.sh`\
-The arg_ranking/Sample_ranking_results.txt should look like Table 1
+The arg_ranking/Sample_ranking_results.txt should look like Table 1 (using kraken2 standard database)
 
 ## Metadata for your samples (optional)
 arg_ranker can merge your sample metadata into the results of ARG ranking (i.e. note1 in Table 1).\
diff --git a/arg_ranker/__main__.py b/arg_ranker/__main__.py
index 1c09ba6..47fe266 100644
--- a/arg_ranker/__main__.py
+++ b/arg_ranker/__main__.py
@@ -54,6 +54,10 @@ def main():
                           help="Optional: complete path to blast if not in PATH",
                           metavar="/usr/local/bin/blast",
                           action='store', default='blast', type=str)
+    optional.add_argument('-mc',
+                          help="Optional: complete path to MicrobeCensus if not in PATH",
+                          metavar="/usr/local/bin/run_microbe_census.py",
+                          action='store', default='run_microbe_census.py', type=str)
     optional.add_argument('-kk',
                           help="Optional: complete path to kraken2 if not in PATH",
                           metavar="/usr/local/bin/kraken2",
@@ -62,6 +66,11 @@ def main():
                           help="Optional: complete path to kraken2 database if not in PATH",
                           metavar="/usr/local/bin/kraken2/krakendb",
                           action='store', default='krakendb', type=str)
+    optional.add_argument('-kkdbtype',
+                          help="Optional: type of kraken2 database (default = standard)",
+                          choice = ['standard','16S'],
+                          metavar="standard or 16S",
+                          action='store', default='standard', type=str)
 
     ################################################## Definition ########################################################
     args = parser.parse_args()
@@ -191,11 +200,19 @@ def searchARG(allsamples):
                         workingdir, inputfasta, samplename, search_output)
                 # compute taxonomy
                 try:
-                    sampleoutput = os.path.join(search_output, samplename + '.kraken')
+                    sampleoutput = os.path.join(search_output, samplename + '.kraken.kreport')
                     f1 = open(sampleoutput, 'r')
                 except IOError:
                     cmds += '%skraken2 --db %s %s --output %s --report %s.kreport --threads %s\n' % (
                         split_string_last(args.kk, 'kraken'), args.kkdb, sample, sampleoutput, sampleoutput, args.t)
+                # compute average genome size
+                if args.kkdbtype != '16S':
+                    try:
+                        sampleoutput = os.path.join(search_output, samplename + '.AGS.txt')
+                        f1 = open(sampleoutput, 'r')
+                    except IOError:
+                        cmds += '%run_microbe_census.py -t %s %s %s \n' % (
+                            split_string_last(args.mc, 'run_microbe_census'),args.t, sample, sampleoutput)
                 sample = os.path.join(search_output, samplename + '.diamond.txt.aa')
             # run blast
             sampleoutput = os.path.join(search_output, samplename + '.blast.txt')
@@ -297,7 +314,7 @@ def ranking_arg(inputfasta,Metadata):
         # search ARGs in all samples
         cmds = searchARG(allsamples)
         # output summary table
-        cmds += 'python %s/bin/ARG_table.sum.py -i %s -d %s\n' % (workingdir,search_output,ARGdatabase_mapping)
+        cmds += 'python %s/bin/ARG_table.sum.py -i %s -d %s -kkdbtype %s\n' % (workingdir,search_output,ARGdatabase_mapping,args.kkdbtype)
         # risk ranking of ARGs
         cmds += 'arg_ranker -i %s/Sample_ARGpresence.txt -m %s -o %s\n'%(output_dir,Metadata,output_dir)
         f1 = open('%s/arg_ranker.sh'%(scripts_output),'w')
diff --git a/arg_ranker/bin/ARG_table.sum.py b/arg_ranker/bin/ARG_table.sum.py
index 204bdae..1654163 100755
--- a/arg_ranker/bin/ARG_table.sum.py
+++ b/arg_ranker/bin/ARG_table.sum.py
@@ -10,6 +10,11 @@
 parser.add_argument("-d",
                     help="database mapping file",
                      type=str, default='SARG.structure.txt ',metavar='SARG.structure.txt')
+optional.add_argument('-kkdbtype',
+                          help="Optional: type of kraken2 database (default = standard)",
+                          choice = ['standard','16S'],
+                          metavar="standard or 16S",
+                          action='store', default='standard', type=str)
 
 ################################################## Definition ########################################################
 args = parser.parse_args()
@@ -41,6 +46,7 @@ def sum_ARG(allsearchoutput):
         # load kraken results
         kraken = glob.glob(searchoutput.replace('.blast.txt.filter','.kraken.kreport'))
         copy_16S = 1
+        gene_length = 1550 # 16S
         if kraken!= []:
             # metagenomes
             for lines in open(kraken[0],'r'):
@@ -48,6 +54,13 @@ def sum_ARG(allsearchoutput):
                 if 'Bacteria' in lines:
                     copy_16S = float(lines_set[1])*2 # pair end
                     break
+        if args.kkdbtype != '16S':
+            # load average genome size
+            AGS_result = glob.glob(searchoutput.replace('.blast.txt.filter', '.AGS.txt'))[0]
+            for lines in open(AGS_result,'r'):
+                if lines.startswith('average_genome_size'):
+                    lines_set = lines.split('\n')[0].split('\t')
+                    gene_length = float(lines_set[1])
         # load ARG blast results
         samplename = os.path.split(searchoutput)[-1].split('.blast.txt.filter')[0]
         allsamplename.append(samplename)
@@ -58,7 +71,7 @@ def sum_ARG(allsearchoutput):
             if copy_16S == 1: # genomes
                 sampleARG[ARG] += 1.0 / copy_16S
             else: # metagenomes
-                sampleARG[ARG] += 1.0/copy_16S/ARG_length[ARG]*1550 # normalize against ARG length and 16S length
+                sampleARG[ARG] += 1.0 / copy_16S / ARG_length[ARG] * gene_length  # normalize against ARG length and 16S/genome length
         allsampleARG.append(sampleARG)
     # sum ARG blast results
     alloutput = []