From fe3cb62ebf64dd0a03cd5242fd9f3279e8ed6e44 Mon Sep 17 00:00:00 2001 From: caozhichongchong Date: Sun, 30 May 2021 20:52:21 -0400 Subject: [PATCH] update average genome size computing in metagenomes --- .DS_Store | Bin 8196 -> 10244 bytes README.md | 32 +++++++++++++++++++++----------- arg_ranker/__main__.py | 21 +++++++++++++++++++-- arg_ranker/bin/ARG_table.sum.py | 15 ++++++++++++++- 4 files changed, 54 insertions(+), 14 deletions(-) diff --git a/.DS_Store b/.DS_Store index 8372e22e914816dd4bd31d92d44f3196fe3c5717..77d9bdee938bdc32d29f59b515040dc338bfd134 100644 GIT binary patch delta 601 zcmZ8fF>ljA6n@Vk;NYabG;W$!kqELt3{a&-KoA2_096&Bn6|1?(Nfv*C0KQQuI)6m zOBl+?5P4x?Lqd#EB$oaJegO*$0}DuOFmjGPpq}*fKE3zd_dUJ4llqBP0sw6E&1HZJ zha!(ClkDiAI9j@l(pWo{Q3|u9bai!ItM>iC)oz8SWH7Yahpu5WgFC%3T-)-4YV{imsdMS`7sSj(DVv*-NnW0n3k7A4%$FAS zNWswC`(2Oq9rFRRnb)xUbdSjXOEy?*Ge`62p+yI+fW8VgtYhAqJ#%55nlF;lkc(^{ zxtye&A+36pI?ubFWi~qiyq*ar|V|`n4DlP-DOsPcXCT=sl{|wYa9% zA16%_uE0IM<}KKVV;I4E_ynKf3w(v|@Du(bqJR?Su!!@xfQxtu6MX?A1%=0AB9$au zjvjQfl7EuMnUD}ggd4B|OHiAz;yWC_{*}(BVs1s^`BC&E#s_~=pnJ97bm5 zItpe+#*=>vly8<4_F>&TQKFMkkQt~42qd_Hgeyq<#=`H+llfHwIT#^^GE9!=nY#Ij LSR*4)LmMLixj-4P diff --git a/README.md b/README.md index 7c85128..fba0ec2 100644 --- a/README.md +++ b/README.md @@ -6,15 +6,22 @@ arg_ranker evaluates the risk of ARGs in genomes and metagenomes ## Requirement * python 3 -* kraken2: `conda install -c bioconda kraken2`\ -download kraken2 database: `kraken2-build --standard --db $KRAKENDB` \ -where $krakenDB is your preferred database name/location\ -* diamond: `conda install -c bioconda diamond`\ +* diamond: `conda install -c bioconda diamond` * blast+: `conda install -c bioconda blast` +* For metagenomes: + * kraken2: `conda install -c bioconda kraken2` + * to compute the abundance of ARGs as copy number of ARGs per bacterial cell (recommended) + * download the kraken2 standard database (50 GB of disk space): `kraken2-build --standard --db $KRAKENDB` \ + where $KRAKENDB is your preferred database name/location + * MicrobeCensu: `git clone https://github.com/snayfach/MicrobeCensus && cd MicrobeCensus && python setup.py install` to estimate the average genome size for metagenomes. + * to compute the abundance of ARGs as copy number of ARGs per 16S + * download the kraken2 16S database (73.2 MB of disk space): `kraken2-build --db $DBNAME --special greengenes` ## How to use it * put all your genomes (.fa or .fasta) and metagenomes (.fq or .fastq) into one folder ($INPUT) -* run `arg_ranker -i $INPUT --kkdb $KRAKENDB` +* run `arg_ranker -i $INPUT` (genomes only) +* run `arg_ranker -i $INPUT -kkdb $KRAKENDB` (genomes/metagenomes + kraken2 standard database) + * or run `arg_ranker -i $INPUT -kkdb $KRAKENDB -kkdbtype 16S` (kraken2 16S database) * run `sh arg_ranking/script_output/arg_ranker.sh` ## Output @@ -25,11 +32,14 @@ where $krakenDB is your preferred database name/location\ |WEE300_all-trimmed-decont_1.fastq|4.6E-02|0.0E+00|6.8E-02|7.5E-01|1.3E-01|5.4E-04|1.0-0.0-0.5-1.7-0.3|1.0|0.0|0.5|1.7|0.3|hospital_metagenome| |EsCo_genome.fasta|0.0E+00|0.0E+00|0.0E+00|1.0E+00|0.0E+00|2.0E+00|0.0-0.0-0.0-2.2-0.0|0.0|0.0|0.0|2.2|0.0|E.coli_genome| -1. We compute the abundance of ARGs as the copy number of ARGs divided by the 16S copy number in a sample\ -For metagenomes, the copy number of ARGs and 16S were computed as the number of reads mapped to them divided by their gene length.\ -Rank_I_per - Unassessed_per: percentage of ARGs of a risk rank\ +1. Rank_I_per - Unassessed_per: percentage of ARGs of a risk rank\ Total_abu: total abundance of all ARGs -2. We compute the risk of ARGs as the average abundance of ARGs of a risk rank divided the average abundance of all ARGs\ +2. For genomes, we output the copy number of ARGs detected in each genome. +3. For metagenomes, we compute the abundance of ARGs as the copy number of ARGs divided by the bacterial cell number or 16S copy number in the same metagenome.\ +If you downloaded the kraken2 standard database, we compute the copy number of ARGs divided by the bacterial cell number.\ +If you downloaded the kraken2 16S database, we compute the copy number of ARGs divided by the 16S copy number.\ +The copy number of ARGs, 16S, and bacterial cells were computed as the number of reads mapped to them divided by their gene/genome length. +4. We compute the risk of ARGs as the average abundance of ARGs of a risk rank divided by the average abundance of all ARGs\ Rank_I_risk - Unassessed_risk: the risk of ARGs of a risk rank\ Rank_code: a code of ARG risk from Rank I to Unassessed @@ -37,9 +47,9 @@ Rank_code: a code of ARG risk from Rank I to Unassessed The abundance, the gene family, and the antibiotic of resistance of ARGs detected in the input samples ## Test -run `arg_ranker -i example --kkdb $KRAKENDB`\ +run `arg_ranker -i example -kkdb $KRAKENDB`\ run `sh arg_ranking/script_output/arg_ranker.sh`\ -The arg_ranking/Sample_ranking_results.txt should look like Table 1 +The arg_ranking/Sample_ranking_results.txt should look like Table 1 (using kraken2 standard database) ## Metadata for your samples (optional) arg_ranker can merge your sample metadata into the results of ARG ranking (i.e. note1 in Table 1).\ diff --git a/arg_ranker/__main__.py b/arg_ranker/__main__.py index 1c09ba6..47fe266 100644 --- a/arg_ranker/__main__.py +++ b/arg_ranker/__main__.py @@ -54,6 +54,10 @@ def main(): help="Optional: complete path to blast if not in PATH", metavar="/usr/local/bin/blast", action='store', default='blast', type=str) + optional.add_argument('-mc', + help="Optional: complete path to MicrobeCensus if not in PATH", + metavar="/usr/local/bin/run_microbe_census.py", + action='store', default='run_microbe_census.py', type=str) optional.add_argument('-kk', help="Optional: complete path to kraken2 if not in PATH", metavar="/usr/local/bin/kraken2", @@ -62,6 +66,11 @@ def main(): help="Optional: complete path to kraken2 database if not in PATH", metavar="/usr/local/bin/kraken2/krakendb", action='store', default='krakendb', type=str) + optional.add_argument('-kkdbtype', + help="Optional: type of kraken2 database (default = standard)", + choice = ['standard','16S'], + metavar="standard or 16S", + action='store', default='standard', type=str) ################################################## Definition ######################################################## args = parser.parse_args() @@ -191,11 +200,19 @@ def searchARG(allsamples): workingdir, inputfasta, samplename, search_output) # compute taxonomy try: - sampleoutput = os.path.join(search_output, samplename + '.kraken') + sampleoutput = os.path.join(search_output, samplename + '.kraken.kreport') f1 = open(sampleoutput, 'r') except IOError: cmds += '%skraken2 --db %s %s --output %s --report %s.kreport --threads %s\n' % ( split_string_last(args.kk, 'kraken'), args.kkdb, sample, sampleoutput, sampleoutput, args.t) + # compute average genome size + if args.kkdbtype != '16S': + try: + sampleoutput = os.path.join(search_output, samplename + '.AGS.txt') + f1 = open(sampleoutput, 'r') + except IOError: + cmds += '%run_microbe_census.py -t %s %s %s \n' % ( + split_string_last(args.mc, 'run_microbe_census'),args.t, sample, sampleoutput) sample = os.path.join(search_output, samplename + '.diamond.txt.aa') # run blast sampleoutput = os.path.join(search_output, samplename + '.blast.txt') @@ -297,7 +314,7 @@ def ranking_arg(inputfasta,Metadata): # search ARGs in all samples cmds = searchARG(allsamples) # output summary table - cmds += 'python %s/bin/ARG_table.sum.py -i %s -d %s\n' % (workingdir,search_output,ARGdatabase_mapping) + cmds += 'python %s/bin/ARG_table.sum.py -i %s -d %s -kkdbtype %s\n' % (workingdir,search_output,ARGdatabase_mapping,args.kkdbtype) # risk ranking of ARGs cmds += 'arg_ranker -i %s/Sample_ARGpresence.txt -m %s -o %s\n'%(output_dir,Metadata,output_dir) f1 = open('%s/arg_ranker.sh'%(scripts_output),'w') diff --git a/arg_ranker/bin/ARG_table.sum.py b/arg_ranker/bin/ARG_table.sum.py index 204bdae..1654163 100755 --- a/arg_ranker/bin/ARG_table.sum.py +++ b/arg_ranker/bin/ARG_table.sum.py @@ -10,6 +10,11 @@ parser.add_argument("-d", help="database mapping file", type=str, default='SARG.structure.txt ',metavar='SARG.structure.txt') +optional.add_argument('-kkdbtype', + help="Optional: type of kraken2 database (default = standard)", + choice = ['standard','16S'], + metavar="standard or 16S", + action='store', default='standard', type=str) ################################################## Definition ######################################################## args = parser.parse_args() @@ -41,6 +46,7 @@ def sum_ARG(allsearchoutput): # load kraken results kraken = glob.glob(searchoutput.replace('.blast.txt.filter','.kraken.kreport')) copy_16S = 1 + gene_length = 1550 # 16S if kraken!= []: # metagenomes for lines in open(kraken[0],'r'): @@ -48,6 +54,13 @@ def sum_ARG(allsearchoutput): if 'Bacteria' in lines: copy_16S = float(lines_set[1])*2 # pair end break + if args.kkdbtype != '16S': + # load average genome size + AGS_result = glob.glob(searchoutput.replace('.blast.txt.filter', '.AGS.txt'))[0] + for lines in open(AGS_result,'r'): + if lines.startswith('average_genome_size'): + lines_set = lines.split('\n')[0].split('\t') + gene_length = float(lines_set[1]) # load ARG blast results samplename = os.path.split(searchoutput)[-1].split('.blast.txt.filter')[0] allsamplename.append(samplename) @@ -58,7 +71,7 @@ def sum_ARG(allsearchoutput): if copy_16S == 1: # genomes sampleARG[ARG] += 1.0 / copy_16S else: # metagenomes - sampleARG[ARG] += 1.0/copy_16S/ARG_length[ARG]*1550 # normalize against ARG length and 16S length + sampleARG[ARG] += 1.0 / copy_16S / ARG_length[ARG] * gene_length # normalize against ARG length and 16S/genome length allsampleARG.append(sampleARG) # sum ARG blast results alloutput = []