chmccarthy · hyphaltip · Sep 21, 2020
diff --git a/Pangloss.py b/Pangloss.py
@@ -86,7 +86,7 @@
 import sys
 import multiprocessing as mp
 from Bio.Data.CodonTable import TranslationError
-from ConfigParser import SafeConfigParser
+from configparser import ConfigParser as SafeConfigParser  # SafeConfigParser alias was removed in Python 3.12
 from datetime import datetime
 from argparse import ArgumentParser
 from glob import glob
@@ -105,7 +105,7 @@ def PanGuessHandler(ex_path, gm_path, tp_path, tl_path,
         gm_path      = GeneMark-ES path.
         tp_path      = TransDecoder.Predict path.
         tl_path      = TransDecoder.LongOrfs path.
-    
+
     Arguments taken from Gene_model_prediction section of config file as follows:
         genomelist   = List of strain genomes specified by genomes_list.
         workdir      = Working directory for prediction given by prediction_dir.
@@ -127,7 +127,7 @@ def PanGuessHandler(ex_path, gm_path, tp_path, tl_path,
     # Generate list of genomes from user-provided genome list file.
     logging.info("Master: Parsing genome list.")
     genomes = [line.strip("\n") for line in open(genomelist)]
-    
+
     # Create working directory if not present.
     logging.info("Master: Building working directory for gene model prediction.")
     PanGuess.MakeWorkingDir(workdir)
@@ -136,7 +136,7 @@ def PanGuessHandler(ex_path, gm_path, tp_path, tl_path,
     if not skip:
         logging.info("Master: Building working directory for gene model prediction.")
         PanGuess.BuildRefSet(workdir, ref)
-    
+
     # Loop over each genome and carry out gene model prediction.
     for genome in genomes:
         # Make tag from genome name (assuming genome name is in the format STRAIN.fna).
@@ -150,27 +150,27 @@ def PanGuessHandler(ex_path, gm_path, tp_path, tl_path,
             # Run prediction using Exonerate.
             cmds = PanGuess.BuildExonerateCmds(workdir, ex_path, genome)
             exonerate_genes = PanGuess.RunExonerate(cmds, cores)
-        
+
             # Order gene models predicted via Exonerate by Contig ID: Location.
             logging.info("Master: Sorting gene model predictions by genomic location.")
             exonerate_genes.sort(key=lambda x: (x.contig_id, x.locs[0]))
-        
+
             # Extract genomic attributes from Exonerate gene model set.
             exonerate_attributes = PanGuess.GetExonerateAttributes(exonerate_genes, tag)
 
         else:
             logging.info("Master: Skipping gene model prediction via Exonerate (--no_exonerate enabled).")
             exonerate_genes = None
             exonerate_attributes = None
-        
+
         # Run prediction using GeneMark-ES.
         logging.info("Master: Running gene model prediction for {0} using GeneMark-ES.".format(genome))
         genemark_gtf = PanGuess.RunGeneMark(genome, gm_path, gm_branch, cores)
-        
+
         # Convert GeneMark-ES GTF file into a more PanOCT-compatible version.
         logging.info("Master: Converting GeneMark GTF data to attribute data.")
         genemark_attributes = PanGuess.GeneMarkGTFConverter(genemark_gtf, tag)
-        
+
         # Merge unique gene model calls between Exonerate and GeneMark-ES.
         if not skip:
             logging.info("Master: Merging Exonerate and GeneMark-ES gene calls.")
@@ -182,31 +182,31 @@ def PanGuessHandler(ex_path, gm_path, tp_path, tl_path,
         # Clean up GeneMark-ES files and folders.
         logging.info("Master: Tidying up GeneMark-ES temporary files.")
         PanGuess.MoveGeneMarkFiles(workdir, genome)
-        
+
         # Extract NCRs into list.
         logging.info("Master: Extracting non-coding regions of {0} for TransDecoder analysis.".format(genome))
         noncoding = PanGuess.ExtractNCR(merged_attributes, genome)
-        
+
         # Run TransDecoder on NCRs.
         logging.info("Master: Running TransDecoder on non-coding regions of {0}.".format(genome))
         tdir = PanGuess.RunTransDecoder(noncoding, tp_path, tl_path, workdir, genome, td_len)
-        
+
         # Move TransDecoder files.
         logging.info("Master: Tidying up TransDecoder temporary files.")
         PanGuess.MoveTransDecoderFiles(tdir)
-        
+
         # Extract TransDecoder attributes.
         logging.info("Master: Converting TransDecoder GTF data to attribute data.")
         trans_attributes = PanGuess.TransDecoderGTFToAttributes(tdir, tag)
 
         # Merge TransDecoder calls into the Exonerate + GeneMark-ES set.
         logging.info("Master: Merging all remmaining gene calls for {0}.".format(genome))
         full_attributes = PanGuess.MergeAttributes(merged_attributes, trans_attributes)
-        
+
         # Write out gene set, protein set and attributes set.
         logging.info("Master: Writing out datasets for {0}.".format(genome))
         PanGuess.ConstructGeneModelSets(full_attributes, exonerate_genes, workdir, genome, tag)
-        
+
         # Compress temporary folders and finish up.
         #logging.info("Master: Compressing temporary folders for {0}.".format(genome))
         #PanGuess.TarballGenePredictionDirs(workdir, genome)
@@ -331,7 +331,7 @@ def PAMLHandler(ml_path, yn_path, refine=False):
         try:
             trans_seqs = PAML.TranslateCDS(cluster)
         except TranslationError as e:
-            print "{0}, {1} has unusual frameshift mutation and can't be run through yn00.".format(e, cluster)
+            print("{0}, {1} has unusual frameshift mutation and can't be run through yn00.".format(e, cluster))
             trans_seqs = None
         if trans_seqs:
             prot_alignment = PAML.MUSCLEAlign(ml_path, trans_seqs)
@@ -525,9 +525,9 @@ def main():
     if ap.pred or ap.pred_only:
         in_date = CheckGeneMarkLicence(start_time)
         if not in_date:
-            print "Your 400-day GeneMark-ES license is out of date and hence PanGloss can't predict genes." \
+            print("Your 400-day GeneMark-ES license is out of date and hence PanGloss can't predict genes. " \
                   "Go to http://exon.gatech.edu/GeneMark/gmes_instructions.html to download a new license key," \
-                  "and place it in your home folder under the name .gm_key. Exiting out of Pangloss."
+                  " and place it in your home folder under the name .gm_key. Exiting out of Pangloss.")
             exit(0)
         panguess_args = [ex_path, gm_path, tp_path, tl_path]
         logging.info("Master: Performing gene prediction steps using PanGuess.")
@@ -597,8 +597,8 @@ def main():
     # If enabled, run InterProScan analysis on entire dataset.
     if ap.ips:
         if not sys.platform.startswith("linux"):
-            print "InterProScan is not supported on non-Linux operating systems. Cannot run InterProScan analysis."
-            print "See https://github.com/ebi-pf-team/interproscan/wiki for more information."
+            print("InterProScan is not supported on non-Linux operating systems. Cannot run InterProScan analysis.")
+            print("See https://github.com/ebi-pf-team/interproscan/wiki for more information.")
             pass
         else:
             IPSHandler(ip_path)

diff --git a/Pangloss/BLASTAll.py b/Pangloss/BLASTAll.py
@@ -3,14 +3,14 @@
 BLASTAll: Module for handling parallelized all-vs.-all BLASTp searches, if enabled by user.
 """
 
-import cStringIO
+import io
 import logging
 import multiprocessing as mp
 import subprocess as sp
 
 from Bio import SeqIO, SearchIO
 
-from Tools import StringBLAST
+from .Tools import StringBLAST
 
 
 def BLASTAll(cores=None):
@@ -52,7 +52,7 @@ def MergeBLASTsAndWrite(results):
     # Filter last two lines of each BLASTp result and join remaining lines together, making one big SearchIO object.
     logging.info("BLASTAll: Merging all-vs.-all results together and parsing into tabular format.")
     merged = "\n".join((["\n".join(result.split("\n")[:-2]) for result in results if result]))
-    parsed = SearchIO.parse(cStringIO.StringIO(merged), "blast-tab", comments=True)
+    parsed = SearchIO.parse(io.StringIO(merged), "blast-tab", comments=True)
 
     # Write merged BLASTp results to file for PanOCT.
     logging.info("BLASTAll: Writing BLASTp results to file panoct.blast.")

diff --git a/Pangloss/BUSCO.py b/Pangloss/BUSCO.py
@@ -2,7 +2,7 @@
 import shutil
 import subprocess as sp
 
-from Tools import TryMkDirs
+from .Tools import TryMkDirs
 
 
 def RunBUSCO(buscopath, lineagepath, gene_sets):
@@ -17,6 +17,6 @@ def RunBUSCO(buscopath, lineagepath, gene_sets):
     for gene_set in gene_sets:
         wd = gene_set.split("/")[-1]
         cmd = [buscopath, "-i", gene_set,  "-l", lineagepath, "-o", "{0}.busco".format(wd), "-m", "prot"]
-        print "Running BUSCO"
+        print("Running BUSCO")
         sp.call(cmd)
         shutil.move("run_{0}.busco".format(wd), bdir)
diff --git a/Pangloss/ExonerateGene.py b/Pangloss/ExonerateGene.py
@@ -60,7 +60,7 @@ def __init__(self, string):
                                 prot.append(seq)
                             if "*" in record.seq[:-1]:
                                 stop = True
-                    cds_region = filter(lambda x: len(x) == 3, fragment.aln_annotation["hit_annotation"])
+                    cds_region = [x for x in fragment.aln_annotation["hit_annotation"] if len(x) == 3]
                     nucl.append(str("".join(cds_region)))
 
             # Populate attributes.

diff --git a/Pangloss/GO.py b/Pangloss/GO.py
@@ -7,7 +7,7 @@
 import subprocess as sp
 from csv import reader
 
-from Tools import Flatten, ParseMatchtable, TryMkDirs
+from .Tools import Flatten, ParseMatchtable, TryMkDirs
 
 
 def MakeWorkingDirs():
@@ -62,8 +62,8 @@ def GeneratePopulations(annos, matchtable):
     Write out background (full) population and study (core, accessory) population files for use in GOATools.
     """
     core, acc = ParseMatchtable(matchtable)
-    c_pop = [val for val in Flatten(core.values()) if val in annos]
-    a_pop = [val for val in Flatten(acc.values()) if val in annos]
+    c_pop = [val for val in Flatten(list(core.values())) if val in annos]
+    a_pop = [val for val in Flatten(list(acc.values())) if val in annos]
     full_pop = c_pop + a_pop
     with open("go/core_pop.txt", "w") as cp_file, open("go/acc_pop.txt", "w") as ap_file,\
          open("go/full_pop.txt", "w") as fp_file:

diff --git a/Pangloss/Karyotype.py b/Pangloss/Karyotype.py
@@ -16,7 +16,7 @@
 
 from Bio import SeqIO
 
-from Tools import Flatten, ParseMatchtable, ParseKaryotypes, TryMkDirs
+from .Tools import Flatten, ParseMatchtable, ParseKaryotypes, TryMkDirs
 
 
 def GenerateContigLengths(genomes):
@@ -48,19 +48,19 @@ def GenerateKaryotypeFiles(attributes, matchtable):
 
     for row in attread:
         karyo = [row[0], row[1], row[2], row[3]]
-        core_gms = Flatten(core.values())
-        acc_gms = Flatten(acc.values())
-        total = len(core.values()[0])
+        core_gms = Flatten(list(core.values()))
+        acc_gms = Flatten(list(acc.values()))
+        total = len(list(core.values())[0])
         if row[1] in core_gms:
-            number = core_gms.index(row[1]) / total
-            cluster = core.values()[number]
-            ortho = len(filter(lambda x: x is not None, cluster))
+            number = core_gms.index(row[1]) // total  # floor division: "/" yields a float on Python 3 and cannot index a list
+            cluster = list(core.values())[number]
+            ortho = len([x for x in cluster if x is not None])
             karyo = karyo + ["core", row[5], str(ortho)]
             karyotype.append(karyo)
         elif row[1] in acc_gms:
-            number = acc_gms.index(row[1]) / total
-            cluster = acc.values()[number]
-            ortho = len(filter(lambda x: x is not None, cluster))
+            number = acc_gms.index(row[1]) // total  # floor division: "/" yields a float on Python 3 and cannot index a list
+            cluster = list(acc.values())[number]
+            ortho = len([x for x in cluster if x is not None])
             karyo = karyo + ["acc", row[5], str(ortho)]
             karyotype.append(karyo)
         else:

diff --git a/Pangloss/PAML.py b/Pangloss/PAML.py
@@ -3,15 +3,15 @@
 PAML: Module for handling yn00 selection analysis (and maybe CodeML in the future), if enabled by user.
 """
 
-import cStringIO
+import io
 import os
 
 from Bio import AlignIO, SeqIO
 from Bio.Phylo.PAML import yn00
 from Bio.Phylo.PAML._paml import PamlError
 from glob import glob
 
-from Tools import StringMUSCLE, Untranslate
+from .Tools import StringMUSCLE, Untranslate
 
 
 def TranslateCDS(seqs):
@@ -32,7 +32,7 @@ def MUSCLEAlign(ml_path, seqs):
     Align translated nucleotides in StringMUSCLE, return parsed alignment.
     """
     output = StringMUSCLE(ml_path, seqs)
-    return AlignIO.parse(cStringIO.StringIO(output), "fasta")
+    return AlignIO.parse(io.StringIO(output), "fasta")
 
 
 def PutGaps(alignment, cluster):
@@ -53,7 +53,7 @@ def PutGaps(alignment, cluster):
             unseq.id = seq.id.split("|")[0]
             nucl_aln += (">{0}\n{1}\n".format(unseq.id, unseq.seq))
 
-    fas_aln = AlignIO.read(cStringIO.StringIO(nucl_aln), "fasta")
+    fas_aln = AlignIO.read(io.StringIO(nucl_aln), "fasta")
     AlignIO.write(fas_aln, "{0}.aln".format(cluster), "phylip-sequential")
 
     return "{0}.aln".format(cluster)
@@ -68,7 +68,7 @@ def RunYn00(yn_path, alignment):
     try:
         yn.run(ctl_file=None, command=yn_path, parse=False)
     except PamlError as e:
-        print "{0}, {1} may have internal stop codons.".format(e, alignment)
+        print("{0}, {1} may have internal stop codons.".format(e, alignment))
         pass
 
 

diff --git a/Pangloss/PanGuess.py b/Pangloss/PanGuess.py
@@ -45,7 +45,7 @@
 Maynooth University in 2017-2019 (Charley.McCarthy@nuim.ie).
 """
 
-from __future__ import division
+
 
 import logging
 import multiprocessing as mp
@@ -62,7 +62,7 @@
 from Bio.Seq import Seq
 from Bio.SeqRecord import SeqRecord
 
-from Tools import ExonerateCmdLine, LocationOverlap, Pairwise, TryMkDirs  # get_gene_lengths
+from .Tools import ExonerateCmdLine, LocationOverlap, Pairwise, TryMkDirs  # get_gene_lengths
 
 
 def LengthOverlap(gene, ref_lengths):
@@ -288,7 +288,7 @@ def ExtractNCR(attributes, genome):
 
     # Loop over every contig/chromosome in the genome.
     for seq in db:
-        coding = filter(lambda x: x[0] == seq.id, attributes)
+        coding = [x for x in attributes if x[0] == seq.id]
         for gene, next_gene in Pairwise(coding):
             if coding.index(gene) == 0:
                 if gene[2] != 0:
@@ -377,11 +377,11 @@ def TransDecoderGTFToAttributes(tdir, tag):
             if row:
                 if len(row) == 9:
                     contig_id = re.match(cregex, row[0]).group()[:-5]
-                    global_locs = map(int, row[0].split("_")[-2:])
+                    global_locs = list(map(int, row[0].split("_")[-2:]))
                     if row[2] == "exon":
                         exon_count = exon_count + 1
                     if row[2] == "CDS":
-                        relative_locs = map(int, row[3:5])
+                        relative_locs = list(map(int, row[3:5]))
                         start = global_locs[0] + relative_locs[0] - 1
                         stop = global_locs[0] + relative_locs[1] - 1
                         locs = [start, stop]
@@ -447,7 +447,7 @@ def ConstructGeneModelSets(attributes, exonerate_genes, workdir, genome, tag):
             prot_models.append(prot_seq)
             nucl_models.append(nucl_seq)
         if gene[4].startswith("Exonerate"):
-            match = filter(lambda x: x.id == gene[1], exonerate_genes)
+            match = [x for x in exonerate_genes if x.id == gene[1]]
             prot_seq = SeqRecord(Seq(match[0].prot), id=match[0].id)
             nucl_seq = SeqRecord(Seq(match[0].nucl), id=match[0].id)
             prot_seq.id = "{0}|{1}".format(tag, prot_seq.id)