adding first pass tools (#5)

whitead · Oct 6, 2023 · 640b57a · 640b57a
1 parent de237d4
commit 640b57a
Show file tree

Hide file tree

Showing 3 changed files with 112 additions and 0 deletions.
diff --git a/sanclone/tools/load_insert.py b/sanclone/tools/load_insert.py
@@ -0,0 +1,47 @@
+from Bio import Entrez, SeqIO
+import json
+
+
+from langchain.tools import BaseTool
+
+from ..state import State
+
+
+class ParseGeneTool(BaseTool):
+    name = "parse_genes"
+    description = "a tool that parses in the virus prompt"
+    shared_state: State
+
+    def _run(self, query: str) -> str:
+        # Assume query is a json object of the form {"gene_name": "gene_name", "organism": "organism"}
+        qson = json.loads(query)
+        gene_name = qson['gene_name']
+        organism = qson['organism']
+        seq_record = fetch_sequence(gene_name, organism)
+        if seq_record is not None:
+            self.shared_state.linear_insert = seq_record
+            return f"Sequence {seq_record.description} is loaded. "
+        else:
+            return "Could not find Sequence"
+
+def fetch_sequence(gene_name, organism):
+    Entrez.email = "your.email@example.com"  # Always tell NCBI who you are
+    search_term = f"{gene_name}[Gene Name] AND {organism}[Organism] AND mRNA[Filter]"
+
+    # Search for the gene's mRNA ID
+    handle = Entrez.esearch(db="nucleotide", term=search_term, retmax=1)
+    record = Entrez.read(handle)
+    handle.close()
+
+    if not record["IdList"]:
+        # print("No sequence found!")
+        return None
+
+    gene_id = record["IdList"][0]
+
+    # Fetch the sequence based on the ID
+    handle = Entrez.efetch(db="nucleotide", id=gene_id, rettype="fasta", retmode="text")
+    seq_record = SeqIO.read(handle, "fasta")
+    handle.close()
+
+    return seq_record
diff --git a/sanclone/tools/load_virus.py b/sanclone/tools/load_virus.py
@@ -0,0 +1,63 @@
+from langchain.tools import BaseTool
+from Bio import Entrez
+from Bio import SeqIO
+import os
+
+from ..state import State
+from sanclone.tools import settings
+
+
+class ParseVirusTool(BaseTool):
+    name = "parse_virus"
+    description = "a tool that parses in the virus prompt"
+    shared_state: State
+
+    def _run(self, query: str) -> str:
+        # Assume vector is vector name ParseVirusTool()._run('pET-16b') -> seq Record
+        genbank_filename = get_vector_data(query, settings.OUTPUT_FOLDER)
+        seqObj = list(SeqIO.parse(open(genbank_filename,"r"), "genbank"))[0]
+        if seqObj is not None:
+            self.shared_state.vector = seqObj
+            return f"Vector {seqObj.description} is loaded. "
+        else:
+            return "Could not find Vector"
+
+
+def get_vector_data(vector_name):
+    # Set your email address for NCBI Entrez. This is required.
+    output_folder = settings.OUTPUT_FOLDER
+    Entrez.email = settings.email
+
+    # Define the search query using the vector_name input
+    search_query = vector_name
+
+    # Use Entrez to search for GenBank records
+    search_handle = Entrez.esearch(db="nucleotide", term=search_query)
+    search_results = Entrez.read(search_handle)
+    search_handle.close()
+
+    # Check if any results were found
+    if "IdList" not in search_results or not search_results["IdList"]:
+        print(f"No GenBank records found for {vector_name}")
+        return
+
+    # Extract the first GenBank ID from the search results
+    genbank_id = search_results["IdList"][0]
+
+    # Download the GenBank record and save it to a file
+    fetch_handle = Entrez.efetch(db="nucleotide", id=genbank_id, rettype="gb", retmode="text")
+    genbank_record = SeqIO.read(fetch_handle, "genbank")
+    fetch_handle.close()
+
+    # Save the GenBank record to a file
+    if not os.path.exists(output_folder):
+        os.makedirs(output_folder)
+    filename = os.path.join(output_folder, f"{genbank_record.id}.gbk")
+    SeqIO.write(genbank_record, filename, "genbank")
+
+    #print(f"Downloaded GenBank file for {vector_name} to {filename}")
+    return filename
+
+
+def get_genbank_from_soup(query):
+    return None
diff --git a/sanclone/tools/settings.py b/sanclone/tools/settings.py
@@ -0,0 +1,2 @@
+email = 'youremail@example.com'
+OUTPUT_FOLDER = './'