Skip to content

Commit

Permalink
adding first pass tools (#5)
Browse files Browse the repository at this point in the history
  • Loading branch information
albertqu authored Oct 6, 2023
1 parent de237d4 commit 640b57a
Show file tree
Hide file tree
Showing 3 changed files with 112 additions and 0 deletions.
47 changes: 47 additions & 0 deletions sanclone/tools/load_insert.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
from Bio import Entrez, SeqIO
import json


from langchain.tools import BaseTool

from ..state import State


class ParseGeneTool(BaseTool):
    """Tool that looks up a gene's mRNA sequence and stores it as the insert.

    The looked-up record is written to ``shared_state.linear_insert``.
    """

    name = "parse_genes"
    # The description previously duplicated the virus tool's text; it now
    # states what this tool does and the input format `_run` expects.
    description = (
        "a tool that loads a gene insert sequence; input is a JSON string of "
        'the form {"gene_name": "gene_name", "organism": "organism"}'
    )
    shared_state: State

    def _run(self, query: str) -> str:
        """Load the sequence described by ``query`` into the shared state.

        Args:
            query: JSON object string of the form
                ``{"gene_name": "gene_name", "organism": "organism"}``.

        Returns:
            A status message: success with the record description, a
            "not found" message, or an "invalid query" message when the
            input is not a JSON object carrying both keys.
        """
        try:
            qson = json.loads(query)
            gene_name = qson["gene_name"]
            organism = qson["organism"]
        except (json.JSONDecodeError, KeyError, TypeError):
            # Report malformed input as a tool message, like the
            # "not found" path, rather than raising out of the tool.
            return (
                "Invalid query: expected a JSON object of the form "
                '{"gene_name": "gene_name", "organism": "organism"}'
            )

        seq_record = fetch_sequence(gene_name, organism)
        if seq_record is None:
            return "Could not find Sequence"

        self.shared_state.linear_insert = seq_record
        return f"Sequence {seq_record.description} is loaded. "

def fetch_sequence(gene_name, organism):
    """Fetch the first nucleotide mRNA record matching a gene and organism.

    Runs an Entrez ``esearch`` against the ``nucleotide`` database limited to
    one hit, then an ``efetch`` of that hit in FASTA format.

    Args:
        gene_name: Value matched against the ``[Gene Name]`` search field.
        organism: Value matched against the ``[Organism]`` search field.

    Returns:
        The record parsed by ``SeqIO.read`` from the FASTA response, or
        ``None`` when the search returns no IDs.
    """
    # NOTE(review): placeholder address — presumably should be a real contact
    # email; confirm before deploying.
    Entrez.email = "your.email@example.com"  # Always tell NCBI who you are
    search_term = f"{gene_name}[Gene Name] AND {organism}[Organism] AND mRNA[Filter]"

    # Search for the gene's mRNA ID; close the handle even if parsing fails.
    handle = Entrez.esearch(db="nucleotide", term=search_term, retmax=1)
    try:
        record = Entrez.read(handle)
    finally:
        handle.close()

    if not record["IdList"]:
        return None

    gene_id = record["IdList"][0]

    # Fetch the sequence based on the ID; close the handle even if parsing fails.
    handle = Entrez.efetch(db="nucleotide", id=gene_id, rettype="fasta", retmode="text")
    try:
        seq_record = SeqIO.read(handle, "fasta")
    finally:
        handle.close()

    return seq_record
63 changes: 63 additions & 0 deletions sanclone/tools/load_virus.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
from langchain.tools import BaseTool
from Bio import Entrez
from Bio import SeqIO
import os

from ..state import State
from sanclone.tools import settings


class ParseVirusTool(BaseTool):
    """Tool that downloads a vector's GenBank record and stores it as the vector.

    The parsed record is written to ``shared_state.vector``.
    """

    name = "parse_virus"
    description = "a tool that parses in the virus prompt"
    shared_state: State

    def _run(self, query: str) -> str:
        """Load the vector named by ``query`` into the shared state.

        Args:
            query: Vector name used as the search term, e.g. ``'pET-16b'``.

        Returns:
            A status message: success with the record description, or
            "Could not find Vector" when no file was downloaded or the file
            holds no GenBank record.
        """
        # get_vector_data accepts the vector name alone and reads the output
        # folder from settings itself; passing a second argument raised
        # TypeError against its one-parameter signature.
        genbank_filename = get_vector_data(query)
        if genbank_filename is None:
            return "Could not find Vector"

        # Take only the first record, and close the file once it is read.
        with open(genbank_filename, "r") as handle:
            seqObj = next(SeqIO.parse(handle, "genbank"), None)
        if seqObj is None:
            return "Could not find Vector"

        self.shared_state.vector = seqObj
        return f"Vector {seqObj.description} is loaded. "


def get_vector_data(vector_name, output_folder=None):
    """Download the first GenBank record matching a vector name and save it.

    Runs an Entrez ``esearch`` against the ``nucleotide`` database, fetches
    the first hit in GenBank format, and writes it to
    ``<output_folder>/<record id>.gbk``.

    Args:
        vector_name: Search term passed to Entrez, e.g. ``'pET-16b'``.
        output_folder: Folder to write the file into. Defaults to
            ``settings.OUTPUT_FOLDER`` when omitted or ``None``.

    Returns:
        Path of the written GenBank file, or ``None`` when the search
        returns no IDs.
    """
    if output_folder is None:
        output_folder = settings.OUTPUT_FOLDER
    # Set your email address for NCBI Entrez. This is required.
    Entrez.email = settings.email

    # Use Entrez to search for GenBank records; close the handle even if
    # parsing fails.
    search_handle = Entrez.esearch(db="nucleotide", term=vector_name)
    try:
        search_results = Entrez.read(search_handle)
    finally:
        search_handle.close()

    # Check if any results were found
    if "IdList" not in search_results or not search_results["IdList"]:
        print(f"No GenBank records found for {vector_name}")
        return None

    # Extract the first GenBank ID from the search results
    genbank_id = search_results["IdList"][0]

    # Download the GenBank record; close the handle even if parsing fails.
    fetch_handle = Entrez.efetch(db="nucleotide", id=genbank_id, rettype="gb", retmode="text")
    try:
        genbank_record = SeqIO.read(fetch_handle, "genbank")
    finally:
        fetch_handle.close()

    # Save the GenBank record to a file; exist_ok avoids a check-then-create race.
    os.makedirs(output_folder, exist_ok=True)
    filename = os.path.join(output_folder, f"{genbank_record.id}.gbk")
    SeqIO.write(genbank_record, filename, "genbank")

    return filename


def get_genbank_from_soup(query):
    """Stub: ignores ``query`` and always returns ``None``."""
    return None
2 changes: 2 additions & 0 deletions sanclone/tools/settings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Contact address assigned to Entrez.email by load_virus.get_vector_data.
# NOTE(review): looks like a placeholder value — confirm a real address is
# configured before use.
email = 'youremail@example.com'
# Folder into which load_virus.get_vector_data writes downloaded .gbk files.
OUTPUT_FOLDER = './'

0 comments on commit 640b57a

Please sign in to comment.