Skip to content

Commit

Permalink
Crawlers (#165)
Browse files Browse the repository at this point in the history
* fix: correct pubchem crawling from smiles with multiple entries

* pin: numpy

* chore: remove Zinc API

* ci: expand tests to python 3.7-3.11

* doc: Update README

* ci: Span matrix

* ci: remove windows tests

* ci: remove trailing zero

* chore: add support for torch 2 and python 3.11

* ci: test 3.7-3.12

* ci: enable 3.12
  • Loading branch information
jannisborn authored Oct 4, 2024
1 parent 78ad7c9 commit 2516af1
Show file tree
Hide file tree
Showing 8 changed files with 39 additions and 134 deletions.
13 changes: 6 additions & 7 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ on:
- 'gh_pages'
jobs:
conda-tests:
name: Test with conda (${{ matrix.os }})
name: Test with conda (${{ matrix.os }}, Python ${{ matrix.python-version }})
runs-on: ${{ matrix.os }}
continue-on-error: ${{ matrix.experimental }}
strategy:
Expand All @@ -17,9 +17,7 @@ jobs:
- os: ubuntu-latest
pip_cache_path: ~/.cache/pip
experimental: false
- os: windows-latest
pip_cache_path: ~\AppData\Local\pip\Cache
experimental: true
python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]
defaults:
run:
shell: bash -l {0} # For conda
Expand All @@ -35,19 +33,20 @@ jobs:
uses: actions/cache@v2
with:
path: ~/conda_pkgs_dir # from: conda-incubator/setup-miniconda@v2
key: ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-${{
key: ${{ runner.os }}-conda-${{ matrix.python-version }}-${{ env.CACHE_NUMBER }}-${{
hashFiles('conda.yml') }}

- name: Cache pip
uses: actions/cache@v2
with:
path: ${{ matrix.pip_cache_path }}
key: ${{ runner.os }}-pip--${{ env.CACHE_NUMBER }}-${{
key: ${{ runner.os }}-pip-${{ matrix.python-version }}-${{ env.CACHE_NUMBER }}-${{
hashFiles('requirements.txt') }}

- name: Conda environment setup
uses: conda-incubator/setup-miniconda@v2
with:
python-version: ${{ matrix.python-version }}
activate-environment: pytoda
environment-file: conda.yml
auto-activate-base: false
Expand All @@ -72,7 +71,7 @@ jobs:
if: always()
with:
status: ${{ job.status }}
text: "CI Build ${{ matrix.os }}"
text: "CI Build ${{ matrix.os }} Python ${{ matrix.python-version}}"
author_name: ${{ github.actor }}
env:
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
Expand Down
45 changes: 9 additions & 36 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,41 +57,14 @@ For some examples on how to use `pytoda` see [here](./examples)
If you use `pytoda` in your projects, please cite the following:

```bib
@article{born2021datadriven,
author = {
Born, Jannis and Manica, Matteo and Cadow, Joris and Markert, Greta and
Mill,Nil Adell and Filipavicius, Modestas and Janakarajan, Nikita and
Cardinale, Antonio and Laino, Teodoro and
{Rodr{\'{i}}guez Mart{\'{i}}nez}, Mar{\'{i}}a
},
doi = {10.1088/2632-2153/abe808},
issn = {2632-2153},
journal = {Machine Learning: Science and Technology},
number = {2},
pages = {025024},
title = {{
Data-driven molecular design for discovery and synthesis of novel ligands:
a case study on SARS-CoV-2
}},
url = {https://iopscience.iop.org/article/10.1088/2632-2153/abe808},
volume = {2},
year = {2021}
}
@article{born2021paccmannrl,
title = {
PaccMann$^{RL}$: De novo generation of hit-like anticancer molecules from
transcriptomic data via reinforcement learning
},
journal = {iScience},
volume = {24},
number = {4},
year = {2021},
issn = {2589-0042},
doi = {https://doi.org/10.1016/j.isci.2021.102269},
url = {https://www.cell.com/iscience/fulltext/S2589-0042(21)00237-6},
author = {
Jannis Born and Matteo Manica and Ali Oskooei and Joris Cadow and Greta Markert
and Mar{\'\i}a Rodr{\'\i}guez Mart{\'\i}nez}
}
@article{born2021data,
title={Data-driven molecular design for discovery and synthesis of novel ligands: a case study on SARS-CoV-2},
author={Born, Jannis and Manica, Matteo and Cadow, Joris and Markert, Greta and Mill, Nil Adell and Filipavicius, Modestas and Janakarajan, Nikita and Cardinale, Antonio and Laino, Teodoro and Martinez, Maria Rodriguez},
journal={Machine Learning: Science and Technology},
volume={2},
number={2},
pages={025024},
year={2021},
publisher={IOP Publishing}
}
```
6 changes: 3 additions & 3 deletions conda.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name: pytoda
dependencies:
- python>=3.8
- pip>=19.1,<20.3
- python>=3.7,<3.13
- pip
- pip:
- -r file:requirements.txt
- -r requirements.txt

2 changes: 1 addition & 1 deletion pytoda/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
name = 'pytoda'
__version__ = '1.1.5'
__version__ = '1.1.6'
62 changes: 2 additions & 60 deletions pytoda/preprocessing/crawlers.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import logging
import urllib
import urllib.request as urllib_request
from itertools import filterfalse
from typing import Iterable, List, Tuple, Union
Expand All @@ -13,66 +12,11 @@

logger = logging.getLogger(__name__)

ZINC_DRUG_SEARCH_ROOT = 'http://zinc.docking.org/substances/search/?q='
ZINC_ID_SEARCH_ROOT = 'http://zinc.docking.org/substances/'

PUBCHEM_START = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound'
PUBCHEM_MID = 'property'
PUBCHEM_END = 'TXT'


def get_smiles_from_zinc(drug: Union[str, int]) -> str:
    """
    Uses the ZINC databases to retrieve the SMILES of a ZINC ID (int) or a drug
    name (str).

    Args:
        drug (Union[str, int]): a string with a drug name or an int of a ZINC
            ID.

    Returns:
        smiles (str): The SMILES string of the drug name or ZINC ID. An empty
            string is returned (and a warning is logged) if the name search
            fails or yields no hit, or if the substance page does not contain
            a SMILES field.

    Raises:
        TypeError: If `drug` is neither a str nor an int.
        HTTPError: If the request for the ZINC substance page itself fails.
    """
    # bool is a subclass of int but was never accepted as a ZINC ID.
    if isinstance(drug, bool) or not isinstance(drug, (str, int)):
        raise TypeError(
            f'Please insert drug of type {{str, int}}, given was {type(drug)}'
            f'({drug}).'
        )

    if isinstance(drug, str):
        # Local import keeps this block self-contained.
        from urllib.parse import quote

        # Percent-encode the name exactly once. Replacing spaces by '%20' and
        # then calling pathname2url would encode the '%' a second time.
        query = quote(unidecode(drug).strip())
        search_url = '{}{}'.format(ZINC_DRUG_SEARCH_ROOT, query)
        try:
            with urllib_request.urlopen(search_url) as response:
                decoded = (raw.decode(encoding='UTF-8').strip() for raw in response)
                zinc_ids = [
                    line.split('/')[-2]
                    for line in decoded
                    if 'href="/substances/ZINC' in line
                ]
        except HTTPError:
            logger.warning(f'Did not find any result for drug: {drug}')
            return ''

        # A successful request may still list no substance at all.
        if not zinc_ids:
            logger.warning(f'Did not find any result for drug: {drug}')
            return ''
        zinc_id = zinc_ids[0]
    else:
        zinc_id = str(drug)

    # Default to '' so a page without the SMILES field cannot leave the
    # return value unbound.
    smiles = ''
    with urllib_request.urlopen(ZINC_ID_SEARCH_ROOT + zinc_id) as id_response:
        for raw in id_response:
            id_line = raw.decode(encoding='UTF-8').strip()
            if 'id="substance-smiles-field" readonly value=' in id_line:
                smiles = id_line.split('"')[-2]

    if not smiles:
        logger.warning(f'Did not find a SMILES entry for ZINC ID: {zinc_id}')
    return smiles


def get_smiles_from_pubchem(
drug: Union[str, int],
query_type: str = 'name',
Expand Down Expand Up @@ -122,15 +66,13 @@ def get_smiles_from_pubchem(
if isinstance(drug, str):
drug = unidecode(drug).strip().replace(' ', '%20')

# Search ZINC for compound name
# Search in PubChem for compound name
for option in options:
try:
path = '{}/{}/{}/{}/{}/{}'.format(
PUBCHEM_START, query_type, drug, PUBCHEM_MID, option, PUBCHEM_END
)
smiles = (
urllib_request.urlopen(path).read().decode('UTF-8').replace('\n', '')
)
smiles = urllib_request.urlopen(path).read().decode('UTF-8').split()[0]
if not kekulize:
smiles = Chem.MolToSmiles(Chem.MolFromSmiles(smiles, sanitize=sanitize))
return smiles
Expand Down
37 changes: 11 additions & 26 deletions pytoda/preprocessing/tests/test_crawlers.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
"""Testing Crawlers."""

import unittest

from pytoda.preprocessing.crawlers import ( # query_pubchem,; remove_pubchem_smiles,
from pytoda.preprocessing.crawlers import (
get_smiles_from_pubchem,
get_smiles_from_zinc,
query_pubchem,
remove_pubchem_smiles,
)
Expand All @@ -12,26 +12,8 @@
class TestCrawlers(unittest.TestCase):
"""Testing Crawlers."""

@unittest.skip('ZINC is down since quite some time, hence we skip these tests')
def test_get_smiles_from_zinc(self) -> None:
    """Test get_smiles_from_zinc in name mode and in ZINC ID mode.

    The test is marked as skipped instead of returning early, so the runner
    reports it as skipped rather than passed and the body is not dead code.
    """
    ground_truth = 'CC(=O)Oc1ccccc1C(=O)O'

    # Test text mode
    drug = 'Aspirin'
    smiles = get_smiles_from_zinc(drug)
    self.assertEqual(smiles, ground_truth)

    # Test ZINC ID mode
    zinc_id = 53
    smiles = get_smiles_from_zinc(zinc_id)
    self.assertEqual(smiles, ground_truth)

def test_get_smiles_from_pubchem(self) -> None:
"""Test get_smiles_from_zinc"""
"""Test get_smiles_from_pubchem"""

for sanitize in [True, False]:

Expand Down Expand Up @@ -83,10 +65,16 @@ def test_get_smiles_from_pubchem(self) -> None:
)
self.assertEqual(smiles, ground_truth)

# Test molecule where landing page has several entries
gt_smiles = (
'CC12C(C(CC(O1)N3C4=CC=CC=C4C5=C6C(=C7C8=CC=CC=C8N2C7=C53)CNC6=O)NC)OC'
)
drug = 'Staurosporine'
smiles = get_smiles_from_pubchem(drug, use_isomeric=False, kekulize=True)
self.assertEqual(smiles, gt_smiles)

def test_query_pubchem(self) -> None:
"""Test query_pubchem"""
# pass
# Disabled due to bug in pubchem api
smiles_list = [
'O1C=CC=NC(=O)C1=O',
'CC(N)S(O)(=O)C(C)CC(C(C)C)c1cc(F)cc(F)c1',
Expand All @@ -98,9 +86,6 @@ def test_query_pubchem(self) -> None:

def test_remove_pubchem_smiles(self) -> None:
"""Test remove_pubchem_smiles"""
# pass

# Disabled due to bug in pubchem api
smiles_list = [
'O1C=CC=NC(=O)C1=O',
'CC(N)S(O)(=O)C(C)CC(C(C)C)c1cc(F)cc(F)c1',
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
numpy>=1.19.0
scikit-learn>=0.23.0
pandas>=1.0.0
torch>=1.4.0,<1.9
torch>=1.9
diskcache>=5.0.3
dill>=0.3.3
selfies>=2.1.1
Expand Down
6 changes: 6 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Package installer."""

import codecs
import os

Expand Down Expand Up @@ -59,6 +60,11 @@ def get_version(rel_path):
'License :: OSI Approved :: MIT License',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: 3.10',
'Programming Language :: Python :: 3.11',
'Programming Language :: Python :: 3.12',
'Topic :: Software Development :: Libraries :: Python Modules',
],
packages=find_packages(),
Expand Down

0 comments on commit 2516af1

Please sign in to comment.