diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index e873878..dd59194 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -7,7 +7,7 @@ on: - 'gh_pages' jobs: conda-tests: - name: Test with conda (${{ matrix.os }}) + name: Test with conda (${{ matrix.os }}, Python ${{ matrix.python-version }}) runs-on: ${{ matrix.os }} continue-on-error: ${{ matrix.experimental }} strategy: @@ -17,9 +17,7 @@ jobs: - os: ubuntu-latest pip_cache_path: ~/.cache/pip experimental: false - - os: windows-latest - pip_cache_path: ~\AppData\Local\pip\Cache - experimental: true + python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"] defaults: run: shell: bash -l {0} # For conda @@ -35,19 +33,20 @@ jobs: uses: actions/cache@v2 with: path: ~/conda_pkgs_dir # from: conda-incubator/setup-miniconda@v2 - key: ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-${{ + key: ${{ runner.os }}-conda-${{ matrix.python-version }}-${{ env.CACHE_NUMBER }}-${{ hashFiles('conda.yml') }} - name: Cache pip uses: actions/cache@v2 with: path: ${{ matrix.pip_cache_path }} - key: ${{ runner.os }}-pip--${{ env.CACHE_NUMBER }}-${{ + key: ${{ runner.os }}-pip-${{ matrix.python-version }}-${{ env.CACHE_NUMBER }}-${{ hashFiles('requirements.txt') }} - name: Conda environment setup uses: conda-incubator/setup-miniconda@v2 with: + python-version: ${{ matrix.python-version }} activate-environment: pytoda environment-file: conda.yml auto-activate-base: false @@ -72,7 +71,7 @@ jobs: if: always() with: status: ${{ job.status }} - text: "CI Build ${{ matrix.os }}" + text: "CI Build ${{ matrix.os }} Python ${{ matrix.python-version }}" author_name: ${{ github.actor }} env: SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} diff --git a/README.md b/README.md index ddfa4b9..f0c7894 100644 --- a/README.md +++ b/README.md @@ -57,41 +57,14 @@ For some examples on how to use `pytoda` see [here](./examples) If you use `pytoda` in your projects, please cite the following: ```bib 
-@article{born2021datadriven, - author = { - Born, Jannis and Manica, Matteo and Cadow, Joris and Markert, Greta and - Mill,Nil Adell and Filipavicius, Modestas and Janakarajan, Nikita and - Cardinale, Antonio and Laino, Teodoro and - {Rodr{\'{i}}guez Mart{\'{i}}nez}, Mar{\'{i}}a - }, - doi = {10.1088/2632-2153/abe808}, - issn = {2632-2153}, - journal = {Machine Learning: Science and Technology}, - number = {2}, - pages = {025024}, - title = {{ - Data-driven molecular design for discovery and synthesis of novel ligands: - a case study on SARS-CoV-2 - }}, - url = {https://iopscience.iop.org/article/10.1088/2632-2153/abe808}, - volume = {2}, - year = {2021} -} -@article{born2021paccmannrl, - title = { - PaccMann$^{RL}$: De novo generation of hit-like anticancer molecules from - transcriptomic data via reinforcement learning - }, - journal = {iScience}, - volume = {24}, - number = {4}, - year = {2021}, - issn = {2589-0042}, - doi = {https://doi.org/10.1016/j.isci.2021.102269}, - url = {https://www.cell.com/iscience/fulltext/S2589-0042(21)00237-6}, - author = { - Jannis Born and Matteo Manica and Ali Oskooei and Joris Cadow and Greta Markert - and Mar{\'\i}a Rodr{\'\i}guez Mart{\'\i}nez} - } +@article{born2021data, + title={Data-driven molecular design for discovery and synthesis of novel ligands: a case study on SARS-CoV-2}, + author={Born, Jannis and Manica, Matteo and Cadow, Joris and Markert, Greta and Mill, Nil Adell and Filipavicius, Modestas and Janakarajan, Nikita and Cardinale, Antonio and Laino, Teodoro and {Rodr{\'{i}}guez Mart{\'{i}}nez}, Mar{\'{i}}a}, + journal={Machine Learning: Science and Technology}, + volume={2}, + number={2}, + pages={025024}, + year={2021}, + publisher={IOP Publishing} } ``` diff --git a/conda.yml b/conda.yml index 687572a..0b8e304 100644 --- a/conda.yml +++ b/conda.yml @@ -1,7 +1,7 @@ name: pytoda dependencies: - - python>=3.8 - - pip>=19.1,<20.3 + - python>=3.7,<3.13 + - pip - pip: - - -r file:requirements.txt + - -r requirements.txt \ No newline at 
end of file diff --git a/pytoda/__init__.py b/pytoda/__init__.py index 5c1aff0..91f492b 100644 --- a/pytoda/__init__.py +++ b/pytoda/__init__.py @@ -1,2 +1,2 @@ name = 'pytoda' -__version__ = '1.1.5' +__version__ = '1.1.6' diff --git a/pytoda/preprocessing/crawlers.py b/pytoda/preprocessing/crawlers.py index 9a5e3ad..701bb01 100644 --- a/pytoda/preprocessing/crawlers.py +++ b/pytoda/preprocessing/crawlers.py @@ -1,5 +1,4 @@ import logging -import urllib import urllib.request as urllib_request from itertools import filterfalse from typing import Iterable, List, Tuple, Union @@ -13,66 +12,11 @@ logger = logging.getLogger(__name__) -ZINC_DRUG_SEARCH_ROOT = 'http://zinc.docking.org/substances/search/?q=' -ZINC_ID_SEARCH_ROOT = 'http://zinc.docking.org/substances/' - PUBCHEM_START = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound' PUBCHEM_MID = 'property' PUBCHEM_END = 'TXT' -def get_smiles_from_zinc(drug: Union[str, int]) -> str: - """ - Uses the ZINC databases to retrieve the SMILES of a ZINC ID (int) or a drug - name (str). - - Args: - drug (Union[str, int]): a string with a drug name or an int of a ZINC - ID. - Returns: - smiles (str): The SMILES string of the drug name or ZINC ID. - """ - - if type(drug) != str and type(drug) != int: - raise TypeError( - f'Please insert drug of type {{str, int}}, given was {type(drug)}' - f'({drug}).' 
- ) - - if type(drug) == str: - - # Parse name, then retrieve ZINC ID from it - stripped_drug = unidecode(drug).strip().replace(' ', '%20') - zinc_ids = [] - try: - drug_url = urllib_request.pathname2url(stripped_drug) - path = '{}{}'.format(ZINC_DRUG_SEARCH_ROOT, drug_url) - response = urllib.request.urlopen(path) - - for line in response: - line = line.decode(encoding='UTF-8').strip() - if 'href="/substances/ZINC' in line: - zinc_ids.append(line.split('/')[-2]) - zinc_id = zinc_ids[0] - - except HTTPError: - logger.warning(f'Did not find any result for drug: {drug}') - return '' - - elif type(drug) == int: - zinc_id = str(drug) - - zinc_id_url = ZINC_ID_SEARCH_ROOT + zinc_id - id_response = urllib_request.urlopen(zinc_id_url) - - for id_line in id_response: - id_line = id_line.decode(encoding='UTF-8').strip() - if 'id="substance-smiles-field" readonly value=' in id_line: - smiles = id_line.split('"')[-2] - - return smiles - - def get_smiles_from_pubchem( drug: Union[str, int], query_type: str = 'name', @@ -122,15 +66,13 @@ def get_smiles_from_pubchem( if isinstance(drug, str): drug = unidecode(drug).strip().replace(' ', '%20') - # Search ZINC for compound name + # Search in PubChem for compound name for option in options: try: path = '{}/{}/{}/{}/{}/{}'.format( PUBCHEM_START, query_type, drug, PUBCHEM_MID, option, PUBCHEM_END ) - smiles = ( - urllib_request.urlopen(path).read().decode('UTF-8').replace('\n', '') - ) + smiles = urllib_request.urlopen(path).read().decode('UTF-8').split()[0] if not kekulize: smiles = Chem.MolToSmiles(Chem.MolFromSmiles(smiles, sanitize=sanitize)) return smiles diff --git a/pytoda/preprocessing/tests/test_crawlers.py b/pytoda/preprocessing/tests/test_crawlers.py index 0f69286..f38e151 100644 --- a/pytoda/preprocessing/tests/test_crawlers.py +++ b/pytoda/preprocessing/tests/test_crawlers.py @@ -1,9 +1,9 @@ """Testing Crawlers.""" + import unittest -from pytoda.preprocessing.crawlers import ( # query_pubchem,; remove_pubchem_smiles, 
+from pytoda.preprocessing.crawlers import ( get_smiles_from_pubchem, - get_smiles_from_zinc, query_pubchem, remove_pubchem_smiles, ) @@ -12,26 +12,8 @@ class TestCrawlers(unittest.TestCase): - """Testing Crawlsers.""" + """Testing Crawlers.""" - def test_get_smiles_from_zinc(self) -> None: - """Test get_smiles_from_zinc""" - - # # ZINC is down since quite some time, hence we skip these tests - return True - - # Test text mode - drug = 'Aspirin' - ground_truth = 'CC(=O)Oc1ccccc1C(=O)O' - smiles = get_smiles_from_zinc(drug) - self.assertEqual(smiles, ground_truth) - - # Test ZINC ID mode - zinc_id = 53 - ground_truth = 'CC(=O)Oc1ccccc1C(=O)O' - smiles = get_smiles_from_zinc(zinc_id) - self.assertEqual(smiles, ground_truth) - def test_get_smiles_from_pubchem(self) -> None: - """Test get_smiles_from_zinc""" + """Test get_smiles_from_pubchem""" for sanitize in [True, False]: @@ -83,10 +65,16 @@ def test_get_smiles_from_pubchem(self) -> None: ) self.assertEqual(smiles, ground_truth) + # Test molecule where landing page has several entries + gt_smiles = ( + 'CC12C(C(CC(O1)N3C4=CC=CC=C4C5=C6C(=C7C8=CC=CC=C8N2C7=C53)CNC6=O)NC)OC' + ) + drug = 'Staurosporine' + smiles = get_smiles_from_pubchem(drug, use_isomeric=False, kekulize=True) + self.assertEqual(smiles, gt_smiles) + def test_query_pubchem(self) -> None: """Test query_pubchem""" - # pass - # Disabled due to bug in pubchem api smiles_list = [ 'O1C=CC=NC(=O)C1=O', 'CC(N)S(O)(=O)C(C)CC(C(C)C)c1cc(F)cc(F)c1', @@ -98,9 +86,6 @@ def test_query_pubchem(self) -> None: def test_remove_pubchem_smiles(self) -> None: """Test remove_pubchem_smiles""" - # pass - - # Disabled due to bug in pubchem api smiles_list = [ 'O1C=CC=NC(=O)C1=O', 'CC(N)S(O)(=O)C(C)CC(C(C)C)c1cc(F)cc(F)c1', diff --git a/requirements.txt b/requirements.txt index 5b8039c..90c458b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ numpy>=1.19.0 scikit-learn>=0.23.0 pandas>=1.0.0 -torch>=1.4.0,<1.9 +torch>=1.9 diskcache>=5.0.3 dill>=0.3.3 selfies>=2.1.1 diff --git 
a/setup.py b/setup.py index 0d21f82..be89aea 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,5 @@ """Package installer.""" + import codecs import os @@ -59,6 +60,11 @@ def get_version(rel_path): 'License :: OSI Approved :: MIT License', 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3.7', + 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', + 'Programming Language :: Python :: 3.10', + 'Programming Language :: Python :: 3.11', + 'Programming Language :: Python :: 3.12', 'Topic :: Software Development :: Libraries :: Python Modules', ], packages=find_packages(),