diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index fe0d34276..f6041fafc 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -73,6 +73,7 @@ jobs: - name: "Setup Micromamba" uses: mamba-org/setup-micromamba@v1 with: + micromamba-binary-path: ~/.local/bin/micromamba environment-file: environment.yml environment-name: openfe_env cache-environment: true diff --git a/.github/workflows/conda_cron.yaml b/.github/workflows/conda_cron.yaml index 2014340ff..95b3b39dc 100644 --- a/.github/workflows/conda_cron.yaml +++ b/.github/workflows/conda_cron.yaml @@ -34,6 +34,7 @@ jobs: - name: Setup Micromamba uses: mamba-org/setup-micromamba@v1 with: + micromamba-binary-path: ~/.local/bin/micromamba environment-name: openfe create-args: >- python=${{ matrix.python-version }} diff --git a/.github/workflows/installer.yaml b/.github/workflows/installer.yaml index 2bf641852..ace15afdb 100644 --- a/.github/workflows/installer.yaml +++ b/.github/workflows/installer.yaml @@ -37,6 +37,7 @@ jobs: - name: Install constructor environment with Micromamba uses: mamba-org/setup-micromamba@v1 with: + micromamba-binary-path: ~/.local/bin/micromamba environment-name: installer create-args: >- python=3.10 diff --git a/news/pr_937.rst b/news/pr_937.rst new file mode 100644 index 000000000..3b4c61b79 --- /dev/null +++ b/news/pr_937.rst @@ -0,0 +1,32 @@ +**Added:** + +* + +**Changed:** + +* Structural analysis data is no longer written to `structural_analysis.json` + but rather a 32bit numpy compressed file named `structural_analysis.npz` + (PR #937). +* Structural analysis array data is no longer directly returned in the + RelativeHybridTopologyProtocol result dictionary. Instead it should + be accessed from the serialized NPZ file `structural_analysis.npz`. + The `structural_analysis` key now contains a path to the NPZ file, + if the structural analysis did not fail (the `structural_analysis_error` + key will instead be present on failure) (PR #937). + +**Deprecated:** + +* + +**Removed:** + +* + +**Fixed:** + +* The RelativeHybridTopologyProtocol result unit is now much smaller, + due to the removal of structural analysis data (PR #937). + +**Security:** + +* diff --git a/openfe/protocols/openmm_rfe/equil_rfe_methods.py b/openfe/protocols/openmm_rfe/equil_rfe_methods.py index 5d9faebf3..ae3f19619 100644 --- a/openfe/protocols/openmm_rfe/equil_rfe_methods.py +++ b/openfe/protocols/openmm_rfe/equil_rfe_methods.py @@ -36,8 +36,6 @@ from openff.toolkit.topology import Molecule as OFFMolecule from openmmtools import multistate from typing import Optional -from openmm import unit as omm_unit -from openmm.app import PDBFile import pathlib from typing import Any, Iterable, Union import openmmtools @@ -48,7 +46,7 @@ import gufe from gufe import ( settings, ChemicalSystem, LigandAtomMapping, Component, ComponentMapping, - SmallMoleculeComponent, ProteinComponent, SolventComponent, + SmallMoleculeComponent, SolventComponent, ) from .equil_rfe_settings import ( @@ -1096,22 +1094,29 @@ def run(self, *, dry=False, verbose=True, return {'debug': {'sampler': sampler}} @staticmethod - def analyse(where) -> dict: + def structural_analysis(scratch, shared) -> dict: # don't put energy analysis in here, it uses the open file reporter # whereas structural stuff requires that the file handle is closed - analysis_out = where / 'structural_analysis.json' - - ret = subprocess.run(['openfe_analysis', 'RFE_analysis', - str(where), str(analysis_out)], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + # TODO: we should just make openfe_analysis write an npz instead! + analysis_out = scratch / 'structural_analysis.json' + + ret = subprocess.run( + [ + 'openfe_analysis', # CLI entry point + 'RFE_analysis', # CLI option + str(shared), # Where the simulation.nc fille + str(analysis_out) # Where the analysis json file is written + ], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE + ) if ret.returncode: return {'structural_analysis_error': ret.stderr} with open(analysis_out, 'rb') as f: data = json.load(f) - savedir = pathlib.Path(where) + savedir = pathlib.Path(shared) if d := data['protein_2D_RMSD']: fig = plotting.plot_2D_rmsd(d) fig.savefig(savedir / "protein_2D_RMSD.png") @@ -1124,21 +1129,43 @@ def analyse(where) -> dict: f3.savefig(savedir / "ligand_RMSD.png") plt.close(f3) - return {'structural_analysis': data} + # Save to numpy compressed format (~ 6x more space efficient than JSON) + np.savez_compressed( + shared / "structural_analysis.npz", + protein_RMSD=np.asarray( + data["protein_RMSD"], dtype=np.float32 + ), + ligand_RMSD=np.asarray( + data["ligand_RMSD"], dtype=np.float32 + ), + ligand_COM_drift=np.asarray( + data["ligand_wander"], dtype=np.float32 + ), + protein_2D_RMSD=np.asarray( + data["protein_2D_RMSD"], dtype=np.float32 + ), + time_ps=np.asarray( + data["time(ps)"], dtype=np.int32 + ), + ) + + return {'structural_analysis': shared / "structural_analysis.npz"} def _execute( self, ctx: gufe.Context, **kwargs, ) -> dict[str, Any]: log_system_probe(logging.INFO, paths=[ctx.scratch]) - + outputs = self.run(scratch_basepath=ctx.scratch, shared_basepath=ctx.shared) - analysis_outputs = self.analyse(ctx.shared) + structural_analysis_outputs = self.structural_analysis( + ctx.scratch, ctx.shared + ) return { 'repeat_id': self._inputs['repeat_id'], 'generation': self._inputs['generation'], **outputs, - **analysis_outputs, + **structural_analysis_outputs, } diff --git a/openfe/tests/protocols/test_openmm_equil_rfe_protocols.py b/openfe/tests/protocols/test_openmm_equil_rfe_protocols.py index e420d7ccf..a0dcb3b66 100644 --- a/openfe/tests/protocols/test_openmm_equil_rfe_protocols.py +++ b/openfe/tests/protocols/test_openmm_equil_rfe_protocols.py @@ -5,16 +5,14 @@ import numpy as np from numpy.testing import assert_allclose import gufe -from gufe.tests.test_tokenization import GufeTokenizableTestsMixin import json import xml.etree.ElementTree as ET from importlib import resources from unittest import mock import sys +from pathlib import Path -import gufe import mdtraj as mdt -import numpy as np import pytest from openff.toolkit import Molecule from openff.units import unit @@ -249,7 +247,8 @@ def test_dry_run_gaff_vacuum(benzene_vacuum_system, toluene_vacuum_system, mapping=benzene_to_toluene_mapping, ) unit = list(dag.protocol_units)[0] - sampler = unit.run(dry=True)["debug"]["sampler"] + with tmpdir.as_cwd(): + _ = unit.run(dry=True)["debug"]["sampler"] @pytest.mark.slow @@ -441,7 +440,7 @@ def test_confgen_mocked_fail(benzene_system, toluene_system, @pytest.fixture(scope='session') def tip4p_hybrid_factory( benzene_system, toluene_system, - benzene_to_toluene_mapping, tmp_path_factory + benzene_to_toluene_mapping, tmp_path_factory, ): """ Hybrid system with virtual sites in the environment (waters) @@ -471,9 +470,9 @@ def tip4p_hybrid_factory( scratch_temp = tmp_path_factory.mktemp("tip4p_scratch") dag_unit_result = dag_unit.run( - dry=True, - scratch_basepath=scratch_temp, - shared_basepath=shared_temp, + dry=True, + scratch_basepath=scratch_temp, + shared_basepath=shared_temp, ) return dag_unit_result['debug']['sampler']._factory @@ -1545,7 +1544,9 @@ def test_reload_protocol_result(self, rfe_transformation_json): d = json.loads(rfe_transformation_json, cls=gufe.tokenization.JSON_HANDLER.decoder) - pr = openmm_rfe.RelativeHybridTopologyProtocolResult.from_dict(d['protocol_result']) + pr = openmm_rfe.RelativeHybridTopologyProtocolResult.from_dict( + d['protocol_result'] + ) assert pr @@ -1642,6 +1643,7 @@ def test_filenotfound_replica_states(self, protocolresult): with pytest.raises(ValueError, match=errmsg): protocolresult.get_replica_states() + @pytest.mark.parametrize('mapping_name,result', [ ["benzene_to_toluene_mapping", 0], ["benzene_to_benzoic_mapping", 1], @@ -1659,12 +1661,12 @@ def test_get_charge_difference(mapping_name, result, request): val = _get_alchemical_charge_difference( mapping, 'pme', True, openfe.SolventComponent() ) - assert result == pytest.approx(result) + assert result == pytest.approx(val) else: val = _get_alchemical_charge_difference( mapping, 'pme', True, openfe.SolventComponent() ) - assert result == pytest.approx(result) + assert result == pytest.approx(val) def test_get_charge_difference_no_pme(benzene_to_benzoic_mapping): @@ -2102,3 +2104,14 @@ def test_dry_run_complex_alchemwater_totcharge( assert len(htf._atom_classes['core_atoms']) == core_atoms assert len(htf._atom_classes['unique_new_atoms']) == new_uniq assert len(htf._atom_classes['unique_old_atoms']) == old_uniq + + +def test_structural_analysis_error(tmpdir): + + with tmpdir.as_cwd(): + ret = openmm_rfe.RelativeHybridTopologyProtocolUnit.structural_analysis( + Path('.'), Path('.') + ) + + assert 'structural_analysis_error' in ret + assert 'structural_analysis' not in ret diff --git a/openfe/tests/protocols/test_openmm_rfe_slow.py b/openfe/tests/protocols/test_openmm_rfe_slow.py index c1f4f6a5f..c191b554e 100644 --- a/openfe/tests/protocols/test_openmm_rfe_slow.py +++ b/openfe/tests/protocols/test_openmm_rfe_slow.py @@ -1,6 +1,7 @@ # This code is part of OpenFE and is licensed under the MIT license. # For details, see https://github.com/OpenFreeEnergy/openfe - +import numpy as np +from numpy.testing import assert_equal from gufe.protocols import execute_DAG import pytest from openff.units import unit @@ -42,8 +43,8 @@ def test_openmm_run_engine(benzene_vacuum_system, platform, if platform not in available_platforms: pytest.skip(f"OpenMM Platform: {platform} not available") # this test actually runs MD - # if this passes, you're 99% likely to have a good time - # these settings are a small self to self sim, that has enough eq that it doesn't occasionally crash + # these settings are a small self to self sim, that has enough eq that + # it doesn't occasionally crash s = openfe.protocols.openmm_rfe.RelativeHybridTopologyProtocol.default_settings() s.simulation_settings.equilibration_length = 0.1 * unit.picosecond s.simulation_settings.production_length = 0.1 * unit.picosecond @@ -64,10 +65,16 @@ def test_openmm_run_engine(benzene_vacuum_system, platform, 'ligand': b_alt }) - m = openfe.LigandAtomMapping(componentA=b, componentB=b_alt, - componentA_to_componentB={i: i for i in range(12)}) - dag = p.create(stateA=benzene_vacuum_system, stateB=benzene_vacuum_alt_system, - mapping=[m]) + m = openfe.LigandAtomMapping( + componentA=b, + componentB=b_alt, + componentA_to_componentB={i: i for i in range(12)} + ) + dag = p.create( + stateA=benzene_vacuum_system, + stateB=benzene_vacuum_alt_system, + mapping=[m] + ) cwd = pathlib.Path(str(tmpdir)) r = execute_DAG(dag, shared_basedir=cwd, scratch_basedir=cwd, @@ -78,13 +85,40 @@ def test_openmm_run_engine(benzene_vacuum_system, platform, unit_shared = tmpdir / f"shared_{pur.source_key}_attempt_0" assert unit_shared.exists() assert pathlib.Path(unit_shared).is_dir() + + # Check the checkpoint file exists checkpoint = pur.outputs['last_checkpoint'] assert checkpoint == "checkpoint.chk" assert (unit_shared / checkpoint).exists() + + # Check the nc simulation file exists + # TODO: assert the number of frames nc = pur.outputs['nc'] assert nc == unit_shared / "simulation.nc" assert nc.exists() - assert (unit_shared / "structural_analysis.json").exists() + + # Check structural analysis contents + structural_analysis_file = unit_shared / "structural_analysis.npz" + assert (structural_analysis_file).exists() + assert pur.outputs['structural_analysis'] == structural_analysis_file + + structural_data = np.load(pur.outputs['structural_analysis']) + structural_keys = [ + 'protein_RMSD', 'ligand_RMSD', 'ligand_COM_drift', + 'protein_2D_RMSD', 'time_ps' + ] + for key in structural_keys: + assert key in structural_data.keys() + + # 6 frames being written to file + # Note: the values of this next test are wrong, but in an expected way + # See: https://github.com/OpenFreeEnergy/openfe_analysis/issues/33 + assert_equal(structural_data['time_ps'], [0, 50, 100, 150, 200, 250]) + assert structural_data['ligand_RMSD'].shape == (11, 6) + assert structural_data['ligand_COM_drift'].shape == (11, 6) + # No protein so should be empty + assert structural_data['protein_RMSD'].size == 0 + assert structural_data['protein_2D_RMSD'].size == 0 # Test results methods that need files present results = p.gather([r])