Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Split gather #502

Merged
merged 13 commits into from
Aug 2, 2023
1 change: 1 addition & 0 deletions openfecli/clicktypes/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .hyphenchoice import HyphenAwareChoice
13 changes: 13 additions & 0 deletions openfecli/clicktypes/hyphenchoice.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import click

def _normalize_to_hyphen(string):
return string.replace("_", "-")

class HyphenAwareChoice(click.Choice):
def __init__(self, choices, case_sensitive=True):
choices = [_normalize_to_hyphen(choice) for choice in choices]
super().__init__(choices, case_sensitive)

def convert(self, value, param, ctx):
value = _normalize_to_hyphen(value)
return super().convert(value, param, ctx)
275 changes: 171 additions & 104 deletions openfecli/commands/gather.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,43 @@

import click
from openfecli import OFECommandPlugin
from openfecli.clicktypes import HyphenAwareChoice
import pathlib


def _get_column(val):
import numpy as np
if val == 0:
return 0

log10 = np.log10(val)

if log10 >= 0.0:
col = np.floor(log10 + 1)
else:
col = np.floor(log10)
return int(col)

dwhswenson marked this conversation as resolved.
Show resolved Hide resolved

def format_estimate_uncertainty(
est: float,
unc: float,
unc_prec: int = 1,
) -> tuple[str, str]:
import numpy as np
# get the last column needed for uncertainty
unc_col = _get_column(unc) - (unc_prec - 1)

if unc_col < 0:
est_str = f"{est:.{-unc_col}f}"
unc_str = f"{unc:.{-unc_col}f}"
else:
est_str = f"{np.round(est, -unc_col + 1)}"
unc_str = f"{np.round(unc, -unc_col + 1)}"

return est_str, unc_str


def is_results_json(f):
# sanity check on files before we try and deserialize
return 'estimate' in open(f, 'r').read(20)
Expand Down Expand Up @@ -51,85 +85,8 @@ def legacy_get_type(res_fn):
return 'complex'


@click.command(
'gather',
short_help="Gather result jsons for network of RFE results into a TSV file"
)
@click.argument('rootdir',
type=click.Path(dir_okay=True, file_okay=False,
path_type=pathlib.Path),
required=True)
@click.option('output', '-o',
type=click.File(mode='w'),
default='-')
def gather(rootdir, output):
"""Gather simulation result jsons of relative calculations to a tsv file

Will walk ROOTDIR recursively and find all results files ending in .json
(i.e those produced by the quickrun command). Each of these contains the
results of a separate leg from a relative free energy thermodynamic cycle.

Paired legs of simulations will be combined to give the DDG values between
two ligands in the corresponding phase, producing either binding ('DDGbind')
or hydration ('DDGhyd') relative free energies. These will be reported as
'DDGbind(B,A)' meaning DGbind(B) - DGbind(A), the difference in free energy
of binding for ligand B relative to ligand A.

Individual leg results will be also be written. These are reported as
either DGvacuum(A,B) DGsolvent(A,B) or DGcomplex(A,B) for the vacuum,
solvent or complex free energy of transmuting ligand A to ligand B.

\b
Will produce a **tab** separated file with 6 columns:
1) a description of the measurement, for example DDGhyd(A, B)
2) the type of this measurement, either RBFE or RHFE
3) the identifier of the first ligand
4) the identifier of the second ligand
5) the estimated value (in kcal/mol)
6) the uncertainty on the value (also kcal/mol)

By default, outputs to stdout, use -o option to choose file.
"""
from collections import defaultdict
import glob
import networkx as nx
def _get_ddgs(legs):
import numpy as np
from cinnabar.stats import mle

def dp2(v: float) -> str:
# turns 0.0012345 -> '0.0012', round() would get this wrong
return np.format_float_positional(v, precision=2, trim='0',
fractional=False)

# 1) find all possible jsons
json_fns = glob.glob(str(rootdir) + '/**/*json', recursive=True)

# 2) filter only result jsons
result_fns = filter(is_results_json, json_fns)

# 3) pair legs of simulations together into dict of dicts
legs = defaultdict(dict)

for result_fn in result_fns:
result = load_results(result_fn)
if result is None:
continue
elif result['estimate'] is None or result['uncertainty'] is None:
click.echo(f"WARNING: Calculations for {result_fn} did not finish succesfully!",
err=True)

try:
names = get_names(result)
except KeyError:
raise ValueError("Failed to guess names")
try:
simtype = get_type(result)
except KeyError:
simtype = legacy_get_type(result_fn)

legs[names][simtype] = result['estimate'], result['uncertainty']

# 4a for each ligand pair, resolve legs
DDGs = []
for ligpair, vals in sorted(legs.items()):
DDGbind = None
Expand All @@ -144,15 +101,54 @@ def dp2(v: float) -> str:
# DDG(2,1)bind = DG(1->2)complex - DG(1->2)solvent
DDGbind = (DG1_mag - DG2_mag).m
bind_unc = np.sqrt(np.sum(np.square([DG1_unc.m, DG2_unc.m])))
if 'solvent' in vals and 'vacuum' in vals:
elif 'solvent' in vals and 'vacuum' in vals:
DG1_mag, DG1_unc = vals['solvent']
DG2_mag, DG2_unc = vals['vacuum']
if not ((DG1_mag is None) or (DG2_mag is None)):
DDGhyd = (DG1_mag - DG2_mag).m
hyd_unc = np.sqrt(np.sum(np.square([DG1_unc.m, DG2_unc.m])))
else: # -no-cov-
raise RuntimeError(f"Unknown DDG type for {vals}")

DDGs.append((*ligpair, DDGbind, bind_unc, DDGhyd, hyd_unc))

return DDGs

dwhswenson marked this conversation as resolved.
Show resolved Hide resolved

def _write_ddg(legs, writer):
DDGs = _get_ddgs(legs)
writer.writerow(["ligand_i", "ligand_j", "DDG(i->j) (kcal/mol)",
"uncertainty (kcal/mol)"])
for ligA, ligB, DDGbind, bind_unc, DDGhyd, hyd_unc in DDGs:
name = f"{ligB}, {ligA}"
if DDGbind is not None:
DDGbind, bind_unc = format_estimate_uncertainty(DDGbind,
bind_unc)
writer.writerow([ligA, ligB, DDGbind, bind_unc])
if DDGhyd is not None:
DDGhyd, hyd_unc = format_estimate_uncertainty(DDGbind,
bind_unc)
writer.writerow([ligA, ligB, DDGhyd, hyd_unc])

dwhswenson marked this conversation as resolved.
Show resolved Hide resolved

def _write_dg_raw(legs, writer):
writer.writerow(["leg", "ligand_i", "ligand_j", "DG(i->j) (kcal/mol)",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think in the optimal case I would like this to write out all the individual replicas. Any thoughts on the necessary changes to results to ensure this? (I suspect it's a method we'd have to add at the gufe level)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This may be protocol-specific. Our default protocol has, in its outputs dict, outputs["unit_estimate"], which we could extract. But we make no promise that every unit will have this. Indeed, it doesn't make sense to do so: if you used a separate parametrization unit, there would be no meaning to that unit having a "unit_estimate" result.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This might be something we need to add to the abstract class - a method for getting a breakdown by repeat.

"uncertainty (kcal/mol)"])
for ligpair, vals in sorted(legs.items()):
name = ', '.join(ligpair)
for simtype, (m, u) in sorted(vals.items()):
if m is None:
m, u = 'NaN', 'NaN'
else:
m, u = format_estimate_uncertainty(m.m, u.m)
writer.writerow([simtype, *ligpair, m, u])

dwhswenson marked this conversation as resolved.
Show resolved Hide resolved

def _write_dg_mle(legs, writer):
import networkx as nx
import numpy as np
from cinnabar.stats import mle
DDGs = _get_ddgs(legs)
MLEs = []
# 4b) perform MLE
g = nx.DiGraph()
Expand Down Expand Up @@ -193,39 +189,110 @@ def dp2(v: float) -> str:
ligname = idx_to_nm[node]
MLEs.append((ligname, f, df))

output.write('measurement\ttype\tligand_i\tligand_j\testimate (kcal/mol)'
'\tuncertainty (kcal/mol)\n')
# 5a) write out MLE values
writer.writerow(["ligand", "DG(MLE) (kcal/mol)",
"uncertainty (kcal/mol)"])
for ligA, DG, unc_DG in MLEs:
DG, unc_DG = dp2(DG), dp2(unc_DG)
output.write(f'DGbind({ligA})\tDG(MLE)\tZero\t{ligA}\t{DG}\t{unc_DG}\n')
DG, unc_DG = format_estimate_uncertainty(DG, unc_DG)
writer.writerow([ligA, DG, unc_DG])

# 5b) write out DDG values
for ligA, ligB, DDGbind, bind_unc, DDGhyd, hyd_unc in DDGs:
name = f"{ligB}, {ligA}"
if DDGbind is not None:
DDGbind, bind_unc = dp2(DDGbind), dp2(bind_unc)
output.write(f'DDGbind({name})\tRBFE\t{ligA}\t{ligB}'
f'\t{DDGbind}\t{bind_unc}\n')
if DDGhyd is not None:
DDGhyd, hyd_unc = dp2(DDGhyd), dp2(hyd_unc)
output.write(f'DDGhyd({name})\tRHFE\t{ligA}\t{ligB}\t'
f'{DDGhyd}\t{hyd_unc}\n')

@click.command(
'gather',
short_help="Gather result jsons for network of RFE results into a TSV file"
)
@click.argument('rootdir',
type=click.Path(dir_okay=True, file_okay=False,
path_type=pathlib.Path),
required=True)
@click.option(
'--report',
type=HyphenAwareChoice(['dg', 'ddg', 'dg-raw'], case_sensitive=False),
default="dg", show_default=True,
help=(
"What data to report. 'dg' gives maximum-likelihood estimate of "
"absolute deltaG, 'ddg' gives delta-delta-G, and 'dg-raw' gives "
"the raw result of the deltaG for a leg."
)
)
@click.option('output', '-o',
type=click.File(mode='w'),
default='-')
def gather(rootdir, output, report):
"""Gather simulation result jsons of relative calculations to a tsv file

This walks ROOTDIR recursively and finds all result JSON files from the
quickrun command (these files must end in .json). Each of these contains
the results of a separate leg from a relative free energy thermodynamic
cycle.

The results reported depend on ``--report`` flag:

\b
* 'dg' (default) reports the ligand and the results are the maximum
likelihood estimate of its absolute free, and the uncertainty in
that.
* 'ddg' reports pairs of ligand_i and ligand_j, the calculated
relative free energy DDG(i->j) = DG(j) - DG(i) and its uncertainty
* 'dg-raw' reports the raw results, giving the leg (vacuum, solvent, or
complex), ligand_i, ligand_j, the raw DG(i->j) associated with it.

The output is a table of **tab** separated values. By default, this
outputs to stdout, use the -o option to choose an output file.
"""
from collections import defaultdict
import glob
import csv

# 1) find all possible jsons
json_fns = glob.glob(str(rootdir) + '/**/*json', recursive=True)

# 2) filter only result jsons
result_fns = filter(is_results_json, json_fns)

# 3) pair legs of simulations together into dict of dicts
legs = defaultdict(dict)

for result_fn in result_fns:
result = load_results(result_fn)
if result is None:
continue
elif result['estimate'] is None or result['uncertainty'] is None:
click.echo(f"WARNING: Calculations for {result_fn} did not finish succesfully!",
err=True)

try:
names = get_names(result)
except KeyError:
raise ValueError("Failed to guess names")
try:
simtype = get_type(result)
except KeyError:
simtype = legacy_get_type(result_fn)

legs[names][simtype] = result['estimate'], result['uncertainty']

writer = csv.writer(
output,
delimiter="\t",
lineterminator="\n", # to exactly reproduce previous, prefer "\r\n"
)

# 5a) write out MLE values
# 5b) write out DDG values
# 5c) write out each leg
for ligpair, vals in sorted(legs.items()):
name = ', '.join(ligpair)
for simtype, (m, u) in sorted(vals.items()):
if m is None:
m, u = 'NaN', 'NaN'
else:
m, u = dp2(m.m), dp2(u.m)
output.write(f'DG{simtype}({name})\t{simtype}\t{ligpair[0]}\t'
f'{ligpair[1]}\t{m}\t{u}\n')
writing_func = {
'dg': _write_dg_mle,
'ddg': _write_ddg,
'dg-raw': _write_dg_raw,
}[report.lower()]
writing_func(legs, writer)


PLUGIN = OFECommandPlugin(
command=gather,
section='Quickrun Executor',
requires_ofe=(0, 6),
)

if __name__ == "__main__":
gather()
19 changes: 19 additions & 0 deletions openfecli/tests/clicktypes/test_hyphenchoice.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import pytest
import click
from openfecli.clicktypes.hyphenchoice import HyphenAwareChoice

class TestHyphenAwareChoice:
@pytest.mark.parametrize('value', [
"foo_bar_baz", "foo_bar-baz", "foo-bar_baz", "foo-bar-baz"
])
def test_init(self, value):
ch = HyphenAwareChoice([value])
assert ch.choices == ["foo-bar-baz"]

@pytest.mark.parametrize('value', [
"foo_bar_baz", "foo_bar-baz", "foo-bar_baz", "foo-bar-baz"
])
def test_convert(self, value):
ch = HyphenAwareChoice(['foo-bar-baz'])
# counting on __call__ to get to convert()
assert ch(value) == "foo-bar-baz"
19 changes: 18 additions & 1 deletion openfecli/tests/commands/test_gather.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,24 @@
import tarfile
import pytest

from openfecli.commands.gather import gather
from openfecli.commands.gather import (
gather, format_estimate_uncertainty, _get_column
)

@pytest.mark.parametrize('est,unc,unc_prec,est_str,unc_str', [
(12.432, 0.111, 2, "12.43", "0.11"),
(0.9999, 0.01, 2, "1.000", "0.010"),
(1234, 100, 2, "1230", "100"),
])
def test_format_estimate_uncertainty(est, unc, unc_prec, est_str, unc_str):
assert format_estimate_uncertainty(est, unc, unc_prec) == (est_str, unc_str)

@pytest.mark.parametrize('val, col', [
(1.0, 1), (0.1, -1), (-0.0, 0), (0.0, 0), (0.2, -1), (0.9, -1),
(0.011, -2), (9, 1), (10, 2), (15, 2),
])
def test_get_column(val, col):
assert _get_column(val) == col


@pytest.fixture
Expand Down