From fb3fde1ea512145de613ba4eb9ff651a617f6f8b Mon Sep 17 00:00:00 2001 From: Scott Sievert Date: Thu, 13 Apr 2023 22:04:27 -0400 Subject: [PATCH] More JOSS feedback (#142) * footnote about local install * example adaptive usage * misc edits (better tests, faster Dockerfile, ...) --- .github/workflows/test.yml | 76 ++++++++++--------- .github/workflows/test_conda.yml | 64 ---------------- paper/paper.md | 30 +++++--- salmon.yml | 2 +- salmon/triplets/samplers/_adaptive_runners.py | 2 + .../adaptive/search/tests/test_gram_utils.py | 5 +- tests/test_offline.py | 31 ++++++++ tests/test_passive.py | 2 +- tests/test_validation.py | 2 +- 9 files changed, 98 insertions(+), 116 deletions(-) delete mode 100644 .github/workflows/test_conda.yml diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index a913a9f0..fbcde9b9 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -13,42 +13,50 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - - name: Setup Python 3.7 - uses: actions/setup-python@v2 + - name: Install micromamba + uses: mamba-org/provision-with-micromamba@main with: - python-version: 3.7 - - name: Add conda to system path - run: | - # $CONDA is an environment variable pointing to the root of the miniconda directory - echo $CONDA/bin >> $GITHUB_PATH + environment-file: salmon.lock.yml - name: Debug info + shell: bash -l {0} run: | - echo "Running pwd..." - pwd - echo "\nRunning echo $USER..." - echo $USER - echo "\nRunning chown..." - sudo chown -R -H -L $USER:$USER /home/runner/work/salmon/ - chown -R -H -L $USER:$USER /home/runner/work/salmon/ - echo "\nRunning chmod..." - sudo chmod -R 777 /home/runner/work/salmon/ - echo "\nRunning `ls`..." 
- ls + echo " which {python, pip, pytest}" + which python + which pip + echo " sudo dollar sigh which {python, pip, pytest}" + sudo echo $(which python) + sudo echo $(which pip) + sudo echo $(which pytest) + echo " sudo which {python, pip, pytest}" + sudo which python + sudo which pip + echo " Running pwd..." + pwd + echo " Running echo $USER..." + echo $USER + echo " Running chown..." + sudo chown -R -H -L $USER:$USER /home/runner/work/salmon/ + chown -R -H -L $USER:$USER /home/runner/work/salmon/ + echo " Running chmod..." + sudo chmod -R 777 /home/runner/work/salmon/ + echo " Running `ls`..." + ls - name: Prepare for docker build... + shell: bash -l {0} run: | - chmod +x launch.sh - # chown -R -H -L $USER:$USER . - sudo chown -R -H -L $USER:$USER . - sudo chown -R -H -L $USER:$USER salmon docs tests - - name: Install dependencies - run: | - conda env update --file salmon.lock.yml --name base + chmod +x launch.sh + # chown -R -H -L $USER:$USER . + sudo chown -R -H -L $USER:$USER . + sudo chown -R -H -L $USER:$USER salmon docs tests - name: Install Salmon + shell: bash -l {0} run: | - pip install . - pip install pytest + sudo $(which pip) install . 
+ sudo $(which pip) install pytest - name: Run tests in salmon/salmon/ - run: sudo /usr/share/miniconda/bin/pytest salmon/ + shell: bash -l {0} + run: | + sudo $(which pytest) salmon/ - uses: docker/setup-buildx-action@v1 with: driver: docker @@ -56,11 +64,11 @@ jobs: env: SALMON_NO_AUTH: 1 run: | - sudo --preserve-env=SALMON_NO_AUTH docker-compose up & - until curl 127.0.0.1:8421 > /dev/null 2>&1; do :; done # wait for container to start - sudo docker ps + sudo --preserve-env=SALMON_NO_AUTH docker-compose up & + until curl 127.0.0.1:8421 > /dev/null 2>&1; do :; done # wait for container to start + sudo docker ps - name: Run all tests + shell: bash -l {0} run: | - # sudo docker-compose logs -f & # if debugging; shows logs - # sudo /usr/share/miniconda/bin/pytest -s - sudo /usr/share/miniconda/bin/pytest + # sudo docker-compose logs -f & # if debugging; shows logs + sudo $(which pytest) diff --git a/.github/workflows/test_conda.yml b/.github/workflows/test_conda.yml deleted file mode 100644 index 34a3823a..00000000 --- a/.github/workflows/test_conda.yml +++ /dev/null @@ -1,64 +0,0 @@ -name: Tests (install w/ conda) - -on: - push: - branches: [ master ] - pull_request: - branches: [ master ] - -jobs: - - build: - - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - name: Setup Python 3.7 - uses: actions/setup-python@v2 - with: - python-version: 3.7 - - name: Add conda to system path - run: | - # $CONDA is an environment variable pointing to the root of the miniconda directory - echo $CONDA/bin >> $GITHUB_PATH - - name: Debug info - run: | - echo "Running pwd..." - pwd - echo "\nRunning echo $USER..." - echo $USER - echo "\nRunning chown..." - sudo chown -R -H -L $USER:$USER /home/runner/work/salmon/ - chown -R -H -L $USER:$USER /home/runner/work/salmon/ - echo "\nRunning chmod..." - sudo chmod -R 777 /home/runner/work/salmon/ - echo "\nRunning `ls`..." - ls - - name: Prepare for docker build... 
- run: | - chmod +x launch.sh - # chown -R -H -L $USER:$USER . - sudo chown -R -H -L $USER:$USER . - sudo chown -R -H -L $USER:$USER salmon docs tests - - name: Install dependencies - run: | - conda env update --file salmon.yml --name base - - name: Install Salmon - run: pip install -e . - - name: Run tests in salmon/salmon/ - run: sudo /usr/share/miniconda/bin/pytest salmon/ - - uses: docker/setup-buildx-action@v1 - with: - driver: docker - - name: Build Salmon server w/ Docker - env: - SALMON_NO_AUTH: 1 - run: | - sudo --preserve-env=SALMON_NO_AUTH docker-compose up & - until curl 127.0.0.1:8421 > /dev/null 2>&1; do :; done # wait for container to start - sudo docker ps - - name: Run all tests - run: | - # sudo docker-compose logs -f & # if debugging; shows logs - # sudo /usr/share/miniconda/bin/pytest -s - sudo /usr/share/miniconda/bin/pytest diff --git a/paper/paper.md b/paper/paper.md index d99ef3e1..fd168d0a 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -9,11 +9,17 @@ authors: - name: Scott Sievert orcid: 0000-0002-4275-3452 affiliation: 1 + - name: Robert Nowak + affiliation: 1 + - name: Timothy Rogers + orcid: 0000-0001-6304-755X + affiliation: 1 affiliations: - name: University of Wisconsin--Madison index: 1 -date: 09 April 2022 +date: 11 March 2023 bibliography: paper.bib +draft: true --- # Summary @@ -27,7 +33,7 @@ choices on query selection (aka active machine learning or adaptive sampling) wh collecting relative similarity judgments from crowdsourcing participants. Salmon is usable by experimentalists because it requires little to no programming experience and only requires an -Amazon AWS account for launching. Extensive simulations and experiments suggest +Amazon AWS account for launching (though a local install is available). Extensive simulations and experiments suggest that Salmon requires 2 to 3 times fewer response than random sampling. 
# Statement of need @@ -41,7 +47,7 @@ Typically, experimentalists require an inordinate number of human responses (abo 10,000) to produce an accurate embedding when making a similarity map in $d=2$ dimensions of $n = 50$ chemistry molecules [@chem]. The number of human responses required will scale like -$\mathcal{O}(nd\log n)$, which means that asking about $n=100$ molecules for $d=3$ dimensions will require about 35,000 responses. +$\mathcal{O}(nd\log n)$, which means that asking about $n=100$ molecules for $d=3$ dimensions will likely require about 35,000 responses. Many "active machine learning" methods have been proposed to reduce the number of queries required [@ckl; @ste]. These show gains, at least offline when @@ -63,7 +69,7 @@ the underlying noise model. With a naive computation, scoring a single query req floating point operations (FLOPs), and the embedding typically requires significant computation [@soe; @ma2019fast], though some work has been done to reduce the amount of computation [@erkle]. -# Salmon +# Design goals Salmon's main design goals are below: @@ -91,28 +97,28 @@ framework, PyTorch [@pytorch]. This allows for easy customization of the underlying optimization method during both online and offline computation, including by the experimentalist managing Salmon if so desired. -Goal (3) is enabled by a relatively simple launch through Amazon AWS using Amazon Machine Images (AMIs). The AMI for Salmon[^ami] +Goal (3) is enabled by a relatively simple launch through Amazon AWS using Amazon Machine Images (AMIs).[^local] The AMI for Salmon[^ami] pulls the latest release of Salmon from GitHub and then launches Salmon. After some other tasks (e.g., opening ports, etc), Salmon is ready be launched. Salmon requires fairly minimal computational resources; all the experiments and simulation were performed with `t3.xlarge` Amazon EC2 instance, which has 4 cores, 16GB of memory and costs about $3.98 per day. 
After launch, Salmon can start an experiment with stimuli consisting of text, images, video or HTML strings. It provides a mechanism to monitor an ongoing experiment, which includes the following information: -* Basic experiment statistics (e.g., number of unique users, launch date) -* Server performance (e.g., processing time for different endpoints, rate responses received) -* Crowdsourcing participant experience (e.g., new query latency) -* Embedding visualization -* List of targets. +* **Basic experiment statistics:** number of unique users, launch date, etc. +* **Server performance:** processing time for different endpoints, rate of responses received, etc. +* **Client timings,** including response and new query latency. +* **Embedding visualization** and a list of targets in the embedding. In addition, Salmon provides links to download the responses and configuration. Salmon also supports experiment persistence through downloading and uploading experiments. The embedding that Salmon generates can be downloaded, at least if active samplers are used. Regardless of the sampler used, Salmon can be used to generate the embeddings offline from the downloaded responses. +[^local]:A local install is available, and only requires Docker. Collection of crowdsourced responses will require running a web server or collecting in-person responses (though a local install may be useful for development). [^ami]:Details are at [https://docs.stsievert.com/salmon/installation][in] [in]:https://docs.stsievert.com/salmon/installation # Uses -Salmon has been used by several groups, including psychologists at UW--Madison, -and Louisiana State University. +Salmon has been used by several groups, including psychologists at the +University of Wisconsin--Madison and Louisiana State University. 
# Acknowledgments diff --git a/salmon.yml b/salmon.yml index a48ebd7b..2814e013 100644 --- a/salmon.yml +++ b/salmon.yml @@ -36,7 +36,7 @@ dependencies: - redis==3.5.* # https://github.com/RedisJSON/redisjson-py/issues/67 - matplotlib - gunicorn - - python-multipart # optional dep required by gunicorn + - python-multipart # optional dep required by gunicorn for HTML forms - numpydoc - pytest - jupyter-server-proxy # to view Dask dashboard diff --git a/salmon/triplets/samplers/_adaptive_runners.py b/salmon/triplets/samplers/_adaptive_runners.py index 83db50ba..a5651f86 100644 --- a/salmon/triplets/samplers/_adaptive_runners.py +++ b/salmon/triplets/samplers/_adaptive_runners.py @@ -293,6 +293,8 @@ def score(self, X, y, embedding=None): """ y_hat = self.predict(X, embedding=embedding) + y = np.asarray(y) + y_hat = np.asarray(y_hat) return (y_hat == y).mean() diff --git a/salmon/triplets/samplers/adaptive/search/tests/test_gram_utils.py b/salmon/triplets/samplers/adaptive/search/tests/test_gram_utils.py index fc50ae08..2b39178c 100644 --- a/salmon/triplets/samplers/adaptive/search/tests/test_gram_utils.py +++ b/salmon/triplets/samplers/adaptive/search/tests/test_gram_utils.py @@ -46,7 +46,6 @@ def test_project_and_is_psd(n, d, seed=None): def test_project_changes_torch(): n, d, seed = 20, 2, None - rng = check_random_state(seed) X = torch.randn(n, d) G = gram_utils.gram_matrix(X.numpy()) @@ -54,7 +53,7 @@ def test_project_changes_torch(): lamduhs[0] = -1 G = vecs.T @ np.diag(lamduhs) @ vecs G = torch.from_numpy(G) - e, v = torch.symeig(G) + e, v = torch.linalg.eigh(G, UPLO="U") assert e.min().item() < -0.5 before = G.numpy().copy() @@ -62,7 +61,7 @@ def test_project_changes_torch(): assert not np.allclose(before, after) assert not torch.allclose(torch.from_numpy(before), G) - e, v = torch.symeig(G) + e, v = torch.linalg.eigh(G, UPLO="U") assert e.min() > -0.25 diff --git a/tests/test_offline.py b/tests/test_offline.py index b8dc0cc0..1a645c80 100644 --- 
a/tests/test_offline.py +++ b/tests/test_offline.py @@ -1,5 +1,7 @@ from pathlib import Path import yaml +import random +from typing import Dict import numpy as np import numpy.linalg as LA @@ -11,6 +13,7 @@ from salmon.triplets.samplers import TSTE import salmon.triplets.offline +ArrayLike = np.ndarray def test_salmon_import(): """This test makes sure that no errors are raised on import @@ -130,6 +133,34 @@ def test_offline_names_correct(): assert (em["target"] == config["targets"]).all() +def _answer(q: Dict[str, int], X: ArrayLike) -> int: + h = X[q["head"]] + l = X[q["left"]] + r = X[q["right"]] + if LA.norm(h - l) < LA.norm(h - r): + return q["left"] + return q["right"] + +def test_offline_adaptive(n=10, d=1): + rng = np.random.RandomState(42) + X = rng.uniform(size=(n, d)) + val_queries = np.asarray([rng.choice(n, size=3, replace=False) for _ in range(5000)]) + val_ans = np.asarray([0 if LA.norm(X[h] - X[l]) < LA.norm(X[h] - X[r]) else 1 for h, l, r in val_queries]) + + sampler = TSTE(n=n, d=d, alpha=1, R=1) + score0 = sampler.score(val_queries, val_ans) + for t in range(20): + queries, scores, _ = sampler.get_queries() + _good_queries = queries[np.argsort(scores)[-4:]] + good_queries = [{"head": h, "left": o1, "right": o2} for h, o1, o2 in _good_queries] + answers = [{"winner": _answer(q, X), **q} for q in good_queries] + sampler.process_answers(answers) + + scorem1 = sampler.score(val_queries, val_ans) + assert 0 <= score0 <= scorem1 <= 1 + assert score0 + 0.1 < scorem1, "Improves by at least 10% after 200 answers" + + if __name__ == "__main__": test_offline_init() test_offline_embedding_random_state() diff --git a/tests/test_passive.py b/tests/test_passive.py index b07b0dea..cb742f81 100644 --- a/tests/test_passive.py +++ b/tests/test_passive.py @@ -31,7 +31,7 @@ def test_validation_sampling(server, logs): data = [] puid = "adsfjkl4awjklra" - n_repeat = 3 + n_repeat = 4 server.authorize() server.post("/init_exp", data={"exp": exp}) Q = [] diff --git 
a/tests/test_validation.py b/tests/test_validation.py index 3d916962..96667e0e 100644 --- a/tests/test_validation.py +++ b/tests/test_validation.py @@ -28,7 +28,7 @@ def test_validation_sampling(server, logs): data = [] puid = "adsfjkl4awjklra" - n_repeat = 3 + n_repeat = 4 server.authorize() server.post("/init_exp", data={"exp": exp}) Q = []