More JOSS feedback (#142)
* footnote about local install
* example adaptive usage
* misc edits (better tests, faster Dockerfile, ...)
stsievert authored Apr 14, 2023
1 parent 452ac94 commit fb3fde1
Showing 9 changed files with 98 additions and 116 deletions.
76 changes: 42 additions & 34 deletions .github/workflows/test.yml
@@ -13,54 +13,62 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Setup Python 3.7
uses: actions/setup-python@v2
- name: Install micromamba
uses: mamba-org/provision-with-micromamba@main
with:
python-version: 3.7
- name: Add conda to system path
run: |
# $CONDA is an environment variable pointing to the root of the miniconda directory
echo $CONDA/bin >> $GITHUB_PATH
environment-file: salmon.lock.yml
- name: Debug info
shell: bash -l {0}
run: |
echo "Running pwd..."
pwd
echo "\nRunning echo $USER..."
echo $USER
echo "\nRunning chown..."
sudo chown -R -H -L $USER:$USER /home/runner/work/salmon/
chown -R -H -L $USER:$USER /home/runner/work/salmon/
echo "\nRunning chmod..."
sudo chmod -R 777 /home/runner/work/salmon/
echo "\nRunning `ls`..."
ls
echo " which {python, pip, pytest}"
which python
which pip
echo " sudo dollar sigh which {python, pip, pytest}"
sudo echo $(which python)
sudo echo $(which pip)
sudo echo $(which pytest)
echo " sudo which {python, pip, pytest}"
sudo which python
sudo which pip
echo " Running pwd..."
pwd
echo " Running echo $USER..."
echo $USER
echo " Running chown..."
sudo chown -R -H -L $USER:$USER /home/runner/work/salmon/
chown -R -H -L $USER:$USER /home/runner/work/salmon/
echo " Running chmod..."
sudo chmod -R 777 /home/runner/work/salmon/
echo " Running `ls`..."
ls
- name: Prepare for docker build...
shell: bash -l {0}
run: |
chmod +x launch.sh
# chown -R -H -L $USER:$USER .
sudo chown -R -H -L $USER:$USER .
sudo chown -R -H -L $USER:$USER salmon docs tests
- name: Install dependencies
run: |
conda env update --file salmon.lock.yml --name base
chmod +x launch.sh
# chown -R -H -L $USER:$USER .
sudo chown -R -H -L $USER:$USER .
sudo chown -R -H -L $USER:$USER salmon docs tests
- name: Install Salmon
shell: bash -l {0}
run: |
pip install .
pip install pytest
sudo $(which pip) install .
sudo $(which pip) install pytest
- name: Run tests in salmon/salmon/
run: sudo /usr/share/miniconda/bin/pytest salmon/
shell: bash -l {0}
run: |
sudo $(which pytest) salmon/
- uses: docker/setup-buildx-action@v1
with:
driver: docker
- name: Build Salmon server w/ Docker
env:
SALMON_NO_AUTH: 1
run: |
sudo --preserve-env=SALMON_NO_AUTH docker-compose up &
until curl 127.0.0.1:8421 > /dev/null 2>&1; do :; done # wait for container to start
sudo docker ps
sudo --preserve-env=SALMON_NO_AUTH docker-compose up &
until curl 127.0.0.1:8421 > /dev/null 2>&1; do :; done # wait for container to start
sudo docker ps
- name: Run all tests
shell: bash -l {0}
run: |
# sudo docker-compose logs -f & # if debugging; shows logs
# sudo /usr/share/miniconda/bin/pytest -s
sudo /usr/share/miniconda/bin/pytest
# sudo docker-compose logs -f & # if debugging; shows logs
sudo $(which pytest)
64 changes: 0 additions & 64 deletions .github/workflows/test_conda.yml

This file was deleted.

30 changes: 18 additions & 12 deletions paper/paper.md
@@ -9,11 +9,17 @@ authors:
- name: Scott Sievert
orcid: 0000-0002-4275-3452
affiliation: 1
- name: Robert Nowak
affiliation: 1
- name: Timothy Rogers
orcid: 0000-0001-6304-755X
affiliation: 1
affiliations:
- name: University of Wisconsin--Madison
index: 1
date: 09 April 2022
date: 11 March 2023
bibliography: paper.bib
draft: true
---

# Summary
@@ -27,7 +33,7 @@ choices on query selection (aka active machine learning or adaptive sampling) wh
collecting relative
similarity judgments from crowdsourcing participants. Salmon is usable by experimentalists
because it requires little to no programming experience and only requires an
Amazon AWS account for launching. Extensive simulations and experiments suggest
Amazon AWS account for launching (though a local install is available). Extensive simulations and experiments suggest
that Salmon requires 2 to 3 times fewer responses than random sampling.

# Statement of need
@@ -41,7 +47,7 @@ Typically, experimentalists require an inordinate number of human responses (abo
10,000) to produce an accurate embedding when making a similarity map in
$d=2$ dimensions of $n = 50$ chemistry molecules [@chem].
The number of human responses required will scale like
$\mathcal{O}(nd\log n)$, which means that asking about $n=100$ molecules for $d=3$ dimensions will require about 35,000 responses.
$\mathcal{O}(nd\log n)$, which means that asking about $n=100$ molecules for $d=3$ dimensions will likely require about 35,000 responses.
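For a rough sense of that scaling, here is an illustrative back-of-the-envelope calculation only, with the constant back-solved from the $n = 50$, $d = 2$ figure above:

```python
import numpy as np

# Back-solve the constant from the reference point above:
# roughly 10,000 responses for n = 50 items in d = 2 dimensions.
C = 10_000 / (50 * 2 * np.log(50))

# Apply the same O(n * d * log n) rule to n = 100 items in d = 3 dimensions.
n, d = 100, 3
print(round(C * n * d * np.log(n)))  # roughly 35,000 responses
```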

Many "active machine learning" methods have been proposed to reduce the number
of queries required [@ckl; @ste]. These show gains, at least offline when
@@ -63,7 +69,7 @@ the underlying noise model. With a naive computation, scoring a single query req
floating point operations (FLOPs), and the embedding typically requires significant
computation [@soe; @ma2019fast], though some work has been done to reduce the amount of computation [@erkle].
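As a generic illustration of why scoring one query is cheap while scoring every candidate query is not, here is a sketch of a distance-based "ambiguity" score; this is not Salmon's actual scorer, just an assumption-laden stand-in:

```python
import numpy as np

def query_ambiguity(embedding: np.ndarray, head: int, left: int, right: int) -> float:
    """Score one triplet query against the current embedding.

    Queries whose two candidates sit at nearly equal distances from the
    head are the most ambiguous, and hence the most informative to ask.
    Each call touches only a handful of d-dimensional vectors, but there
    are O(n^3) candidate queries that could be scored.
    """
    h, l, r = embedding[head], embedding[left], embedding[right]
    return -abs(np.linalg.norm(h - l) - np.linalg.norm(h - r))
```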

# Salmon
# Design goals

Salmon's main design goals are below:

@@ -91,28 +97,28 @@ framework, PyTorch [@pytorch]. This allows for easy customization of the
underlying optimization method during both online and offline computation, including by the experimentalist managing
Salmon if so desired.
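As a loose sketch of that kind of customization (plain PyTorch, not Salmon's internal API; the loss below is a placeholder), swapping the optimizer that drives the embedding fit is a one-line change:

```python
import torch

n, d = 50, 2
embedding = torch.nn.Parameter(torch.randn(n, d))

# Any torch.optim optimizer (or different hyperparameters) can be swapped in here.
opt = torch.optim.Adam([embedding], lr=0.05)
# opt = torch.optim.SGD([embedding], lr=0.1, momentum=0.9)

def loss_fn(emb: torch.Tensor) -> torch.Tensor:
    # Placeholder; a real fit would minimize a triplet loss over the responses.
    return (emb ** 2).sum()

for _ in range(100):
    opt.zero_grad()
    loss_fn(embedding).backward()
    opt.step()
```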

Goal (3) is enabled by a relatively simple launch through Amazon AWS using Amazon Machine Images (AMIs). The AMI for Salmon[^ami]
Goal (3) is enabled by a relatively simple launch through Amazon AWS using Amazon Machine Images (AMIs).[^local] The AMI for Salmon[^ami]
pulls the latest release of Salmon from GitHub and then launches Salmon. After some other tasks (e.g., opening ports), Salmon is ready to be launched. Salmon requires fairly minimal computational resources; all the experiments and simulations were performed with a `t3.xlarge` Amazon EC2 instance, which has 4 cores, 16GB of memory, and costs about $3.98 per day.

After launch, Salmon can start an experiment with stimuli consisting of text, images, video or HTML strings. It provides a mechanism to monitor an ongoing experiment, which includes the following information:

* Basic experiment statistics (e.g., number of unique users, launch date)
* Server performance (e.g., processing time for different endpoints, rate responses received)
* Crowdsourcing participant experience (e.g., new query latency)
* Embedding visualization
* List of targets.
* **Basic experiment statistics:** number of unique users, launch date, etc.
* **Server performance:** processing time for different endpoints, the rate at which responses are received, etc.
* **Client timings,** including response and new query latency.
* **Embedding visualization** and a list of targets in the embedding.

In addition, Salmon provides links to download the responses and configuration. Salmon also supports experiment persistence through downloading and uploading experiments.
The embedding that Salmon generates can be downloaded, at least if active samplers are used. Regardless of the sampler used, Salmon can be used to generate the embeddings offline from the downloaded responses.
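A rough sketch of that offline workflow is below. The class and column names (`OfflineEmbedding`, `head`/`winner`/`loser`) are assumptions drawn from this repository's tests and documentation rather than from this diff; see https://docs.stsievert.com/salmon/ for the authoritative API.

```python
# Assumed API -- verify against docs.stsievert.com/salmon before relying on it.
import pandas as pd
from salmon.triplets.offline import OfflineEmbedding

df = pd.read_csv("responses.csv")               # responses downloaded from the dashboard
X = df[["head", "winner", "loser"]].to_numpy()  # column names are an assumption

n = int(X.max()) + 1                 # number of targets
model = OfflineEmbedding(n=n, d=2)
model.fit(X[:-1000], X[-1000:])      # train / validation split
embedding = model.embedding_         # (n, d) array of target locations
```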

[^local]: A local install is available and requires only Docker. Collecting crowdsourced responses will still require running a web server or gathering responses in person (though a local install may be useful for development).
[^ami]: Details are at [https://docs.stsievert.com/salmon/installation][in]

[in]:https://docs.stsievert.com/salmon/installation

# Uses

Salmon has been used by several groups, including psychologists at UW--Madison,
and Louisiana State University.
Salmon has been used by several groups, including psychologists at the
University of Wisconsin--Madison and Louisiana State University.

# Acknowledgments

2 changes: 1 addition & 1 deletion salmon.yml
@@ -36,7 +36,7 @@ dependencies:
- redis==3.5.* # https://github.com/RedisJSON/redisjson-py/issues/67
- matplotlib
- gunicorn
- python-multipart # optional dep required by gunicorn
- python-multipart # optional dep required by gunicorn for HTML forms
- numpydoc
- pytest
- jupyter-server-proxy # to view Dask dashboard
2 changes: 2 additions & 0 deletions salmon/triplets/samplers/_adaptive_runners.py
@@ -293,6 +293,8 @@ def score(self, X, y, embedding=None):
"""
y_hat = self.predict(X, embedding=embedding)
y = np.asarray(y)
y_hat = np.asarray(y_hat)
return (y_hat == y).mean()
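For reference, a minimal illustration of the accuracy this method returns once both arrays are NumPy (`y` holds the observed winners, `y_hat` the winners predicted from the current embedding; the values below are made up):

```python
import numpy as np

y = np.array([0, 1, 1, 0])      # observed winners
y_hat = np.array([0, 1, 0, 0])  # predicted winners
accuracy = (y_hat == y).mean()  # fraction of responses predicted correctly
assert accuracy == 0.75
```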


@@ -46,23 +46,22 @@ def test_project_and_is_psd(n, d, seed=None):

def test_project_changes_torch():
n, d, seed = 20, 2, None
rng = check_random_state(seed)
X = torch.randn(n, d)

G = gram_utils.gram_matrix(X.numpy())
lamduhs, vecs = LA.eigh(G)
lamduhs[0] = -1
G = vecs.T @ np.diag(lamduhs) @ vecs
G = torch.from_numpy(G)
e, v = torch.symeig(G)
e, v = torch.linalg.eigh(G, UPLO="U")
assert e.min().item() < -0.5

before = G.numpy().copy()
after = gram_utils.onto_psd(G.numpy(), out=G.numpy())

assert not np.allclose(before, after)
assert not torch.allclose(torch.from_numpy(before), G)
e, v = torch.symeig(G)
e, v = torch.linalg.eigh(G, UPLO="U")
assert e.min() > -0.25


31 changes: 31 additions & 0 deletions tests/test_offline.py
@@ -1,5 +1,7 @@
from pathlib import Path
import yaml
import random
from typing import Dict

import numpy as np
import numpy.linalg as LA
@@ -11,6 +13,7 @@
from salmon.triplets.samplers import TSTE
import salmon.triplets.offline

ArrayLike = np.ndarray

def test_salmon_import():
"""This test makes sure that no errors are raised on import
@@ -130,6 +133,34 @@ def test_offline_names_correct():
assert (em["target"] == config["targets"]).all()


def _answer(q: Dict[str, int], X: ArrayLike) -> int:
h = X[q["head"]]
l = X[q["left"]]
r = X[q["right"]]
if LA.norm(h - l) < LA.norm(h - r):
return q["left"]
return q["right"]

def test_offline_adaptive(n=10, d=1):
rng = np.random.RandomState(42)
X = rng.uniform(size=(n, d))
val_queries = np.asarray([rng.choice(n, size=3, replace=False) for _ in range(5000)])
val_ans = np.asarray([0 if LA.norm(X[h] - X[l]) < LA.norm(X[h] - X[r]) else 1 for h, l, r in val_queries])

sampler = TSTE(n=n, d=d, alpha=1, R=1)
score0 = sampler.score(val_queries, val_ans)
for t in range(20):
queries, scores, _ = sampler.get_queries()
_good_queries = queries[np.argsort(scores)[-4:]]
good_queries = [{"head": h, "left": o1, "right": o2} for h, o1, o2 in _good_queries]
answers = [{"winner": _answer(q, X), **q} for q in good_queries]
sampler.process_answers(answers)

scorem1 = sampler.score(val_queries, val_ans)
assert 0 <= score0 <= scorem1 <= 1
assert score0 + 0.1 < scorem1, "Accuracy improves by at least 0.1 after 80 answers"


if __name__ == "__main__":
test_offline_init()
test_offline_embedding_random_state()
2 changes: 1 addition & 1 deletion tests/test_passive.py
@@ -31,7 +31,7 @@ def test_validation_sampling(server, logs):
data = []
puid = "adsfjkl4awjklra"

n_repeat = 3
n_repeat = 4
server.authorize()
server.post("/init_exp", data={"exp": exp})
Q = []
2 changes: 1 addition & 1 deletion tests/test_validation.py
@@ -28,7 +28,7 @@ def test_validation_sampling(server, logs):
data = []
puid = "adsfjkl4awjklra"

n_repeat = 3
n_repeat = 4
server.authorize()
server.post("/init_exp", data={"exp": exp})
Q = []
