From 837a97e5eb7fc96f27eb18577c9db17a091e7ff4 Mon Sep 17 00:00:00 2001
From: Patrik Keller
Date: Sun, 12 May 2024 16:10:26 +0200
Subject: [PATCH] Revise shutdown and initial value estimate

---
 mdp/fc16sapirshtein.py | 46 +++++++++++++++++++++++++-----------------
 mdp/measure-rtdp.py    |  8 ++++----
 mdp/model.py           | 28 +++++++++++++++++++++++--
 mdp/rtdp.py            | 26 ++++++++++++++++++------
 4 files changed, 77 insertions(+), 31 deletions(-)

diff --git a/mdp/fc16sapirshtein.py b/mdp/fc16sapirshtein.py
index abc1060a..10a44ead 100644
--- a/mdp/fc16sapirshtein.py
+++ b/mdp/fc16sapirshtein.py
@@ -177,25 +177,33 @@ def honest(self, s: BState) -> list[Action]:
         return ADOPT
 
     def shutdown(self, s: BState) -> list[Transition]:
-        # Rewards and progress are calculated on common chain. Terminating with
-        # a no-op is already fair.
-        # return [Transition(state=s, probability=1, reward=0, progress=0)]
-        # NOTE In principle, we could do and award a full release here, but this
-        # would change the model. Maybe evaluate this separately.
-        snew = BState(a=0, h=0, fork=IRRELEVANT)
-        if s.h > s.a:
-            return [Transition(state=snew, probability=1, reward=0, progress=s.h)]
-        if s.a > s.h:
-            return [Transition(state=snew, probability=1, reward=s.a, progress=s.a)]
-        if s.a == s.h:
-            return [
-                Transition(
-                    state=snew, probability=self.gamma, reward=s.a, progress=s.a
-                ),
-                Transition(
-                    state=snew, probability=1 - self.gamma, reward=0, progress=s.h
-                ),
-            ]
+        # Abort attack in favor of attacker; go back to start.
+        ts = []
+        for snew, p in self.start():
+            if s.h > s.a:
+                ts.append(Transition(state=snew, probability=p, reward=0, progress=s.h))
+            elif s.a > s.h:
+                ts.append(
+                    Transition(state=snew, probability=p, reward=s.a, progress=s.a)
+                )
+            elif s.a == s.h:
+                ts.append(
+                    Transition(
+                        state=snew, probability=p * self.gamma, reward=s.a, progress=s.a
+                    )
+                )
+                ts.append(
+                    Transition(
+                        state=snew,
+                        probability=p * (1 - self.gamma),
+                        reward=0,
+                        progress=s.h,
+                    )
+                )
+            else:
+                raise Exception("logic error")
+        assert mdp.sum_to_one([t.probability for t in ts])
+        return ts
 
 
 mappable_params = dict(alpha=0.125, gamma=0.25)
diff --git a/mdp/measure-rtdp.py b/mdp/measure-rtdp.py
index a6ca5c98..02e58497 100644
--- a/mdp/measure-rtdp.py
+++ b/mdp/measure-rtdp.py
@@ -28,15 +28,15 @@
     dict(alpha=1 / 4, gamma=1 / 4, attacker="weak"),
     dict(alpha=1 / 3, gamma=1 / 3, attacker="intermediate"),
     dict(
-        alpha=0.42, gamma=0.82, attacker="strong"
+        alpha=0.42, gamma=0.84, attacker="strong"
     ),  # TODO double check whether we can do 1/2
 ]
 
 rows = [
     dict(row=1, protocol="bitcoin", model="fc16", trunc=40, algo="aft20", ref=1),
-    # dict(row=2, protocol="bitcoin", model="aft20", trunc=40, algo="aft20", ref=1),
+    dict(row=2, protocol="bitcoin", model="aft20", trunc=40, algo="aft20", ref=1),
     dict(row=3, protocol="bitcoin", model="fc16", trunc=40, algo="rtdp", ref=1),
-    # dict(row=4, protocol="bitcoin", model="aft20", trunc=40, algo="rtdp", ref=1),
+    dict(row=4, protocol="bitcoin", model="aft20", trunc=40, algo="rtdp", ref=1),
     # dict(row=5, protocol="bitcoin", model="fc16", trunc=0, algo="rtdp", ref=1),
     # dict(row=6, protocol="bitcoin", model="aft20", trunc=0, algo="rtdp", ref=1),
     # dict(row=7, protocol="bitcoin", model="generic", trunc=10, algo="aft20", ref=1),
@@ -168,7 +168,7 @@ def implicit_mdp(*args, model, protocol, trunc, alpha, gamma, **kwargs):
 argp.add_argument("--rtdp_eps", type=float, default=0.2, metavar="FLOAT")
 argp.add_argument("--rtdp_es", type=float, default=0.9, metavar="FLOAT")
 argp.add_argument("--rtdp_steps", type=int, default=50_000, metavar="INT")
-argp.add_argument("--vi_delta", type=float, default=0.001, metavar="FLOAT")
+argp.add_argument("--vi_delta", type=float, default=0.01, metavar="FLOAT")
 args = argp.parse_args()
 
 # Single measurement
diff --git a/mdp/model.py b/mdp/model.py
index 65f5b20a..0b8145fc 100644
--- a/mdp/model.py
+++ b/mdp/model.py
@@ -92,6 +92,9 @@ def actions(self, state):
         else:
             return self.unwrapped.actions(state)
 
+    def continue_probability_of_progress(self, progress):
+        return (1.0 - (1.0 / self.horizon)) ** progress
+
     def apply(self, action, state):
         assert state is not self.terminal
 
@@ -101,7 +104,7 @@ def apply(self, action, state):
             if t.progress == 0.0:
                 transitions.append(t)
             else:
-                continue_p = (1.0 - (1.0 / self.horizon)) ** t.progress
+                continue_p = self.continue_probability_of_progress(t.progress)
                 assert 0 < continue_p < 1
                 # one transition for continuing
                 continue_t = Transition(
@@ -136,4 +139,25 @@ def shutdown(self, state):
         if state is self.terminal:
             return []
         else:
-            return self.unwrapped.shutdown(state)
+            ts = []
+            for t in self.unwrapped.shutdown(state):
+                continue_p = self.continue_probability_of_progress(t.progress)
+                ts.append(
+                    Transition(
+                        probability=t.probability * continue_p,
+                        state=t.state,
+                        reward=t.reward,
+                        progress=t.progress,
+                        effect=t.effect,
+                    )
+                )
+                ts.append(
+                    Transition(
+                        probability=t.probability * (1 - continue_p),
+                        state=self.terminal,
+                        reward=t.reward,
+                        progress=t.progress,
+                        effect=t.effect,
+                    )
+                )
+            return ts
diff --git a/mdp/rtdp.py b/mdp/rtdp.py
index 7d6f9655..02d26b9f 100644
--- a/mdp/rtdp.py
+++ b/mdp/rtdp.py
@@ -164,8 +164,8 @@ def step(self):
         if n_actions < 1:
             # no action available, terminal state
             self.reset()
-            assert state.value == 0
-            assert state.progress == 0
+            # assert state.value == 0
+            # assert state.progress == 0
             return
 
         # value iteration step:
@@ -272,7 +272,7 @@ def state_and_hash_of_full_state(self, full_state):
         else:
             state = State()
             self.states[state_hash] = state
-            state.value = self.initial_value_estimate(full_state)
+            state.value, state.progress = self.initial_value_estimate(full_state)
 
         return state, state_hash
 
@@ -283,11 +283,25 @@ def initial_value_estimate(self, full_state):
         # - guide exploration by evaluating the honest policy
         # - do a fair shutdown to get a partial estimate of the states potential
 
-        value = 0
+        v = 0.0
+        p = 0.0
         for t in self.model.shutdown(full_state):
-            value += t.probability * t.reward
+            immediate_v = t.reward
+            immediate_p = t.progress
+
+            state_hash = collision_resistant_hash(t.state)
+            if state_hash in self.states:
+                state = self.states[state_hash]
+                future_v = state.value
+                future_p = state.progress
+            else:
+                future_v = 0
+                future_p = 0
+
+            v += t.probability * (immediate_v + future_v)
+            p += t.probability * (immediate_p + future_p)
 
-        return value
+        return v, p
 
     def mdp(self):
         # The agent operates on a partially explored MDP,
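
Note on the revised horizon handling (illustrative sketch with hypothetical names, not part of the patch itself): the wrapper in mdp/model.py ends an episode after each unit of progress with probability 1/horizon, so a transition with progress d continues with probability (1 - 1/horizon) ** d, and the wrapped shutdown() now splits every shutdown transition between the restarted state and the terminal state with the same rule. The standalone sketch below mirrors that rule for unit-progress steps and shows that the expected total progress per episode comes out close to the configured horizon.

# Standalone sketch (hypothetical helper name). Mirrors the geometric rule
# continue_p = (1.0 - 1.0 / horizon) ** progress used in mdp/model.py.
import random


def simulate_total_progress(horizon: int, rng: random.Random) -> int:
    """Accumulate unit-progress steps until the per-unit shutdown rule fires."""
    continue_p = 1.0 - 1.0 / horizon  # continuation probability per unit of progress
    total = 0
    while True:
        total += 1  # reward/progress of a transition accrue even when it terminates
        if rng.random() >= continue_p:
            return total  # shutdown branch, analogous to the terminal Transition


if __name__ == "__main__":
    rng = random.Random(42)
    horizon = 100
    mean = sum(simulate_total_progress(horizon, rng) for _ in range(10_000)) / 10_000
    print(mean)  # close to `horizon`: the wrapper enforces the horizon in expectation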