From 837a97e5eb7fc96f27eb18577c9db17a091e7ff4 Mon Sep 17 00:00:00 2001
From: Patrik Keller
Date: Sun, 12 May 2024 16:10:26 +0200
Subject: [PATCH] Revise shutdown and initial value estimate

---
 mdp/fc16sapirshtein.py | 46 +++++++++++++++++++++++++-----------------
 mdp/measure-rtdp.py    |  8 ++++----
 mdp/model.py           | 28 +++++++++++++++++++++++--
 mdp/rtdp.py            | 26 ++++++++++++++++++------
 4 files changed, 77 insertions(+), 31 deletions(-)

diff --git a/mdp/fc16sapirshtein.py b/mdp/fc16sapirshtein.py
index abc1060a..10a44ead 100644
--- a/mdp/fc16sapirshtein.py
+++ b/mdp/fc16sapirshtein.py
@@ -177,25 +177,33 @@ def honest(self, s: BState) -> list[Action]:
         return ADOPT
 
     def shutdown(self, s: BState) -> list[Transition]:
-        # Rewards and progress are calculated on common chain. Terminating with
-        # a no-op is already fair.
-        # return [Transition(state=s, probability=1, reward=0, progress=0)]
-        # NOTE In principle, we could do and award a full release here, but this
-        # would change the model. Maybe evaluate this separately.
-        snew = BState(a=0, h=0, fork=IRRELEVANT)
-        if s.h > s.a:
-            return [Transition(state=snew, probability=1, reward=0, progress=s.h)]
-        if s.a > s.h:
-            return [Transition(state=snew, probability=1, reward=s.a, progress=s.a)]
-        if s.a == s.h:
-            return [
-                Transition(
-                    state=snew, probability=self.gamma, reward=s.a, progress=s.a
-                ),
-                Transition(
-                    state=snew, probability=1 - self.gamma, reward=0, progress=s.h
-                ),
-            ]
+        # Abort attack in favor of attacker; go back to start.
+        ts = []
+        for snew, p in self.start():
+            if s.h > s.a:
+                ts.append(Transition(state=snew, probability=p, reward=0, progress=s.h))
+            elif s.a > s.h:
+                ts.append(
+                    Transition(state=snew, probability=p, reward=s.a, progress=s.a)
+                )
+            elif s.a == s.h:
+                ts.append(
+                    Transition(
+                        state=snew, probability=p * self.gamma, reward=s.a, progress=s.a
+                    )
+                )
+                ts.append(
+                    Transition(
+                        state=snew,
+                        probability=p * (1 - self.gamma),
+                        reward=0,
+                        progress=s.h,
+                    )
+                )
+            else:
+                raise Exception("logic error")
+        assert mdp.sum_to_one([t.probability for t in ts])
+        return ts
 
 
 mappable_params = dict(alpha=0.125, gamma=0.25)
diff --git a/mdp/measure-rtdp.py b/mdp/measure-rtdp.py
index a6ca5c98..02e58497 100644
--- a/mdp/measure-rtdp.py
+++ b/mdp/measure-rtdp.py
@@ -28,15 +28,15 @@
     dict(alpha=1 / 4, gamma=1 / 4, attacker="weak"),
     dict(alpha=1 / 3, gamma=1 / 3, attacker="intermediate"),
     dict(
-        alpha=0.42, gamma=0.82, attacker="strong"
+        alpha=0.42, gamma=0.84, attacker="strong"
     ),  # TODO double check whether we can do 1/2
 ]
 
 rows = [
     dict(row=1, protocol="bitcoin", model="fc16", trunc=40, algo="aft20", ref=1),
-    # dict(row=2, protocol="bitcoin", model="aft20", trunc=40, algo="aft20", ref=1),
+    dict(row=2, protocol="bitcoin", model="aft20", trunc=40, algo="aft20", ref=1),
     dict(row=3, protocol="bitcoin", model="fc16", trunc=40, algo="rtdp", ref=1),
-    # dict(row=4, protocol="bitcoin", model="aft20", trunc=40, algo="rtdp", ref=1),
+    dict(row=4, protocol="bitcoin", model="aft20", trunc=40, algo="rtdp", ref=1),
     # dict(row=5, protocol="bitcoin", model="fc16", trunc=0, algo="rtdp", ref=1),
     # dict(row=6, protocol="bitcoin", model="aft20", trunc=0, algo="rtdp", ref=1),
     # dict(row=7, protocol="bitcoin", model="generic", trunc=10, algo="aft20", ref=1),
@@ -168,7 +168,7 @@ def implicit_mdp(*args, model, protocol, trunc, alpha, gamma, **kwargs):
 argp.add_argument("--rtdp_eps", type=float, default=0.2, metavar="FLOAT")
 argp.add_argument("--rtdp_es", type=float, default=0.9, metavar="FLOAT")
 argp.add_argument("--rtdp_steps", type=int, default=50_000, metavar="INT")
-argp.add_argument("--vi_delta", type=float, default=0.001, metavar="FLOAT")
+argp.add_argument("--vi_delta", type=float, default=0.01, metavar="FLOAT")
 args = argp.parse_args()
 
 # Single measurement
diff --git a/mdp/model.py b/mdp/model.py
index 65f5b20a..0b8145fc 100644
--- a/mdp/model.py
+++ b/mdp/model.py
@@ -92,6 +92,9 @@ def actions(self, state):
         else:
             return self.unwrapped.actions(state)
 
+    def continue_probability_of_progress(self, progress):
+        return (1.0 - (1.0 / self.horizon)) ** progress
+
     def apply(self, action, state):
         assert state is not self.terminal
 
@@ -101,7 +104,7 @@ def apply(self, action, state):
             if t.progress == 0.0:
                 transitions.append(t)
             else:
-                continue_p = (1.0 - (1.0 / self.horizon)) ** t.progress
+                continue_p = self.continue_probability_of_progress(t.progress)
                 assert 0 < continue_p < 1
                 # one transition for continuing
                 continue_t = Transition(
@@ -136,4 +139,25 @@ def shutdown(self, state):
         if state is self.terminal:
             return []
         else:
-            return self.unwrapped.shutdown(state)
+            ts = []
+            for t in self.unwrapped.shutdown(state):
+                continue_p = self.continue_probability_of_progress(t.progress)
+                ts.append(
+                    Transition(
+                        probability=t.probability * continue_p,
+                        state=t.state,
+                        reward=t.reward,
+                        progress=t.progress,
+                        effect=t.effect,
+                    )
+                )
+                ts.append(
+                    Transition(
+                        probability=t.probability * (1 - continue_p),
+                        state=self.terminal,
+                        reward=t.reward,
+                        progress=t.progress,
+                        effect=t.effect,
+                    )
+                )
+            return ts
diff --git a/mdp/rtdp.py b/mdp/rtdp.py
index 7d6f9655..02d26b9f 100644
--- a/mdp/rtdp.py
+++ b/mdp/rtdp.py
@@ -164,8 +164,8 @@ def step(self):
         if n_actions < 1:
             # no action available, terminal state
             self.reset()
-            assert state.value == 0
-            assert state.progress == 0
+            # assert state.value == 0
+            # assert state.progress == 0
             return
 
         # value iteration step:
@@ -272,7 +272,7 @@ def state_and_hash_of_full_state(self, full_state):
         else:
             state = State()
             self.states[state_hash] = state
-            state.value = self.initial_value_estimate(full_state)
+            state.value, state.progress = self.initial_value_estimate(full_state)
 
         return state, state_hash
 
@@ -283,11 +283,25 @@ def initial_value_estimate(self, full_state):
         # - guide exploration by evaluating the honest policy
         # - do a fair shutdown to get a partial estimate of the states potential
 
-        value = 0
+        v = 0.0
+        p = 0.0
         for t in self.model.shutdown(full_state):
-            value += t.probability * t.reward
+            immediate_v = t.reward
+            immediate_p = t.progress
+
+            state_hash = collision_resistant_hash(t.state)
+            if state_hash in self.states:
+                state = self.states[state_hash]
+                future_v = state.value
+                future_p = state.progress
+            else:
+                future_v = 0
+                future_p = 0
+
+            v += t.probability * (immediate_v + future_v)
+            p += t.probability * (immediate_p + future_p)
 
-        return value
+        return v, p
 
     def mdp(self):
         # The agent operates on a partially explored MDP,
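
Note on the revised horizon handling (illustrative sketch with hypothetical names, not part of the patch itself): the wrapper in mdp/model.py ends an episode after each unit of progress with probability 1/horizon, so a transition with progress d continues with probability (1 - 1/horizon) ** d, and the wrapped shutdown() now splits every shutdown transition between the restarted state and the terminal state with the same rule. The standalone sketch below mirrors that rule for unit-progress steps and shows that the expected total progress per episode comes out close to the configured horizon.

# Standalone sketch (hypothetical helper name). Mirrors the geometric rule
# continue_p = (1.0 - 1.0 / horizon) ** progress used in mdp/model.py.
import random


def simulate_total_progress(horizon: int, rng: random.Random) -> int:
    """Accumulate unit-progress steps until the per-unit shutdown rule fires."""
    continue_p = 1.0 - 1.0 / horizon  # continuation probability per unit of progress
    total = 0
    while True:
        total += 1  # reward/progress of a transition accrue even when it terminates
        if rng.random() >= continue_p:
            return total  # shutdown branch, analogous to the terminal Transition


if __name__ == "__main__":
    rng = random.Random(42)
    horizon = 100
    mean = sum(simulate_total_progress(horizon, rng) for _ in range(10_000)) / 10_000
    print(mean)  # close to `horizon`: the wrapper enforces the horizon in expectation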