Guide exploration along honest policy

pkel · Apr 21, 2024 · 7edb89b · 7edb89b
1 parent ecce520
commit 7edb89b
Show file tree

Hide file tree

Showing 5 changed files with 45 additions and 7 deletions.
diff --git a/mdp/aft20barzur.py b/mdp/aft20barzur.py
@@ -79,6 +79,12 @@ def actions(self, s: BState) -> list[Action]:
         actions.append(ADOPT)
         return actions
 
+    def honest(self, s: BState) -> Action:
+        """Honest policy: OVERRIDE when s.a > s.h, otherwise ADOPT."""
+        if s.a > s.h:
+            return OVERRIDE
+        return ADOPT
+
     def apply_wait(self, s: BState) -> list[Transition]:
         t = []
         if s.fork != ACTIVE:

diff --git a/mdp/mcvi_test.py b/mdp/mcvi_test.py
@@ -11,7 +11,7 @@
 
 
 def mcvi(model, *args, horizon=100, steps=10000, eps=0.1, report_steps=None, **kwargs):
-    agent = MCVI(model, eps=eps, horizon=horizon)
+    agent = MCVI(model, eps=eps, horizon=horizon, **kwargs)
 
     j = 0
     for i in range(steps):
@@ -54,8 +54,14 @@ def test_mcvi(*args, **kwargs):
 
 
 if __name__ == "__main__":
-    model = SelfishMining(
-        Bitcoin(), alpha=0.30, gamma=1, maximum_size=20, merge_isomorphic=False
+    problem = dict(alpha=0.30, gamma=0.8)
+
+    model_a = SelfishMining(
+        Bitcoin(), **problem, maximum_size=20, merge_isomorphic=False
+    )
+    # mcvi(model_a, steps=1000000, report_steps=50, horizon=30, eps = 0.1, eps_honest = 0.1)
+
+    model_b = aft20barzur.BitcoinSM(**problem, maximum_fork_length=10000)
+    mcvi(
+        model_b, steps=1000000, report_steps=10000, horizon=30, eps=0.1, eps_honest=0.1
     )
-    #  model = aft20barzur.BitcoinSM(alpha=0.30, gamma=1, maximum_fork_length=10000)
-    mcvi(model, steps=1000000, report_steps=50, horizon=30)
diff --git a/mdp/model.py b/mdp/model.py
@@ -44,3 +44,9 @@ def apply(self, a: Action, s: State) -> list[Transition]:
         Define state transitions. Action a is applied to state s.
         """
         raise NotImplementedError
+
+    def honest(self, s: State) -> Action:
+        """
+        Honest policy: the action an honest participant takes in state s.
+        """
+        raise NotImplementedError
diff --git a/mdp/monte_carlo_value_iteration.py b/mdp/monte_carlo_value_iteration.py
@@ -13,13 +13,17 @@ def sample(lst, p: lambda x: x[0]):
 
 
 class MCVI:
-    def __init__(self, model: Model, *args, horizon: int, eps: float):
+    def __init__(
+        self, model: Model, *args, horizon: int, eps: float, eps_honest: float = 0
+    ):
         assert 0 < eps < 1
+        assert 0 <= eps_honest < 1
         assert horizon > 0
 
         self.model = model
         self.horizon = horizon
         self.eps = eps
+        self.eps_honest = eps_honest
 
         self.state = None  # current model state
         self.state_id = None  # current integer state
@@ -114,9 +118,14 @@ def step(self):
 
         # epsilon greedy policy
         i = max_i
-        if random.random() < self.eps:
+        x = random.random()
+        if x < self.eps:
             # explore randomly
             i = random.randrange(n)
+        elif x < self.eps + self.eps_honest:
+            # explore along honest policy
+            a = self.model.honest(state)
+            i = actions.index(a)
 
         # apply action & transition
         to = sample(action_transitions[i], lambda x: x.probability)

diff --git a/mdp/sm.py b/mdp/sm.py
@@ -758,6 +758,17 @@ def actions(self, s: State) -> list[Action]:
 
         return actions
 
+    def honest(self, s: State) -> Action:
+        """Honest policy: release first, then consider, otherwise continue."""
+        e = self.editor
+        e.load(s)  # load s into the editor before querying it
+
+        if len(e.to_release()) > 0:
+            return Release(0)
+        if len(e.to_consider()) > 0:
+            return Consider(0)
+        return Continue()
+
     def apply(self, a: Action, s: State) -> list[Transition]:
         if isinstance(a, Release):
             return self.apply_release(a.i, s)