From d6eb5fc53d58bc25d17f017e36334693d4339845 Mon Sep 17 00:00:00 2001
From: Patrik Keller
Date: Mon, 25 Mar 2024 10:28:19 +0100
Subject: [PATCH] Tweak hyperparams

I found these uncommitted changes when coming back to the project after
three months. No idea whether these hyperparameters are worth much.
---
 gym/cpr_gym_rs/hyperparams/dqn.yml | 35 +++++++++++++++++++++++++++++-
 gym/cpr_gym_rs/hyperparams/ppo.yml |  9 ++++++--
 2 files changed, 41 insertions(+), 3 deletions(-)

diff --git a/gym/cpr_gym_rs/hyperparams/dqn.yml b/gym/cpr_gym_rs/hyperparams/dqn.yml
index 0c911b7e..119af5c6 100644
--- a/gym/cpr_gym_rs/hyperparams/dqn.yml
+++ b/gym/cpr_gym_rs/hyperparams/dqn.yml
@@ -1,4 +1,37 @@
+default:
+  buffer_size: 1_000_000 # how many experiences recorded for sampling
+  train_freq: [4, 'step'] # how regularly to update / length of rollout
+  gradient_steps: 1 # how many updates per rollout (at the end of each rollout)
+  learning_rate: 0.0001 # weight of individual update
+  batch_size: 32 # how many experiences to sample for each update
+  learning_starts: 50_000 # steps before learning starts
+  gamma: 0.99 # discount factor
+  target_update_interval: 10_000 # how often to update target network (steps)
+  tau: 1.0 # weight of target network update; 1 implies hard update
+  exploration_fraction: 0.1 # fraction of entire training period over which the exploration rate is reduced
+  exploration_initial_eps: 1.0 # initial value of random action probability
+  exploration_final_eps: 0.05 # final value of random action probability
+  max_grad_norm: 10
+  stats_window_size: 100 # Rollout logging: number of episodes to average episode length and reward
+  policy: 'MlpPolicy'
+  policy_kwargs: "dict(net_arch=[64, 64])"
+
 FC16SSZwPT-v0:
   n_envs: 24
-  n_timesteps: !!float 1e6
+  n_timesteps: 50_000_000
+  buffer_size: 5_000_000 # how many experiences recorded for sampling
+  train_freq: 10_000 # how regularly to update / length of rollout in steps
+  gradient_steps: 100 # how many updates per rollout (at the end of each rollout)
+  learning_rate: 0.001 # weight of individual update
+  batch_size: 500 # how many experiences to sample for each update
+  learning_starts: 500_000 # steps before learning starts
+  gamma: 1 # discount factor
+  target_update_interval: 10_000 # how often to update target network (steps)
+  tau: 0.01 # weight of target network update; 1 implies hard update
+  exploration_fraction: 0.1 # fraction of entire training period over which the exploration rate is reduced
+  exploration_initial_eps: 1.0 # initial value of random action probability
+  exploration_final_eps: 0.1 # final value of random action probability
+  max_grad_norm: 10
+  stats_window_size: 10_000 # Rollout logging: number of episodes to average episode length and reward
   policy: 'MlpPolicy'
+  policy_kwargs: "dict(net_arch=[64, 64])"
diff --git a/gym/cpr_gym_rs/hyperparams/ppo.yml b/gym/cpr_gym_rs/hyperparams/ppo.yml
index 0c911b7e..5ce8878b 100644
--- a/gym/cpr_gym_rs/hyperparams/ppo.yml
+++ b/gym/cpr_gym_rs/hyperparams/ppo.yml
@@ -1,4 +1,9 @@
 FC16SSZwPT-v0:
-  n_envs: 24
-  n_timesteps: !!float 1e6
+  n_timesteps: 4_000_000
+  n_envs: 32 # = size of rollout buffer / n_steps
+  n_steps: 8192 # = size of rollout buffer / n_envs
+  learning_rate: 0.0001 # weight of each update
+  batch_size: 512 # number of steps to consider per update
+  n_epochs: 10 # how often to process each rollout buffer
+  ent_coef: 0.05 # entropy, exploration term
   policy: 'MlpPolicy'
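
Note on the dqn.yml overrides (not part of the patch): the keys follow the
stable-baselines3 convention, so the FC16SSZwPT-v0 entry would translate into
a direct DQN call roughly as below. This is a minimal sketch; that
stable-baselines3 is the trainer behind these YAML files, and that importing
cpr_gym_rs registers the FC16SSZwPT-v0 environment id, are both assumptions.

    import cpr_gym_rs  # noqa: F401 -- assumed to register FC16SSZwPT-v0
    from stable_baselines3 import DQN
    from stable_baselines3.common.env_util import make_vec_env

    env = make_vec_env("FC16SSZwPT-v0", n_envs=24)

    model = DQN(
        "MlpPolicy",
        env,
        buffer_size=5_000_000,          # replay buffer: experiences kept for sampling
        train_freq=10_000,              # env steps collected between update phases
        gradient_steps=100,             # gradient updates per update phase
        learning_rate=0.001,
        batch_size=500,                 # experiences sampled per gradient update
        learning_starts=500_000,        # warm-up steps before learning begins
        gamma=1,                        # undiscounted returns
        target_update_interval=10_000,  # steps between target-network updates
        tau=0.01,                       # soft target update; 1.0 would be a hard copy
        exploration_fraction=0.1,       # fraction of training over which epsilon decays
        exploration_initial_eps=1.0,
        exploration_final_eps=0.1,
        max_grad_norm=10,
        stats_window_size=10_000,       # episodes averaged for rollout logging
        policy_kwargs=dict(net_arch=[64, 64]),
    )
    model.learn(total_timesteps=50_000_000)

Compared to the default block, this trades update frequency for batch work:
update phases run every 10,000 steps instead of every 4, but perform 100
gradient steps each, and target updates become soft (tau=0.01) rather than
hard copies.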
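
Similarly for ppo.yml: PPO's rollout buffer spans n_envs * n_steps =
32 * 8192 = 262,144 transitions, which batch_size=512 divides evenly into 512
minibatches per epoch. A sketch under the same assumptions as above
(stable-baselines3 trainer, cpr_gym_rs registering the env id):

    import cpr_gym_rs  # noqa: F401 -- assumed to register FC16SSZwPT-v0
    from stable_baselines3 import PPO
    from stable_baselines3.common.env_util import make_vec_env

    env = make_vec_env("FC16SSZwPT-v0", n_envs=32)

    model = PPO(
        "MlpPolicy",
        env,
        n_steps=8192,          # per env; buffer = 32 * 8192 = 262,144 transitions
        learning_rate=0.0001,
        batch_size=512,        # minibatch size; splits the buffer into 512 minibatches
        n_epochs=10,           # passes over each rollout buffer
        ent_coef=0.05,         # entropy bonus, keeps exploration up
    )
    model.learn(total_timesteps=4_000_000)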