From 6c636291adce428b46da7f85b30e301982a0b650 Mon Sep 17 00:00:00 2001 From: Tilps Date: Sat, 7 Dec 2019 11:26:59 +1100 Subject: [PATCH 01/39] Silence some warnings in 1.14 at the expense of making 1.14 the minimum version --- tf/chunkparser.py | 10 +-- tf/mixprec.py | 4 +- tf/net_to_model.py | 8 +-- tf/tfprocess.py | 172 ++++++++++++++++++++++----------------------- tf/train.py | 4 +- tf/update_steps.py | 8 +-- tf/upgrade.py | 20 +++--- 7 files changed, 113 insertions(+), 113 deletions(-) diff --git a/tf/chunkparser.py b/tf/chunkparser.py index 4a5d169d..9f654a23 100644 --- a/tf/chunkparser.py +++ b/tf/chunkparser.py @@ -141,10 +141,10 @@ def parse_function(planes, probs, winner, q): """ Convert unpacked record batches to tensors for tensorflow training """ - planes = tf.decode_raw(planes, tf.float32) - probs = tf.decode_raw(probs, tf.float32) - winner = tf.decode_raw(winner, tf.float32) - q = tf.decode_raw(q, tf.float32) + planes = tf.io.decode_raw(planes, tf.float32) + probs = tf.io.decode_raw(probs, tf.float32) + winner = tf.io.decode_raw(winner, tf.float32) + q = tf.io.decode_raw(q, tf.float32) planes = tf.reshape(planes, (ChunkParser.BATCH_SIZE, 112, 8*8)) probs = tf.reshape(probs, (ChunkParser.BATCH_SIZE, 1858)) @@ -426,7 +426,7 @@ def test_tensorflow_parsing(self): best_q = best_q.reshape(batch_size, 3) # Pass it through tensorflow - with tf.Session() as sess: + with tf.compat.v1.Session() as sess: graph = ChunkParser.parse_function(data[0], data[1], data[2], data[3]) tf_planes, tf_probs, tf_winner, tf_q = sess.run(graph) diff --git a/tf/mixprec.py b/tf/mixprec.py index 889fb6ab..cf161ac0 100644 --- a/tf/mixprec.py +++ b/tf/mixprec.py @@ -16,7 +16,7 @@ def float32_variable_storage_getter(getter, name, shape=None, dtype=None, if trainable and dtype != tf.float32: cast_name = name + '/fp16_cast' try: - cast_variable = tf.get_default_graph().get_tensor_by_name( + cast_variable = tf.compat.v1.get_default_graph().get_tensor_by_name( cast_name + ':0') except KeyError: cast_variable = tf.cast(variable, dtype, name=cast_name) @@ -25,7 +25,7 @@ def float32_variable_storage_getter(getter, name, shape=None, dtype=None, return variable -class LossScalingOptimizer(tf.train.Optimizer): +class LossScalingOptimizer(tf.compat.v1.train.Optimizer): """An optimizer that scales loss and un-scales gradients.""" def __init__(self, optimizer, diff --git a/tf/net_to_model.py b/tf/net_to_model.py index 7e6f52f0..8d3190d6 100755 --- a/tf/net_to_model.py +++ b/tf/net_to_model.py @@ -28,10 +28,10 @@ weights = net.get_weights() x = [ - tf.placeholder(tf.float32, [None, 112, 8*8]), - tf.placeholder(tf.float32, [None, 1858]), - tf.placeholder(tf.float32, [None, 3]), - tf.placeholder(tf.float32, [None, 3]), + tf.compat.v1.placeholder(tf.float32, [None, 112, 8*8]), + tf.compat.v1.placeholder(tf.float32, [None, 1858]), + tf.compat.v1.placeholder(tf.float32, [None, 3]), + tf.compat.v1.placeholder(tf.float32, [None, 3]), ] tfp = tfprocess.TFProcess(cfg) diff --git a/tf/tfprocess.py b/tf/tfprocess.py index 8719e402..c8268050 100644 --- a/tf/tfprocess.py +++ b/tf/tfprocess.py @@ -43,10 +43,10 @@ def weight_variable(shape, name=None, dtype=tf.float32): stddev = trunc_correction * np.sqrt(2.0 / (fan_in + fan_out)) # Do not use a constant as the initializer, that will cause the # variable to be stored in wrong dtype. 
- weights = tf.get_variable( + weights = tf.compat.v1.get_variable( name, shape, dtype=dtype, - initializer=tf.truncated_normal_initializer(stddev=stddev, dtype=dtype)) - tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, weights) + initializer=tf.compat.v1.truncated_normal_initializer(stddev=stddev)) + tf.compat.v1.add_to_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES, weights) return weights # Bias weights for layers not followed by BatchNorm @@ -55,11 +55,11 @@ def weight_variable(shape, name=None, dtype=tf.float32): def bias_variable(shape, name=None, dtype=tf.float32): - return tf.get_variable(name, shape, dtype=dtype, - initializer=tf.zeros_initializer()) + return tf.compat.v1.get_variable(name, shape, dtype=dtype, + initializer=tf.compat.v1.zeros_initializer()) def conv2d(x, W): - return tf.nn.conv2d(x, W, data_format='NCHW', + return tf.nn.conv2d(input=x, filters=W, data_format='NCHW', strides=[1, 1, 1, 1], padding='SAME') class TFProcess: @@ -128,25 +128,25 @@ def __init__(self, cfg): self.renorm_max_d = self.cfg['training'].get('renorm_max_d', 0) self.renorm_momentum = self.cfg['training'].get('renorm_momentum', 0.99) - gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.90, + gpu_options = tf.compat.v1.GPUOptions(per_process_gpu_memory_fraction=0.90, allow_growth=True, visible_device_list="{}".format(self.cfg['gpu'])) - config = tf.ConfigProto(gpu_options=gpu_options) - self.session = tf.Session(config=config) + config = tf.compat.v1.ConfigProto(gpu_options=gpu_options) + self.session = tf.compat.v1.Session(config=config) - self.training = tf.placeholder(tf.bool) + self.training = tf.compat.v1.placeholder(tf.bool) self.global_step = tf.Variable(0, name='global_step', trainable=False) - self.learning_rate = tf.placeholder(tf.float32) + self.learning_rate = tf.compat.v1.placeholder(tf.float32) def init(self, dataset, train_iterator, test_iterator): # TF variables - self.handle = tf.placeholder(tf.string, shape=[]) - iterator = tf.data.Iterator.from_string_handle( - self.handle, dataset.output_types, dataset.output_shapes) + self.handle = tf.compat.v1.placeholder(tf.string, shape=[]) + iterator = tf.compat.v1.data.Iterator.from_string_handle( + self.handle, tf.compat.v1.data.get_output_types(dataset), tf.compat.v1.data.get_output_shapes(dataset)) self.next_batch = iterator.get_next() self.train_handle = self.session.run(train_iterator.string_handle()) self.test_handle = self.session.run(test_iterator.string_handle()) # This forces trainable variables to be stored as fp32 - with tf.variable_scope("fp32_storage", + with tf.compat.v1.variable_scope("fp32_storage", custom_getter=float32_variable_storage_getter): self.init_net(self.next_batch) @@ -168,14 +168,14 @@ def init_net(self, next_batch): move_is_legal = tf.greater_equal(self.y_, 0) # replace logits of illegal moves with large negative value (so that it doesn't affect policy of legal moves) without gradient illegal_filler = tf.zeros_like(self.y_conv) - 1.0e10 - self.y_conv = tf.where(move_is_legal, self.y_conv, illegal_filler) + self.y_conv = tf.compat.v1.where_v2(move_is_legal, self.y_conv, illegal_filler) # y_ still has -1 on illegal moves, flush them to 0 self.y_ = tf.nn.relu(self.y_) policy_cross_entropy = \ - tf.nn.softmax_cross_entropy_with_logits(labels=self.y_, + tf.compat.v1.nn.softmax_cross_entropy_with_logits_v2(labels=tf.stop_gradient(self.y_), logits=self.y_conv) - self.policy_loss = tf.reduce_mean(policy_cross_entropy) + self.policy_loss = tf.reduce_mean(input_tensor=policy_cross_entropy) 
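# Note: the _v2 cross-entropy op, unlike its v1 predecessor, also
# backpropagates into `labels`; wrapping the labels in tf.stop_gradient,
# as above, keeps the old one-sided behaviour. A minimal sketch of the
# effect under eager semantics, with illustrative values:

labels = tf.Variable([[0.0, 1.0]])
logits = tf.Variable([[2.0, -1.0]])
with tf.GradientTape() as tape:
    loss = tf.nn.softmax_cross_entropy_with_logits(
        labels=tf.stop_gradient(labels),  # detach the targets
        logits=logits)
# Only the logits receive a gradient; without stop_gradient the op would
# also differentiate the loss with respect to the soft labels.
grads = tape.gradient(loss, [logits, labels])  # -> [tensor, None]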
q_ratio = self.cfg['training'].get('q_ratio', 0) assert 0 <= q_ratio <= 1 @@ -188,22 +188,22 @@ def init_net(self, next_batch): # Loss on value head if self.wdl: value_cross_entropy = \ - tf.nn.softmax_cross_entropy_with_logits(labels=target, + tf.compat.v1.nn.softmax_cross_entropy_with_logits_v2(labels=tf.stop_gradient(target), logits=self.z_conv) - self.value_loss = tf.reduce_mean(value_cross_entropy) + self.value_loss = tf.reduce_mean(input_tensor=value_cross_entropy) scalar_z_conv = tf.matmul(tf.nn.softmax(self.z_conv), wdl) self.mse_loss = \ - tf.reduce_mean(tf.squared_difference(scalar_target, scalar_z_conv)) + tf.reduce_mean(input_tensor=tf.math.squared_difference(scalar_target, scalar_z_conv)) else: self.value_loss = tf.constant(0) self.mse_loss = \ - tf.reduce_mean(tf.squared_difference(scalar_target, self.z_conv)) + tf.reduce_mean(input_tensor=tf.math.squared_difference(scalar_target, self.z_conv)) # Regularizer - regularizer = tf.contrib.layers.l2_regularizer(scale=0.0001) - reg_variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) - self.reg_term = \ - tf.contrib.layers.apply_regularization(regularizer, reg_variables) + regularizer = tf.keras.regularizers.l2(l=0.5 * (0.0001)) + reg_variables = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES) + penalties = [regularizer(w) for w in reg_variables] + self.reg_term = tf.math.add_n(penalties) if self.model_dtype != tf.float32: self.reg_term = tf.cast(self.reg_term, tf.float32) @@ -226,7 +226,7 @@ def init_net(self, next_batch): # You need to change the learning rate here if you are training # from a self-play training set, for example start with 0.005 instead. - opt_op = tf.train.MomentumOptimizer( + opt_op = tf.compat.v1.train.MomentumOptimizer( learning_rate=self.learning_rate, momentum=0.9, use_nesterov=True) opt_op = LossScalingOptimizer(opt_op, scale=self.loss_scale) @@ -244,19 +244,19 @@ def init_net(self, next_batch): var = tf.Variable( tf.zeros(shape=w.shape), name='swa/'+name, trainable=False) accum.append( - tf.assign(var, var * (n / (n + 1.)) + tf.stop_gradient(w) * (1. / (n + 1.)))) - load.append(tf.assign(w, var)) + tf.compat.v1.assign(var, var * (n / (n + 1.)) + tf.stop_gradient(w) * (1. / (n + 1.)))) + load.append(tf.compat.v1.assign(w, var)) with tf.control_dependencies(accum): - self.swa_accum_op = tf.assign_add(n, 1.) + self.swa_accum_op = tf.compat.v1.assign_add(n, 1.) self.swa_load_op = tf.group(*load) # Accumulate (possibly multiple) gradient updates to simulate larger batch sizes than can be held in GPU memory. 
gradient_accum = [tf.Variable(tf.zeros_like( - var.initialized_value()), trainable=False) for var in tf.trainable_variables()] + var.initialized_value()), trainable=False) for var in tf.compat.v1.trainable_variables()] self.zero_op = [var.assign(tf.zeros_like(var)) for var in gradient_accum] - self.update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) + self.update_ops = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS) with tf.control_dependencies(self.update_ops): gradients = opt_op.compute_gradients(loss) self.accum_op = [accum.assign_add( @@ -270,13 +270,13 @@ def init_net(self, next_batch): [(accum, gradient[1]) for accum, gradient in zip(gradient_accum, gradients)], global_step=self.global_step) correct_policy_prediction = \ - tf.equal(tf.argmax(self.y_conv, 1), tf.argmax(self.y_, 1)) + tf.equal(tf.argmax(input=self.y_conv, axis=1), tf.argmax(input=self.y_, axis=1)) correct_policy_prediction = tf.cast(correct_policy_prediction, tf.float32) - self.policy_accuracy = tf.reduce_mean(correct_policy_prediction) + self.policy_accuracy = tf.reduce_mean(input_tensor=correct_policy_prediction) correct_value_prediction = \ - tf.equal(tf.argmax(self.z_conv, 1), tf.argmax(self.z_, 1)) + tf.equal(tf.argmax(input=self.z_conv, axis=1), tf.argmax(input=self.z_, axis=1)) correct_value_prediction = tf.cast(correct_value_prediction, tf.float32) - self.value_accuracy = tf.reduce_mean(correct_value_prediction) + self.value_accuracy = tf.reduce_mean(input_tensor=correct_value_prediction) self.avg_policy_loss = [] self.avg_value_loss = [] @@ -286,18 +286,18 @@ def init_net(self, next_batch): self.last_steps = None # Summary part - self.test_writer = tf.summary.FileWriter( + self.test_writer = tf.compat.v1.summary.FileWriter( os.path.join(os.getcwd(), "leelalogs/{}-test".format(self.cfg['name']))) - self.train_writer = tf.summary.FileWriter( + self.train_writer = tf.compat.v1.summary.FileWriter( os.path.join(os.getcwd(), "leelalogs/{}-train".format(self.cfg['name']))) if self.swa_enabled: - self.swa_writer = tf.summary.FileWriter( + self.swa_writer = tf.compat.v1.summary.FileWriter( os.path.join(os.getcwd(), "leelalogs/{}-swa-test".format(self.cfg['name']))) - self.histograms = [tf.summary.histogram( + self.histograms = [tf.compat.v1.summary.histogram( weight.name, weight) for weight in self.weights] - self.init = tf.global_variables_initializer() - self.saver = tf.train.Saver() + self.init = tf.compat.v1.global_variables_initializer() + self.saver = tf.compat.v1.train.Saver() self.session.run(self.init) @@ -325,7 +325,7 @@ def replace_weights(self, new_weights): shape = [s[i] for i in [3, 2, 0, 1]] new_weight = tf.constant(new_weights[e], shape=shape) all_evals.append(weights.assign( - tf.transpose(new_weight, [2, 3, 1, 0]))) + tf.transpose(a=new_weight, perm=[2, 3, 1, 0]))) elif weights.shape.ndims == 2: # Fully connected layers are [in, out] in TF # @@ -335,11 +335,11 @@ def replace_weights(self, new_weights): shape = [s[i] for i in [1, 0]] new_weight = tf.constant(new_weights[e], shape=shape) all_evals.append(weights.assign( - tf.transpose(new_weight, [1, 0]))) + tf.transpose(a=new_weight, perm=[1, 0]))) else: # Biases, batchnorm etc new_weight = tf.constant(new_weights[e], shape=weights.shape) - all_evals.append(tf.assign(weights, new_weight)) + all_evals.append(tf.compat.v1.assign(weights, new_weight)) self.session.run(all_evals) # This should result in identical file to the starting one # self.save_leelaz_weights('restored.txt') @@ -351,7 +351,7 @@ def restore(self, file): def 
process_loop(self, batch_size, test_batches, batch_splits=1): # Get the initial steps value in case this is a resume from a step count # which is not a multiple of total_steps. - steps = tf.train.global_step(self.session, self.global_step) + steps = tf.compat.v1.train.global_step(self.session, self.global_step) total_steps = self.cfg['training']['total_steps'] for _ in range(steps % total_steps, total_steps): self.process(batch_size, test_batches, batch_splits=batch_splits) @@ -361,7 +361,7 @@ def process(self, batch_size, test_batches, batch_splits=1): self.time_start = time.time() # Get the initial steps value before we do a training step. - steps = tf.train.global_step(self.session, self.global_step) + steps = tf.compat.v1.train.global_step(self.session, self.global_step) if not self.last_steps: self.last_steps = steps @@ -419,7 +419,7 @@ def process(self, batch_size, test_batches, batch_splits=1): feed_dict={self.learning_rate: corrected_lr, self.training: True, self.handle: self.train_handle}) # Update steps since training should have incremented it. - steps = tf.train.global_step(self.session, self.global_step) + steps = tf.compat.v1.train.global_step(self.session, self.global_step) if steps % self.cfg['training']['train_avg_report_steps'] == 0 or steps % self.cfg['training']['total_steps'] == 0: pol_loss_w = self.cfg['training']['policy_loss_weight'] @@ -443,14 +443,14 @@ def process(self, batch_size, test_batches, batch_splits=1): update_ratio_summaries = self.compute_update_ratio( before_weights, after_weights) - train_summaries = tf.Summary(value=[ - tf.Summary.Value(tag="Policy Loss", simple_value=avg_policy_loss), - tf.Summary.Value(tag="Value Loss", simple_value=avg_value_loss), - tf.Summary.Value(tag="Reg term", simple_value=avg_reg_term), - tf.Summary.Value(tag="LR", simple_value=self.lr), - tf.Summary.Value(tag="Gradient norm", + train_summaries = tf.compat.v1.Summary(value=[ + tf.compat.v1.Summary.Value(tag="Policy Loss", simple_value=avg_policy_loss), + tf.compat.v1.Summary.Value(tag="Value Loss", simple_value=avg_value_loss), + tf.compat.v1.Summary.Value(tag="Reg term", simple_value=avg_reg_term), + tf.compat.v1.Summary.Value(tag="LR", simple_value=self.lr), + tf.compat.v1.Summary.Value(tag="Gradient norm", simple_value=grad_norm / batch_splits), - tf.Summary.Value(tag="MSE Loss", simple_value=avg_mse_loss)]) + tf.compat.v1.Summary.Value(tag="MSE Loss", simple_value=avg_mse_loss)]) self.train_writer.add_summary(train_summaries, steps) self.train_writer.add_summary(update_ratio_summaries, steps) self.time_start = time_end @@ -524,18 +524,18 @@ def calculate_test_summaries(self, test_batches, steps): # TODO store value and value accuracy in pb self.net.pb.training_params.accuracy = sum_policy_accuracy if self.wdl: - test_summaries = tf.Summary(value=[ - tf.Summary.Value(tag="Policy Accuracy", simple_value=sum_policy_accuracy), - tf.Summary.Value(tag="Value Accuracy", simple_value=sum_value_accuracy), - tf.Summary.Value(tag="Policy Loss", simple_value=sum_policy), - tf.Summary.Value(tag="Value Loss", simple_value=sum_value), - tf.Summary.Value(tag="MSE Loss", simple_value=sum_mse)]).SerializeToString() + test_summaries = tf.compat.v1.Summary(value=[ + tf.compat.v1.Summary.Value(tag="Policy Accuracy", simple_value=sum_policy_accuracy), + tf.compat.v1.Summary.Value(tag="Value Accuracy", simple_value=sum_value_accuracy), + tf.compat.v1.Summary.Value(tag="Policy Loss", simple_value=sum_policy), + tf.compat.v1.Summary.Value(tag="Value Loss", simple_value=sum_value), + 
tf.compat.v1.Summary.Value(tag="MSE Loss", simple_value=sum_mse)]).SerializeToString() else: - test_summaries = tf.Summary(value=[ - tf.Summary.Value(tag="Policy Accuracy", simple_value=sum_policy_accuracy), - tf.Summary.Value(tag="Policy Loss", simple_value=sum_policy), - tf.Summary.Value(tag="MSE Loss", simple_value=sum_mse)]).SerializeToString() - test_summaries = tf.summary.merge( + test_summaries = tf.compat.v1.Summary(value=[ + tf.compat.v1.Summary.Value(tag="Policy Accuracy", simple_value=sum_policy_accuracy), + tf.compat.v1.Summary.Value(tag="Policy Loss", simple_value=sum_policy), + tf.compat.v1.Summary.Value(tag="MSE Loss", simple_value=sum_mse)]).SerializeToString() + test_summaries = tf.compat.v1.summary.merge( [test_summaries] + self.histograms).eval(session=self.session) self.test_writer.add_summary(test_summaries, steps) print("step {}, policy={:g} value={:g} policy accuracy={:g}% value accuracy={:g}% mse={:g}".\ @@ -552,12 +552,12 @@ def compute_update_ratio(self, before_weights, after_weights): weight_norms = [np.linalg.norm(w.ravel()) for w in before_weights] ratios = [(tensor.name, d / w) for d, w, tensor in zip(delta_norms, weight_norms, self.weights) if not 'moving' in tensor.name] all_summaries = [ - tf.Summary.Value(tag='update_ratios/' + + tf.compat.v1.Summary.Value(tag='update_ratios/' + name, simple_value=ratio) for name, ratio in ratios] ratios = np.log10([r for (_, r) in ratios if 0 < r < np.inf]) all_summaries.append(self.log_histogram('update_ratios_log10', ratios)) - return tf.Summary(value=all_summaries) + return tf.compat.v1.Summary(value=all_summaries) def log_histogram(self, tag, values, bins=1000): """Logs the histogram of a list/vector of values. @@ -571,7 +571,7 @@ def log_histogram(self, tag, values, bins=1000): counts, bin_edges = np.histogram(values, bins=bins) # Fill fields of histogram proto - hist = tf.HistogramProto() + hist = tf.compat.v1.HistogramProto() hist.min = float(np.min(values)) hist.max = float(np.max(values)) hist.num = int(np.prod(values.shape)) @@ -589,7 +589,7 @@ def log_histogram(self, tag, values, bins=1000): for c in counts: hist.bucket.append(c) - return tf.Summary.Value(tag=tag, histo=hist) + return tf.compat.v1.Summary.Value(tag=tag, histo=hist) def update_swa(self): # Add the current weight vars to the running average. 
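The HistogramProto built by hand above is the TF1 way of logging a histogram
of the update ratios; under the TF2 summary API the same record takes a
couple of lines. A sketch, assuming a hypothetical log directory:

    import numpy as np
    import tensorflow as tf

    # Hypothetical path; tf.summary.histogram replaces the manual
    # HistogramProto construction when running with TF2 summaries.
    writer = tf.summary.create_file_writer("leelalogs/example")
    ratios = np.log10(np.random.uniform(1e-4, 1e-2, size=1000))
    with writer.as_default():
        tf.summary.histogram("update_ratios_log10", ratios, step=0)
    writer.flush()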
@@ -604,11 +604,11 @@ def snap_save(self): rest_ops = [] for var in self.weights: if isinstance(var, str): - var = tf.get_default_graph().get_tensor_by_name(var) + var = tf.compat.v1.get_default_graph().get_tensor_by_name(var) name = var.name.split(':')[0] v = tf.Variable(var, name='save/'+name, trainable=False) - save_ops.append(tf.assign(v, var)) - rest_ops.append(tf.assign(var, v)) + save_ops.append(tf.compat.v1.assign(v, var)) + rest_ops.append(tf.compat.v1.assign(var, v)) self.snap_save_op = tf.group(*save_ops) self.snap_restore_op = tf.group(*rest_ops) self.session.run(self.snap_save_op) @@ -637,13 +637,13 @@ def save_leelaz_weights(self, filename): # # Leela/cuDNN/Caffe (kOutputInputYX) # [output, input, filter_size, filter_size] - work_weights = tf.transpose(weights, [3, 2, 0, 1]) + work_weights = tf.transpose(a=weights, perm=[3, 2, 0, 1]) elif weights.shape.ndims == 2: # Fully connected layers are [in, out] in TF # # [out, in] in Leela # - work_weights = tf.transpose(weights, [1, 0]) + work_weights = tf.transpose(a=weights, perm=[1, 0]) else: # Biases, batchnorm etc work_weights = weights @@ -677,7 +677,7 @@ def get_batchnorm_key(self): def add_weights(self, var): if var.name[-11:] == "fp16_cast:0": name = var.name[:-12] + ":0" - var = tf.get_default_graph().get_tensor_by_name(name) + var = tf.compat.v1.get_default_graph().get_tensor_by_name(name) # All trainable variables should be stored as fp32 assert var.dtype.base_dtype == tf.float32 self.weights.append(var) @@ -687,7 +687,7 @@ def batch_norm(self, net, scope, scale=False): # a unique scope that we can store, and use to look them back up # later on. - with tf.variable_scope(scope, custom_getter=float32_variable_storage_getter): + with tf.compat.v1.variable_scope(scope, custom_getter=float32_variable_storage_getter): if self.renorm_enabled: clipping = { "rmin": 1.0/self.renorm_max_r, @@ -695,7 +695,7 @@ def batch_norm(self, net, scope, scale=False): "dmax": self.renorm_max_d } # Renorm has issues with fp16, cast to fp32. 
- net = tf.layers.batch_normalization( + net = tf.compat.v1.layers.batch_normalization( tf.cast(net, tf.float32), epsilon=1e-5, axis=1, fused=True, center=True, scale=scale, renorm=True, renorm_clipping=clipping, @@ -705,7 +705,7 @@ def batch_norm(self, net, scope, scale=False): else: # Virtual batch doesn't work with fp16 virtual_batch = 64 if self.model_dtype == tf.float32 else None - net = tf.layers.batch_normalization( + net = tf.compat.v1.layers.batch_normalization( net, epsilon=1e-5, axis=1, fused=True, center=True, scale=scale, virtual_batch_size=virtual_batch, @@ -718,7 +718,7 @@ def batch_norm(self, net, scope, scale=False): dtype=tf.float32) else: name = "fp32_storage/" + scope + '/batch_normalization/' + v + ':0' - var = tf.get_default_graph().get_tensor_by_name(name) + var = tf.compat.v1.get_default_graph().get_tensor_by_name(name) self.add_weights(var) return net @@ -727,7 +727,7 @@ def squeeze_excitation(self, x, channels, ratio): assert channels % ratio == 0 # NCHW format reduced to NC - net = tf.reduce_mean(x, axis=[2, 3]) + net = tf.reduce_mean(input_tensor=x, axis=[2, 3]) W_fc1 = weight_variable([channels, channels // ratio], name='se_fc1_w', dtype=self.model_dtype) @@ -793,7 +793,7 @@ def residual_block(self, inputs, channels): self.add_weights(W_conv_2) h_bn2 = self.batch_norm(conv2d(h_out_1, W_conv_2), weight_key_2, scale=True) - with tf.variable_scope(weight_key_2): + with tf.compat.v1.variable_scope(weight_key_2): h_se = self.squeeze_excitation(h_bn2, channels, self.SE_ratio) h_out_2 = tf.nn.relu(tf.add(h_se, orig)) @@ -826,7 +826,7 @@ def construct_net(self, planes): dtype=self.model_dtype) self.add_weights(W_pol_conv) - tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, b_pol_conv) + tf.compat.v1.add_to_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES, b_pol_conv) self.add_weights(b_pol_conv) conv_pol2 = tf.nn.bias_add( @@ -834,7 +834,7 @@ def construct_net(self, planes): h_conv_pol_flat = tf.reshape(conv_pol2, [-1, 80*8*8]) fc1_init = tf.constant(lc0_az_policy_map.make_map(), dtype=self.model_dtype) - W_fc1 = tf.get_variable("policy_map", + W_fc1 = tf.compat.v1.get_variable("policy_map", initializer=fc1_init, trainable=False, dtype=self.model_dtype) @@ -852,7 +852,7 @@ def construct_net(self, planes): b_fc1 = bias_variable([1858], name='fc1/bias', dtype=self.model_dtype) self.add_weights(W_fc1) - tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, b_fc1) + tf.compat.v1.add_to_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES, b_fc1) self.add_weights(b_fc1) h_fc1 = tf.add(tf.matmul(h_conv_pol_flat, W_fc1), b_fc1, name='policy_head') @@ -882,7 +882,7 @@ def construct_net(self, planes): if not self.wdl: h_fc3 = tf.nn.tanh(h_fc3) else: - tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, b_fc3) + tf.compat.v1.add_to_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES, b_fc3) return h_fc1, h_fc3 diff --git a/tf/train.py b/tf/train.py index 7e9189cf..9384fb66 100755 --- a/tf/train.py +++ b/tf/train.py @@ -123,7 +123,7 @@ def main(cmd): train_parser.parse, output_types=(tf.string, tf.string, tf.string, tf.string)) dataset = dataset.map(ChunkParser.parse_function) dataset = dataset.prefetch(4) - train_iterator = dataset.make_one_shot_iterator() + train_iterator = tf.compat.v1.data.make_one_shot_iterator(dataset) shuffle_size = int(shuffle_size*(1.0-train_ratio)) test_parser = ChunkParser(FileDataSrc(test_chunks), @@ -132,7 +132,7 @@ def main(cmd): test_parser.parse, output_types=(tf.string, tf.string, tf.string, tf.string)) dataset = 
dataset.map(ChunkParser.parse_function) dataset = dataset.prefetch(4) - test_iterator = dataset.make_one_shot_iterator() + test_iterator = tf.compat.v1.data.make_one_shot_iterator(dataset) tfprocess = TFProcess(cfg) tfprocess.init(dataset, train_iterator, test_iterator) diff --git a/tf/update_steps.py b/tf/update_steps.py index 0d7795ad..49f357f2 100644 --- a/tf/update_steps.py +++ b/tf/update_steps.py @@ -17,10 +17,10 @@ def main(cmd): os.makedirs(root_dir) x = [ - tf.placeholder(tf.float32, [None, 112, 8*8]), - tf.placeholder(tf.float32, [None, 1858]), - tf.placeholder(tf.float32, [None, 3]), - tf.placeholder(tf.float32, [None, 3]), + tf.compat.v1.placeholder(tf.float32, [None, 112, 8*8]), + tf.compat.v1.placeholder(tf.float32, [None, 1858]), + tf.compat.v1.placeholder(tf.float32, [None, 3]), + tf.compat.v1.placeholder(tf.float32, [None, 3]), ] tfprocess = TFProcess(cfg) diff --git a/tf/upgrade.py b/tf/upgrade.py index d18f7648..49e3c11d 100644 --- a/tf/upgrade.py +++ b/tf/upgrade.py @@ -17,10 +17,10 @@ def main(cmd): os.makedirs(root_dir) x = [ - tf.placeholder(tf.float32, [None, 112, 8*8]), - tf.placeholder(tf.float32, [None, 1858]), - tf.placeholder(tf.float32, [None, 3]), - tf.placeholder(tf.float32, [None, 3]), + tf.compat.v1.placeholder(tf.float32, [None, 112, 8*8]), + tf.compat.v1.placeholder(tf.float32, [None, 1858]), + tf.compat.v1.placeholder(tf.float32, [None, 3]), + tf.compat.v1.placeholder(tf.float32, [None, 3]), ] tfprocess = TFProcess(cfg) @@ -28,20 +28,20 @@ def main(cmd): if os.path.exists(os.path.join(root_dir, 'checkpoint')): cp = tf.train.latest_checkpoint(root_dir) - reader = tf.train.NewCheckpointReader(cp) + reader = tf.compat.v1.train.NewCheckpointReader(cp) saved_shapes = reader.get_variable_to_shape_map() new_names = sorted( - [var.name.split(':')[0] for var in tf.global_variables() + [var.name.split(':')[0] for var in tf.compat.v1.global_variables() if var.name.split(':')[0] not in saved_shapes]) for saved_var_name in new_names: print("New name {} will use default value".format(saved_var_name)) var_names = sorted( - [(var.name, var.name.split(':')[0]) for var in tf.global_variables() + [(var.name, var.name.split(':')[0]) for var in tf.compat.v1.global_variables() if var.name.split(':')[0] in saved_shapes]) restore_vars = [] restore_names = [] for var_name, saved_var_name in var_names: - curr_var = tf.get_default_graph().get_tensor_by_name(var_name) + curr_var = tf.compat.v1.get_default_graph().get_tensor_by_name(var_name) var_shape = curr_var.get_shape().as_list() if var_shape == saved_shapes[saved_var_name]: restore_vars.append(curr_var) @@ -53,13 +53,13 @@ def main(cmd): if name not in restore_names]) for saved_var_name in legacy_names: print("Dropping {} as no longer used".format(saved_var_name)) - opt_saver = tf.train.Saver(restore_vars) + opt_saver = tf.compat.v1.train.Saver(restore_vars) opt_saver.restore(tfprocess.session, cp) else: print("No checkpoint to upgrade!") exit(1) - steps = tf.train.global_step(tfprocess.session, tfprocess.global_step) + steps = tf.compat.v1.train.global_step(tfprocess.session, tfprocess.global_step) path = os.path.join(root_dir, cfg['name']) save_path = tfprocess.saver.save(tfprocess.session, path, global_step=steps) tfprocess.session.close() From a9f7129d1b344eca0599aa53f9f0994c5ffa1986 Mon Sep 17 00:00:00 2001 From: Tilps Date: Sat, 7 Dec 2019 11:29:10 +1100 Subject: [PATCH 02/39] Update requirements. 
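
Some of the tf.compat.v1 spellings adopted in the previous commit only
resolve on 1.14 and later, so the pin moves up with them. A hypothetical
runtime guard making the same requirement explicit (not part of the
scripts themselves):

    import tensorflow as tf

    # Illustrative check: parse "major.minor" out of the version string.
    major, minor = (int(p) for p in tf.__version__.split(".")[:2])
    assert (major, minor) >= (1, 14), (
        "the tf.compat.v1 APIs used by the training scripts require "
        "TensorFlow >= 1.14")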
---
 tf/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tf/requirements.txt b/tf/requirements.txt
index e395c72f..4184551a 100644
--- a/tf/requirements.txt
+++ b/tf/requirements.txt
@@ -1,3 +1,3 @@
 numpy==1.13.3
-tensorflow==1.12.2
+tensorflow==1.14.0
 tensorflow-tensorboard==0.4.0rc2

From 7ad15ced37ce7ca9a22efb3bf647dfb6c81c85c4 Mon Sep 17 00:00:00 2001
From: Tilps
Date: Sat, 7 Dec 2019 12:13:28 +1100
Subject: [PATCH 03/39] Minimal changes to get training running on 2.0

---
 tf/tfprocess.py | 4 +++-
 tf/train.py     | 3 ++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/tf/tfprocess.py b/tf/tfprocess.py
index c8268050..3af5f9c0 100644
--- a/tf/tfprocess.py
+++ b/tf/tfprocess.py
@@ -132,7 +132,9 @@ def __init__(self, cfg):
                                      allow_growth=True,
                                      visible_device_list="{}".format(self.cfg['gpu']))
         config = tf.compat.v1.ConfigProto(gpu_options=gpu_options)
         self.session = tf.compat.v1.Session(config=config)
-
+        gpus = tf.config.experimental.list_physical_devices('GPU')
+        tf.config.experimental.set_visible_devices(gpus[self.cfg['gpu']], 'GPU')
+
         self.training = tf.compat.v1.placeholder(tf.bool)
         self.global_step = tf.Variable(0, name='global_step', trainable=False)
         self.learning_rate = tf.compat.v1.placeholder(tf.float32)
diff --git a/tf/train.py b/tf/train.py
index 9384fb66..f632e52d 100755
--- a/tf/train.py
+++ b/tf/train.py
@@ -25,6 +25,7 @@
 import random
 import multiprocessing as mp
 import tensorflow as tf
+tf.compat.v1.disable_v2_behavior()
 from tfprocess import TFProcess
 from chunkparser import ChunkParser
@@ -116,6 +117,7 @@ def main(cmd):
     root_dir = os.path.join(cfg['training']['path'], cfg['name'])
     if not os.path.exists(root_dir):
         os.makedirs(root_dir)
+    tfprocess = TFProcess(cfg)
     train_parser = ChunkParser(FileDataSrc(train_chunks),
                                shuffle_size=shuffle_size, sample=SKIP,
                                batch_size=ChunkParser.BATCH_SIZE)
@@ -134,7 +136,6 @@ def main(cmd):
     dataset = dataset.prefetch(4)
     test_iterator = tf.compat.v1.data.make_one_shot_iterator(dataset)

-    tfprocess = TFProcess(cfg)
     tfprocess.init(dataset, train_iterator, test_iterator)

     if os.path.exists(os.path.join(root_dir, 'checkpoint')):

From d170e2048a7214aca71f6a6dccdabff9053113bd Mon Sep 17 00:00:00 2001
From: Tilps
Date: Sat, 7 Dec 2019 12:14:53 +1100
Subject: [PATCH 04/39] Update requirements.

---
 tf/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tf/requirements.txt b/tf/requirements.txt
index 4184551a..2be066a3 100644
--- a/tf/requirements.txt
+++ b/tf/requirements.txt
@@ -1,3 +1,3 @@
 numpy==1.13.3
-tensorflow==1.14.0
+tensorflow==2.0.0
 tensorflow-tensorboard==0.4.0rc2

From 3fcbe2b5f56fc0518cf74e8f5f27d5f68a4a22b0 Mon Sep 17 00:00:00 2001
From: Tilps
Date: Sat, 7 Dec 2019 13:34:11 +1100
Subject: [PATCH 05/39] Remove uses of get_variable since its initializer
 doesn't seem to be called anymore...

---
 tf/tfprocess.py | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/tf/tfprocess.py b/tf/tfprocess.py
index 3af5f9c0..5bb2e687 100644
--- a/tf/tfprocess.py
+++ b/tf/tfprocess.py
@@ -43,9 +43,7 @@ def weight_variable(shape, name=None, dtype=tf.float32):
     stddev = trunc_correction * np.sqrt(2.0 / (fan_in + fan_out))
     # Do not use a constant as the initializer, that will cause the
     # variable to be stored in wrong dtype.
-    weights = tf.compat.v1.get_variable(
-        name, shape, dtype=dtype,
-        initializer=tf.compat.v1.truncated_normal_initializer(stddev=stddev))
+    weights = tf.Variable(tf.compat.v1.truncated_normal_initializer(stddev=stddev)(shape, dtype), name=name)
     tf.compat.v1.add_to_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES, weights)
     return weights
@@ -55,8 +53,7 @@ def weight_variable(shape, name=None, dtype=tf.float32):


 def bias_variable(shape, name=None, dtype=tf.float32):
-    return tf.compat.v1.get_variable(name, shape, dtype=dtype,
-                                     initializer=tf.compat.v1.zeros_initializer())
+    return tf.Variable(tf.compat.v1.zeros_initializer()(shape, dtype), name=name)

 def conv2d(x, W):
     return tf.nn.conv2d(input=x, filters=W, data_format='NCHW',
@@ -836,10 +833,7 @@ def construct_net(self, planes):
             h_conv_pol_flat = tf.reshape(conv_pol2, [-1, 80*8*8])
             fc1_init = tf.constant(lc0_az_policy_map.make_map(),
                                    dtype=self.model_dtype)
-            W_fc1 = tf.compat.v1.get_variable("policy_map",
-                                    initializer=fc1_init,
-                                    trainable=False,
-                                    dtype=self.model_dtype)
+            W_fc1 = tf.Variable(fcl_init, trainable=False, name="policy_map")

             h_fc1 = tf.matmul(h_conv_pol_flat, W_fc1, name='policy_head')
         elif self.POLICY_HEAD == pb.NetworkFormat.POLICY_CLASSICAL:

From 162fa408f94c0d4d25fab57ac3e11c649db3cda9 Mon Sep 17 00:00:00 2001
From: Tilps
Date: Sat, 7 Dec 2019 14:02:45 +1100
Subject: [PATCH 06/39] Fix typo.

---
 tf/tfprocess.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tf/tfprocess.py b/tf/tfprocess.py
index 5bb2e687..defeedc7 100644
--- a/tf/tfprocess.py
+++ b/tf/tfprocess.py
@@ -833,7 +833,7 @@ def construct_net(self, planes):
             h_conv_pol_flat = tf.reshape(conv_pol2, [-1, 80*8*8])
             fc1_init = tf.constant(lc0_az_policy_map.make_map(),
                                    dtype=self.model_dtype)
-            W_fc1 = tf.Variable(fcl_init, trainable=False, name="policy_map")
+            W_fc1 = tf.Variable(fc1_init, trainable=False, name="policy_map")

             h_fc1 = tf.matmul(h_conv_pol_flat, W_fc1, name='policy_head')
         elif self.POLICY_HEAD == pb.NetworkFormat.POLICY_CLASSICAL:

From ff2e2565bcf75823139080a593e040bdc330249d Mon Sep 17 00:00:00 2001
From: Tilps
Date: Sun, 8 Dec 2019 12:11:51 +1100
Subject: [PATCH 07/39] Migration in progress some more.

At this point 'basic' training seems to be functional, maybe.
No test data processing, no grad norm clipping, no swa, no metrics, no
saving to pb.gz - lots still to do.

---
 tf/tfprocess.py | 349 ++++++++++++++++++++++++++++++++++++++++++++----
 tf/train.py     |  43 +++---
 2 files changed, 349 insertions(+), 43 deletions(-)

diff --git a/tf/tfprocess.py b/tf/tfprocess.py
index defeedc7..f26b5c4b 100644
--- a/tf/tfprocess.py
+++ b/tf/tfprocess.py
@@ -28,29 +28,34 @@
 from net import Net

-
-def weight_variable(shape, name=None, dtype=tf.float32):
-    """Xavier initialization"""
-    if len(shape) == 4:
-        receptive_field = shape[0] * shape[1]
-        fan_in = shape[2] * receptive_field
-        fan_out = shape[3] * receptive_field
-    else:
-        fan_in = shape[0]
-        fan_out = shape[1]
-    # truncated normal has lower stddev than a regular normal distribution, so need to correct for that
-    trunc_correction = np.sqrt(1.3)
-    stddev = trunc_correction * np.sqrt(2.0 / (fan_in + fan_out))
-    # Do not use a constant as the initializer, that will cause the
-    # variable to be stored in wrong dtype.
-    weights = tf.Variable(tf.compat.v1.truncated_normal_initializer(stddev=stddev)(shape, dtype), name=name)
-    tf.compat.v1.add_to_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES, weights)
-    return weights
-
 # Bias weights for layers not followed by BatchNorm
 # We do not regularize biases, so they are not
 # added to the regularizer collection
+class ApplySqueezeExcitation(tf.keras.layers.Layer):
+    def __init__(self, **kwargs):
+        super(ApplySqueezeExcitation, self).__init__(**kwargs)
+
+    def build(self, input_dimens):
+        self.reshape_size = input_dimens[1][1]
+
+    def call(self, inputs):
+        x = inputs[0]
+        excited = inputs[1]
+        gammas, betas = tf.split(tf.reshape(excited, [-1, self.reshape_size, 1, 1]), 2, axis=1)
+        return tf.nn.sigmoid(gammas) * x + betas
+
+
+class ApplyPolicyMap(tf.keras.layers.Layer):
+    def __init__(self, **kwargs):
+        super(ApplyPolicyMap, self).__init__(**kwargs)
+        fc1_init = tf.constant(lc0_az_policy_map.make_map())
+        self.fc1 = tf.Variable(fc1_init, trainable=False)
+
+    def call(self, inputs):
+        h_conv_pol_flat = tf.reshape(inputs, [-1, 80*8*8])
+        return tf.matmul(h_conv_pol_flat, self.fc1)

 def bias_variable(shape, name=None, dtype=tf.float32):
     return tf.Variable(tf.compat.v1.zeros_initializer()(shape, dtype), name=name)
@@ -132,11 +137,22 @@ def __init__(self, cfg):
         gpus = tf.config.experimental.list_physical_devices('GPU')
         tf.config.experimental.set_visible_devices(gpus[self.cfg['gpu']], 'GPU')

-        self.training = tf.compat.v1.placeholder(tf.bool)
         self.global_step = tf.Variable(0, name='global_step', trainable=False)
-        self.learning_rate = tf.compat.v1.placeholder(tf.float32)
+
+    def init_v2(self, train_dataset, test_dataset):
+        self.l2reg = tf.keras.regularizers.l2(l=0.5 * (0.0001))
+        self.train_dataset = train_dataset
+        self.train_iter = iter(train_dataset)
+        self.test_dataset = test_dataset
+        self.test_iter = iter(test_dataset)
+        self.init_net_v2()
+        self.checkpoint = tf.train.Checkpoint(optimizer=self.optimizer, model=self.model, global_step=self.global_step)
+        self.manager = tf.train.CheckpointManager(
+            self.checkpoint, directory=self.root_dir, max_to_keep=50, keep_checkpoint_every_n_hours=24)

     def init(self, dataset, train_iterator, test_iterator):
+        self.training = tf.compat.v1.placeholder(tf.bool)
+        self.learning_rate = tf.compat.v1.placeholder(tf.float32)
         # TF variables
         self.handle = tf.compat.v1.placeholder(tf.string, shape=[])
         iterator = tf.compat.v1.data.Iterator.from_string_handle(
@@ -144,11 +160,82 @@ def init(self, dataset, train_iterator, test_iterator):
         self.next_batch = iterator.get_next()
         self.train_handle = self.session.run(train_iterator.string_handle())
         self.test_handle = self.session.run(test_iterator.string_handle())
+        self.l2reg = tf.keras.regularizers.l2(l=0.5 * (0.0001))
+
         # This forces trainable variables to be stored as fp32
         with tf.compat.v1.variable_scope("fp32_storage",
                          custom_getter=float32_variable_storage_getter):
             self.init_net(self.next_batch)

+    def init_net_v2(self):
+        input_var = tf.keras.Input(shape=(112, 8*8))
+        x_planes = tf.keras.layers.Reshape([112, 8, 8])(input_var)
+        self.model = tf.keras.Model(inputs=input_var, outputs=self.construct_net_v2(x_planes))
+        self.active_lr = 0.01
+        # TODO set up optimizers and loss functions.
+ self.optimizer = tf.keras.optimizers.SGD(learning_rate=lambda: self.active_lr, momentum=0.9, nesterov=True) + def policy_loss(target, output): + # Calculate loss on policy head + if self.cfg['training'].get('mask_legal_moves'): + # extract mask for legal moves from target policy + move_is_legal = tf.greater_equal(target, 0) + # replace logits of illegal moves with large negative value (so that it doesn't affect policy of legal moves) without gradient + illegal_filler = tf.zeros_like(output) - 1.0e10 + output = tf.where(move_is_legal, output, illegal_filler) + # y_ still has -1 on illegal moves, flush them to 0 + target = tf.nn.relu(target) + + policy_cross_entropy = \ + tf.nn.softmax_cross_entropy_with_logits(labels=tf.stop_gradient(target), + logits=output) + return tf.reduce_mean(input_tensor=policy_cross_entropy) + self.policy_loss_fn = policy_loss + + + q_ratio = self.cfg['training'].get('q_ratio', 0) + assert 0 <= q_ratio <= 1 + + # Linear conversion to scalar to compute MSE with, for comparison to old values + wdl = tf.expand_dims(tf.constant([1.0, 0.0, -1.0]), 1) + + self.qMix = lambda z, q: q * q_ratio + z *(1 - q_ratio) + # Loss on value head + if self.wdl: + def value_loss(target, output): + value_cross_entropy = \ + tf.nn.softmax_cross_entropy_with_logits(labels=tf.stop_gradient(target), + logits=output) + return tf.reduce_mean(input_tensor=value_cross_entropy) + self.value_loss_fn = value_loss + def mse_loss(target, output): + scalar_z_conv = tf.matmul(tf.nn.softmax(output), wdl) + scalar_target = tf.matmul(target, wdl) + return tf.reduce_mean(input_tensor=tf.math.squared_difference(scalar_target, scalar_z_conv)) + self.mse_loss_fn = mse_loss + else: + def value_loss(target, output): + return tf.constant(0) + self.value_loss_fn = value_loss + def mse_loss(target, output): + scalar_target = tf.matmul(target, wdl) + return tf.reduce_mean(input_tensor=tf.math.squared_difference(scalar_target, output)) + self.mse_loss_fn = mse_loss + + pol_loss_w = self.cfg['training']['policy_loss_weight'] + val_loss_w = self.cfg['training']['value_loss_weight'] + self.lossMix = lambda policy, value: pol_loss_w * policy + val_loss_w * value + + self.avg_policy_loss = [] + self.avg_value_loss = [] + self.avg_mse_loss = [] + self.avg_reg_term = [] + self.time_start = None + self.last_steps = None + # Set adaptive learning rate during training + self.cfg['training']['lr_boundaries'].sort() + self.warmup_steps = self.cfg['training'].get('warmup_steps', 0) + self.lr = self.cfg['training']['lr_values'][0] + def init_net(self, next_batch): self.x = next_batch[0] # tf.placeholder(tf.float32, [None, 112, 8*8]) self.y_ = next_batch[1] # tf.placeholder(tf.float32, [None, 1858]) @@ -199,9 +286,8 @@ def init_net(self, next_batch): tf.reduce_mean(input_tensor=tf.math.squared_difference(scalar_target, self.z_conv)) # Regularizer - regularizer = tf.keras.regularizers.l2(l=0.5 * (0.0001)) reg_variables = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES) - penalties = [regularizer(w) for w in reg_variables] + penalties = [self.l2reg(w) for w in reg_variables] self.reg_term = tf.math.add_n(penalties) if self.model_dtype != tf.float32: @@ -343,10 +429,24 @@ def replace_weights(self, new_weights): # This should result in identical file to the starting one # self.save_leelaz_weights('restored.txt') + def restore_v2(self): + if self.manager.latest_checkpoint is not None: + print("Restoring from {0}".format(self.manager.latest_checkpoint)) + 
self.checkpoint.restore(self.manager.latest_checkpoint) + + def restore(self, file): print("Restoring from {0}".format(file)) self.saver.restore(self.session, file) + def process_loop_v2(self, batch_size, test_batches, batch_splits=1): + # Get the initial steps value in case this is a resume from a step count + # which is not a multiple of total_steps. + steps = self.global_step.read_value() + total_steps = self.cfg['training']['total_steps'] + for _ in range(steps % total_steps, total_steps): + self.process_v2(batch_size, test_batches, batch_splits=batch_splits) + def process_loop(self, batch_size, test_batches, batch_splits=1): # Get the initial steps value in case this is a resume from a step count # which is not a multiple of total_steps. @@ -355,6 +455,145 @@ def process_loop(self, batch_size, test_batches, batch_splits=1): for _ in range(steps % total_steps, total_steps): self.process(batch_size, test_batches, batch_splits=batch_splits) + def process_v2(self, batch_size, test_batches, batch_splits=1): + if not self.time_start: + self.time_start = time.time() + + # Get the initial steps value before we do a training step. + steps = self.global_step.read_value() + if not self.last_steps: + self.last_steps = steps + + if self.swa_enabled: + # split half of test_batches between testing regular weights and SWA weights + test_batches //= 2 + + # Run test before first step to see delta since end of last run. + #if steps % self.cfg['training']['total_steps'] == 0: + # Steps is given as one higher than current in order to avoid it + # being equal to the value the end of a run is stored against. + # self.calculate_test_summaries_v2(test_batches, steps + 1) + # if self.swa_enabled: + # self.calculate_swa_summaries_v2(test_batches, steps + 1) + + # Make sure that ghost batch norm can be applied + if batch_size % 64 != 0: + # Adjust required batch size for batch splitting. 
+ required_factor = 64 * \ + self.cfg['training'].get('num_batch_splits', 1) + raise ValueError( + 'batch_size must be a multiple of {}'.format(required_factor)) + + # Determine learning rate + lr_values = self.cfg['training']['lr_values'] + lr_boundaries = self.cfg['training']['lr_boundaries'] + steps_total = steps % self.cfg['training']['total_steps'] + self.lr = lr_values[bisect.bisect_right(lr_boundaries, steps_total)] + if self.warmup_steps > 0 and steps < self.warmup_steps: + self.lr = self.lr * tf.cast(steps + 1, tf.float32) / self.warmup_steps + + # need to add 1 to steps because steps will be incremented after gradient update + #if (steps + 1) % self.cfg['training']['train_avg_report_steps'] == 0 or (steps + 1) % self.cfg['training']['total_steps'] == 0: + # before_weights = self.session.run(self.weights) + + # Run training for this batch + grads = None + for _ in range(batch_splits): + x, y, z, q = next(self.train_iter) + with tf.GradientTape() as tape: + policy, value = self.model(x) + policy_loss = self.policy_loss_fn(y, policy) + reg_term = sum(self.model.losses) + if self.wdl: + value_loss = self.value_loss_fn(self.qMix(z, q), value) + total_loss = self.lossMix(policy_loss, value_loss) + reg_term + else: + mse_loss = self.mse_loss_fn(self.qMix(z, q), value) + total_loss = self.lossMix(policy_loss, mse_loss) + reg_term + if self.wdl: + mse_loss = self.mse_loss_fn(self.qMix(z, q), value) + if not grads: + grads = tape.gradient(total_loss, self.model.trainable_weights) + else: + grads += tape.gradient(total_loss, self.model.trainable_weights) + # Keep running averages + # Google's paper scales MSE by 1/4 to a [0, 1] range, so do the same to + # get comparable values. + mse_loss /= 4.0 + self.avg_policy_loss.append(policy_loss) + if self.wdl: + self.avg_value_loss.append(value_loss) + self.avg_mse_loss.append(mse_loss) + self.avg_reg_term.append(reg_term) + # Gradients of batch splits are summed, not averaged like usual, so need to scale lr accordingly to correct for this. + self.active_lr = self.lr / batch_splits + self.optimizer.apply_gradients(zip(grads, self.model.trainable_weights)) + #grad_norm = compute_norm(self.model.trainable_weights) + + # Update steps. 
self.global_step.assign_add(1)
+        steps = self.global_step.read_value()
+
+        if steps % self.cfg['training']['train_avg_report_steps'] == 0 or steps % self.cfg['training']['total_steps'] == 0:
+            pol_loss_w = self.cfg['training']['policy_loss_weight']
+            val_loss_w = self.cfg['training']['value_loss_weight']
+            time_end = time.time()
+            speed = 0
+            if self.time_start:
+                elapsed = time_end - self.time_start
+                steps_elapsed = steps - self.last_steps
+                speed = batch_size * (tf.cast(steps_elapsed, tf.float32) / elapsed)
+            avg_policy_loss = np.mean(self.avg_policy_loss or [0])
+            avg_value_loss = np.mean(self.avg_value_loss or [0])
+            avg_mse_loss = np.mean(self.avg_mse_loss or [0])
+            avg_reg_term = np.mean(self.avg_reg_term or [0])
+            print("step {}, lr={:g} policy={:g} value={:g} mse={:g} reg={:g} total={:g} ({:g} pos/s)".format(
+                steps, self.lr, avg_policy_loss, avg_value_loss, avg_mse_loss, avg_reg_term,
+                pol_loss_w * avg_policy_loss + val_loss_w * avg_value_loss + avg_reg_term,
+                speed))
+
+            #after_weights = self.session.run(self.weights)
+            #update_ratio_summaries = self.compute_update_ratio(
+            #    before_weights, after_weights)
+
+            #train_summaries = tf.compat.v1.Summary(value=[
+            #    tf.compat.v1.Summary.Value(tag="Policy Loss", simple_value=avg_policy_loss),
+            #    tf.compat.v1.Summary.Value(tag="Value Loss", simple_value=avg_value_loss),
+            #    tf.compat.v1.Summary.Value(tag="Reg term", simple_value=avg_reg_term),
+            #    tf.compat.v1.Summary.Value(tag="LR", simple_value=self.lr),
+            #    tf.compat.v1.Summary.Value(tag="Gradient norm",
+            #                     simple_value=grad_norm / batch_splits),
+            #    tf.compat.v1.Summary.Value(tag="MSE Loss", simple_value=avg_mse_loss)])
+            #self.train_writer.add_summary(train_summaries, steps)
+            #self.train_writer.add_summary(update_ratio_summaries, steps)
+            self.time_start = time_end
+            self.last_steps = steps
+            self.avg_policy_loss, self.avg_value_loss, self.avg_mse_loss, self.avg_reg_term = [], [], [], []
+
+        #if self.swa_enabled and steps % self.cfg['training']['swa_steps'] == 0:
+        #    self.update_swa_v2()
+
+        # Calculate test values every 'test_steps', but also ensure there is
+        # one at the final step so the delta to the first step can be calculated.
+        #if steps % self.cfg['training']['test_steps'] == 0 or steps % self.cfg['training']['total_steps'] == 0:
+        #    self.calculate_test_summaries_v2(test_batches, steps)
+        #    if self.swa_enabled:
+        #        self.calculate_swa_summaries_v2(test_batches, steps)
+
+        # Save session and weights at end, and also optionally every 'checkpoint_steps'.
+ if steps % self.cfg['training']['total_steps'] == 0 or ( + 'checkpoint_steps' in self.cfg['training'] and steps % self.cfg['training']['checkpoint_steps'] == 0): + self.manager.save() + #print("Model saved in file: {}".format(save_path)) + #leela_path = path + "-" + str(steps) + #swa_path = path + "-swa-" + str(steps) + self.net.pb.training_params.training_steps = steps + #self.save_leelaz_weights(leela_path) + #print("Weights saved in file: {}".format(leela_path)) + #if self.swa_enabled: + # self.save_swa_weights(swa_path) + # print("SWA Weights saved in file: {}".format(swa_path)) + def process(self, batch_size, test_batches, batch_splits=1): if not self.time_start: self.time_start = time.time() @@ -681,6 +920,23 @@ def add_weights(self, var): assert var.dtype.base_dtype == tf.float32 self.weights.append(var) + def batch_norm_v2(self, input, scale=False): + if self.renorm_enabled: + clipping = { + "rmin": 1.0/self.renorm_max_r, + "rmax": self.renorm_max_r, + "dmax": self.renorm_max_d + } + return tf.keras.layers.BatchNormalization( + epsilon=1e-5, axis=1, fused=True, center=True, + scale=scale, renorm=True, renorm_clipping=clipping, + renorm_momentum=self.renorm_momentum)(input) + else: + return tf.keras.layers.BatchNormalization( + epsilon=1e-5, axis=1, fused=False, center=True, + scale=scale, virtual_batch_size=64)(input) + + def batch_norm(self, net, scope, scale=False): # The weights are internal to the batchnorm layer, so apply # a unique scope that we can store, and use to look them back up @@ -720,6 +976,13 @@ def batch_norm(self, net, scope, scale=False): var = tf.compat.v1.get_default_graph().get_tensor_by_name(name) self.add_weights(var) return net + + + def squeeze_excitation_v2(self, inputs, channels): + pooled = tf.keras.layers.GlobalAveragePooling2D()(inputs) + squeezed = tf.keras.layers.Activation('relu')(tf.keras.layers.Dense(channels // self.SE_ratio, kernel_initializer='glorot_normal', kernel_regularizer=self.l2reg)(pooled)) + excited = tf.keras.layers.Dense(2 * channels, kernel_initializer='glorot_normal', kernel_regularizer=self.l2reg)(squeezed) + return ApplySqueezeExcitation()([inputs, excited]) def squeeze_excitation(self, x, channels, ratio): @@ -755,6 +1018,10 @@ def squeeze_excitation(self, x, channels, ratio): return out + def conv_block_v2(self, inputs, filter_size, output_channels, bn_scale=False): + conv = tf.keras.layers.Conv2D(output_channels, filter_size, use_bias=False, padding='same', kernel_initializer='glorot_normal', kernel_regularizer=self.l2reg, data_format='channels_first')(inputs) + return tf.keras.layers.Activation('relu')(self.batch_norm_v2(conv, scale=bn_scale)) + def conv_block(self, inputs, filter_size, input_channels, output_channels, bn_scale=False): # The weights are internal to the batchnorm layer, so apply # a unique scope that we can store, and use to look them back up @@ -771,6 +1038,14 @@ def conv_block(self, inputs, filter_size, input_channels, output_channels, bn_sc return h_conv + def residual_block_v2(self, inputs, channels): + conv1 = tf.keras.layers.Conv2D(channels, 3, use_bias=False, padding='same', kernel_initializer='glorot_normal', kernel_regularizer=self.l2reg, data_format='channels_first')(inputs) + out1 = tf.keras.layers.Activation('relu')(self.batch_norm_v2(conv1, scale=False)) + conv2 = tf.keras.layers.Conv2D(channels, 3, use_bias=False, padding='same', kernel_initializer='glorot_normal', kernel_regularizer=self.l2reg, data_format='channels_first')(out1) + out2 = self.squeeze_excitation_v2(self.batch_norm_v2(conv1, 
scale=True), channels) + return tf.keras.layers.Activation('relu')(tf.keras.layers.add([inputs, out2])) + + def residual_block(self, inputs, channels): # First convnet orig = tf.identity(inputs) @@ -798,6 +1073,34 @@ def residual_block(self, inputs, channels): return h_out_2 + def construct_net_v2(self, inputs): + flow = self.conv_block_v2(inputs, filter_size=3, output_channels=self.RESIDUAL_FILTERS, bn_scale=True) + for _ in range(0, self.RESIDUAL_BLOCKS): + flow = self.residual_block_v2(flow, self.RESIDUAL_FILTERS) + # Policy head + if self.POLICY_HEAD == pb.NetworkFormat.POLICY_CONVOLUTION: + conv_pol = self.conv_block_v2(flow, filter_size=3, output_channels=self.RESIDUAL_FILTERS) + conv_pol2 = tf.keras.layers.Conv2D(80, 3, use_bias=True, padding='same', kernel_initializer='glorot_normal', kernel_regularizer=self.l2reg, bias_regularizer=self.l2reg, data_format='channels_first')(conv_pol) + h_fc1 = ApplyPolicyMap()(conv_pol2) + elif self.POLICY_HEAD == pb.NetworkFormat.POLICY_CLASSICAL: + conv_pol = self.conv_block_v2(flow, filter_size=1, output_channels=self.policy_channels) + h_conv_pol_flat = tf.keras.layers.Flatten()(conv_pol) + h_fc1 = tf.keras.layers.Dense(1858, kernel_initializer='glorot_normal', kernel_regularizer=self.l2reg, bias_regularizer=self.l2reg)(h_conv_pol_flat) + else: + raise ValueError( + "Unknown policy head type {}".format(self.POLICY_HEAD)) + + # Value head + conv_val = self.conv_block_v2(flow, filter_size=1, output_channels=32) + h_conv_val_flat = tf.keras.layers.Flatten()(conv_val) + h_fc2 = tf.keras.layers.Dense(128, kernel_initializer='glorot_normal', kernel_regularizer=self.l2reg, activation='relu')(h_conv_val_flat) + if self.wdl: + h_fc3 = tf.keras.layers.Dense(3, kernel_initializer='glorot_normal', kernel_regularizer=self.l2reg, bias_regularizer=self.l2reg)(h_fc2) + else: + h_fc3 = tf.keras.layers.Dense(1, kernel_initializer='glorot_normal', kernel_regularizer=self.l2reg, activation='tanh')(h_fc2) + return h_fc1, h_fc3 + + def construct_net(self, planes): # NCHW format # batch, 112 input channels, 8 x 8 diff --git a/tf/train.py b/tf/train.py index f632e52d..b9a7db0c 100755 --- a/tf/train.py +++ b/tf/train.py @@ -25,7 +25,7 @@ import random import multiprocessing as mp import tensorflow as tf -tf.compat.v1.disable_v2_behavior() +#tf.compat.v1.disable_v2_behavior() from tfprocess import TFProcess from chunkparser import ChunkParser @@ -121,26 +121,28 @@ def main(cmd): train_parser = ChunkParser(FileDataSrc(train_chunks), shuffle_size=shuffle_size, sample=SKIP, batch_size=ChunkParser.BATCH_SIZE) - dataset = tf.data.Dataset.from_generator( + train_dataset = tf.data.Dataset.from_generator( train_parser.parse, output_types=(tf.string, tf.string, tf.string, tf.string)) - dataset = dataset.map(ChunkParser.parse_function) - dataset = dataset.prefetch(4) - train_iterator = tf.compat.v1.data.make_one_shot_iterator(dataset) + train_dataset = train_dataset.map(ChunkParser.parse_function) + train_dataset = train_dataset.prefetch(4) + #train_iterator = tf.compat.v1.data.make_one_shot_iterator(train_dataset) shuffle_size = int(shuffle_size*(1.0-train_ratio)) test_parser = ChunkParser(FileDataSrc(test_chunks), shuffle_size=shuffle_size, sample=SKIP, batch_size=ChunkParser.BATCH_SIZE) - dataset = tf.data.Dataset.from_generator( + test_dataset = tf.data.Dataset.from_generator( test_parser.parse, output_types=(tf.string, tf.string, tf.string, tf.string)) - dataset = dataset.map(ChunkParser.parse_function) - dataset = dataset.prefetch(4) - test_iterator = 
tf.compat.v1.data.make_one_shot_iterator(dataset) + test_dataset = test_dataset.map(ChunkParser.parse_function) + test_dataset = test_dataset.prefetch(4) + #test_iterator = tf.compat.v1.data.make_one_shot_iterator(test_dataset) - tfprocess.init(dataset, train_iterator, test_iterator) + #tfprocess.init(test_dataset, train_iterator, test_iterator) + tfprocess.init_v2(train_dataset, test_dataset) - if os.path.exists(os.path.join(root_dir, 'checkpoint')): - cp = tf.train.latest_checkpoint(root_dir) - tfprocess.restore(cp) + #if os.path.exists(os.path.join(root_dir, 'checkpoint')): + # cp = tf.train.latest_checkpoint(root_dir) + # tfprocess.restore(cp) + tfprocess.restore_v2() # If number of test positions is not given # sweeps through all test chunks statistically @@ -151,15 +153,16 @@ def main(cmd): num_evals = max(1, num_evals // ChunkParser.BATCH_SIZE) print("Using {} evaluation batches".format(num_evals)) - tfprocess.process_loop(total_batch_size, num_evals, batch_splits=batch_splits) + #tfprocess.process_loop(total_batch_size, num_evals, batch_splits=batch_splits) + tfprocess.process_loop_v2(total_batch_size, num_evals, batch_splits=batch_splits) - if cmd.output is not None: - if cfg['training'].get('swa_output', False): - tfprocess.save_swa_weights(cmd.output) - else: - tfprocess.save_leelaz_weights(cmd.output) + #if cmd.output is not None: + # if cfg['training'].get('swa_output', False): + # tfprocess.save_swa_weights(cmd.output) + # else: + # tfprocess.save_leelaz_weights(cmd.output) - tfprocess.session.close() + #tfprocess.session.close() train_parser.shutdown() test_parser.shutdown() From f202c7884a6e7a56832a61031945e6db922a0d4d Mon Sep 17 00:00:00 2001 From: Tilps Date: Sun, 8 Dec 2019 13:08:32 +1100 Subject: [PATCH 08/39] Add grad norm clipping back in. --- tf/tfprocess.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tf/tfprocess.py b/tf/tfprocess.py index f26b5c4b..8119fa0c 100644 --- a/tf/tfprocess.py +++ b/tf/tfprocess.py @@ -527,8 +527,9 @@ def process_v2(self, batch_size, test_batches, batch_splits=1): self.avg_reg_term.append(reg_term) # Gradients of batch splits are summed, not averaged like usual, so need to scale lr accordingly to correct for this. self.active_lr = self.lr / batch_splits + max_grad_norm = self.cfg['training'].get('max_grad_norm', 10000.0) * batch_splits + grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) self.optimizer.apply_gradients(zip(grads, self.model.trainable_weights)) - #grad_norm = compute_norm(self.model.trainable_weights) # Update steps. self.global_step.assign_add(1) From 1398d2a52ddd242fdb93e7b64e1b85ab45679036 Mon Sep 17 00:00:00 2001 From: Tilps Date: Sun, 8 Dec 2019 21:42:05 +1100 Subject: [PATCH 09/39] Add swa support and test result reporting. 
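
SWA keeps a running average of the model weights alongside the live ones.
With swa_count holding the number of accumulated networks n, each
accumulation performs w_swa <- w_swa * n/(n+1) + w * 1/(n+1), the same
update the v1 graph built with assign ops earlier in this series. A
minimal eager-mode sketch of that update (names mirror the diff below,
weight values are illustrative):

    import tensorflow as tf

    swa_count = tf.Variable(0., trainable=False)
    model_weights = [tf.Variable([1.0, 2.0])]
    swa_weights = [tf.Variable(w, trainable=False) for w in model_weights]

    def update_swa():
        n = swa_count.read_value()
        for swa, w in zip(swa_weights, model_weights):
            # running mean: new = old * n/(n+1) + sample * 1/(n+1)
            swa.assign(swa * (n / (n + 1.)) + w * (1. / (n + 1.)))
        swa_count.assign_add(1.)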
--- tf/tfprocess.py | 106 +++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 95 insertions(+), 11 deletions(-) diff --git a/tf/tfprocess.py b/tf/tfprocess.py index 8119fa0c..dc84be67 100644 --- a/tf/tfprocess.py +++ b/tf/tfprocess.py @@ -146,7 +146,8 @@ def init_v2(self, train_dataset, test_dataset): self.test_dataset = test_dataset self.test_iter = iter(test_dataset) self.init_net_v2() - self.checkpoint = tf.train.Checkpoint(optimizer=self.optimizer, model=self.model, global_step=self.global_step) + self.checkpoint = tf.train.Checkpoint(optimizer=self.optimizer, model=self.model, global_step=self.global_step, swa_count=self.swa_count) + self.checkpoint.listed = self.swa_weights self.manager = tf.train.CheckpointManager( self.checkpoint, directory=self.root_dir, max_to_keep=50, keep_checkpoint_every_n_hours=24) @@ -171,6 +172,13 @@ def init_net_v2(self): input_var = tf.keras.Input(shape=(112, 8*8)) x_planes = tf.keras.layers.Reshape([112, 8, 8])(input_var) self.model = tf.keras.Model(inputs=input_var, outputs=self.construct_net_v2(x_planes)) + self.swa_count = None + self.swa_weights = None + if self.swa_enabled: + # Count of networks accumulated into SWA + self.swa_count = tf.Variable(0., name='swa_count', trainable=False) + self.swa_weights = [tf.Variable(w, trainable=False) for w in self.model.weights] + self.active_lr = 0.01 # TODO set up optimizers and loss functions. self.optimizer = tf.keras.optimizers.SGD(learning_rate=lambda: self.active_lr, momentum=0.9, nesterov=True) @@ -469,12 +477,12 @@ def process_v2(self, batch_size, test_batches, batch_splits=1): test_batches //= 2 # Run test before first step to see delta since end of last run. - #if steps % self.cfg['training']['total_steps'] == 0: + if steps % self.cfg['training']['total_steps'] == 0: # Steps is given as one higher than current in order to avoid it # being equal to the value the end of a run is stored against. - # self.calculate_test_summaries_v2(test_batches, steps + 1) - # if self.swa_enabled: - # self.calculate_swa_summaries_v2(test_batches, steps + 1) + self.calculate_test_summaries_v2(test_batches, steps + 1) + if self.swa_enabled: + self.calculate_swa_summaries_v2(test_batches, steps + 1) # Make sure that ghost batch norm can be applied if batch_size % 64 != 0: @@ -512,6 +520,8 @@ def process_v2(self, batch_size, test_batches, batch_splits=1): total_loss = self.lossMix(policy_loss, mse_loss) + reg_term if self.wdl: mse_loss = self.mse_loss_fn(self.qMix(z, q), value) + else: + value_loss = self.value_loss_fn(self.qMix(z, q), value) if not grads: grads = tape.gradient(total_loss, self.model.trainable_weights) else: @@ -571,15 +581,15 @@ def process_v2(self, batch_size, test_batches, batch_splits=1): self.last_steps = steps self.avg_policy_loss, self.avg_value_loss, self.avg_mse_loss, self.avg_reg_term = [], [], [], [] - #if self.swa_enabled and steps % self.cfg['training']['swa_steps'] == 0: - # self.update_swa_v2() + if self.swa_enabled and steps % self.cfg['training']['swa_steps'] == 0: + self.update_swa_v2() # Calculate test values every 'test_steps', but also ensure there is # one at the final step so the delta to the first step can be calculted. 
- #if steps % self.cfg['training']['test_steps'] == 0 or steps % self.cfg['training']['total_steps'] == 0: - # self.calculate_test_summaries_v2(test_batches, steps) - # if self.swa_enabled: - # self.calculate_swa_summaries_v2(test_batches, steps) + if steps % self.cfg['training']['test_steps'] == 0 or steps % self.cfg['training']['total_steps'] == 0: + self.calculate_test_summaries_v2(test_batches, steps) + if self.swa_enabled: + self.calculate_swa_summaries_v2(test_batches, steps) # Save session and weights at end, and also optionally every 'checkpoint_steps'. if steps % self.cfg['training']['total_steps'] == 0 or ( @@ -721,6 +731,17 @@ def process(self, batch_size, test_batches, batch_splits=1): self.save_swa_weights(swa_path) print("SWA Weights saved in file: {}".format(swa_path)) + def calculate_swa_summaries_v2(self, test_batches, steps): + backup = [w.read_value() for w in self.model.weights] + for (swa, w) in zip(self.swa_weights, self.model.weights): + w.assign(swa.read_value()) + #true_test_writer, self.test_writer = self.test_writer, self.swa_writer + print('swa', end=' ') + self.calculate_test_summaries_v2(test_batches, steps) + #self.test_writer = true_test_writer + for (old, w) in zip(backup, self.model.weights): + w.assign(old) + def calculate_swa_summaries(self, test_batches, steps): self.snap_save() self.session.run(self.swa_load_op) @@ -730,6 +751,63 @@ def calculate_swa_summaries(self, test_batches, steps): self.test_writer = true_test_writer self.snap_restore() + def calculate_test_summaries_v2(self, test_batches, steps): + sum_policy_accuracy = 0 + sum_value_accuracy = 0 + sum_mse = 0 + sum_policy = 0 + sum_value = 0 + for _ in range(0, test_batches): + x, y, z, q = next(self.test_iter) + policy, value = self.model(x) + policy_loss = self.policy_loss_fn(y, policy) + reg_term = sum(self.model.losses) + if self.wdl: + value_loss = self.value_loss_fn(self.qMix(z, q), value) + mse_loss = self.mse_loss_fn(self.qMix(z, q), value) + total_loss = self.lossMix(policy_loss, value_loss) + reg_term + else: + value_loss = self.value_loss_fn(self.qMix(z, q), value) + mse_loss = self.mse_loss_fn(self.qMix(z, q), value) + total_loss = self.lossMix(policy_loss, mse_loss) + reg_term + #sum_policy_accuracy += test_policy_accuracy + sum_mse += mse_loss + sum_policy += policy_loss + if self.wdl: + #sum_value_accuracy += test_value_accuracy + sum_value += value_loss + #sum_policy_accuracy /= test_batches + #sum_policy_accuracy *= 100 + sum_policy /= test_batches + sum_value /= test_batches + #if self.wdl: + #sum_value_accuracy /= test_batches + #sum_value_accuracy *= 100 + # Additionally rescale to [0, 1] so divide by 4 + sum_mse /= (4.0 * test_batches) + self.net.pb.training_params.learning_rate = self.lr + self.net.pb.training_params.mse_loss = sum_mse + self.net.pb.training_params.policy_loss = sum_policy + # TODO store value and value accuracy in pb + #self.net.pb.training_params.accuracy = sum_policy_accuracy + #if self.wdl: + # test_summaries = tf.compat.v1.Summary(value=[ + # tf.compat.v1.Summary.Value(tag="Policy Accuracy", simple_value=sum_policy_accuracy), + # tf.compat.v1.Summary.Value(tag="Value Accuracy", simple_value=sum_value_accuracy), + # tf.compat.v1.Summary.Value(tag="Policy Loss", simple_value=sum_policy), + # tf.compat.v1.Summary.Value(tag="Value Loss", simple_value=sum_value), + # tf.compat.v1.Summary.Value(tag="MSE Loss", simple_value=sum_mse)]).SerializeToString() + #else: + # test_summaries = tf.compat.v1.Summary(value=[ + # tf.compat.v1.Summary.Value(tag="Policy 
Accuracy", simple_value=sum_policy_accuracy), + # tf.compat.v1.Summary.Value(tag="Policy Loss", simple_value=sum_policy), + # tf.compat.v1.Summary.Value(tag="MSE Loss", simple_value=sum_mse)]).SerializeToString() + #test_summaries = tf.compat.v1.summary.merge( + # [test_summaries] + self.histograms).eval(session=self.session) + #self.test_writer.add_summary(test_summaries, steps) + print("step {}, policy={:g} value={:g} policy accuracy={:g}% value accuracy={:g}% mse={:g}".\ + format(steps, sum_policy, sum_value, sum_policy_accuracy, sum_value_accuracy, sum_mse)) + def calculate_test_summaries(self, test_batches, steps): sum_policy_accuracy = 0 sum_value_accuracy = 0 @@ -830,6 +908,12 @@ def log_histogram(self, tag, values, bins=1000): return tf.compat.v1.Summary.Value(tag=tag, histo=hist) + def update_swa_v2(self): + num = self.swa_count.read_value() + for (w, swa) in zip(self.model.weights, self.swa_weights): + swa.assign(swa.read_value() * (num / (num + 1.)) + w.read_value() * (1. / (num + 1.))) + self.swa_count.assign(min(num + 1., self.swa_max_n)) + def update_swa(self): # Add the current weight vars to the running average. num = self.session.run(self.swa_accum_op) From 2d97b80bc7ddad3ee9e4aa76aa4b79efc1eef95d Mon Sep 17 00:00:00 2001 From: Tilps Date: Sun, 8 Dec 2019 22:57:56 +1100 Subject: [PATCH 10/39] Actually sum the gradients... --- tf/tfprocess.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tf/tfprocess.py b/tf/tfprocess.py index dc84be67..2e9a59f4 100644 --- a/tf/tfprocess.py +++ b/tf/tfprocess.py @@ -525,7 +525,8 @@ def process_v2(self, batch_size, test_batches, batch_splits=1): if not grads: grads = tape.gradient(total_loss, self.model.trainable_weights) else: - grads += tape.gradient(total_loss, self.model.trainable_weights) + new_grads = tape.gradient(total_loss, self.model.trainable_weights) + grads = [tf.math.add(a, b) for (a, b) in zip(grads, new_grads)] # Keep running averages # Google's paper scales MSE by 1/4 to a [0, 1] range, so do the same to # get comparable values. From cd3c69dab0f7361d9c600e7b31961d511d68340c Mon Sep 17 00:00:00 2001 From: Tilps Date: Sun, 8 Dec 2019 23:31:39 +1100 Subject: [PATCH 11/39] Fix bad bug with residual blocks, also include the stub for the incomplete save code. 
--- tf/tfprocess.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tf/tfprocess.py b/tf/tfprocess.py index 2e9a59f4..13f5a27e 100644 --- a/tf/tfprocess.py +++ b/tf/tfprocess.py @@ -596,11 +596,11 @@ def process_v2(self, batch_size, test_batches, batch_splits=1): if steps % self.cfg['training']['total_steps'] == 0 or ( 'checkpoint_steps' in self.cfg['training'] and steps % self.cfg['training']['checkpoint_steps'] == 0): self.manager.save() - #print("Model saved in file: {}".format(save_path)) - #leela_path = path + "-" + str(steps) + print("Model saved in file: {}".format(self.manager.latest_checkpoint)) + leela_path = self.manager.latest_checkpoint + "-" + str(steps) #swa_path = path + "-swa-" + str(steps) self.net.pb.training_params.training_steps = steps - #self.save_leelaz_weights(leela_path) + self.save_leelaz_weights_v2(leela_path) #print("Weights saved in file: {}".format(leela_path)) #if self.swa_enabled: # self.save_swa_weights(swa_path) @@ -947,6 +947,10 @@ def save_swa_weights(self, filename): self.save_leelaz_weights(filename) self.snap_restore() + def save_leelaz_weights_v2(self, filename): + for w in self.model.weights: + tf.print(w.name) + def save_leelaz_weights(self, filename): all_weights = [] if not hasattr(self, 'pb_save_op'): @@ -1128,7 +1132,7 @@ def residual_block_v2(self, inputs, channels): conv1 = tf.keras.layers.Conv2D(channels, 3, use_bias=False, padding='same', kernel_initializer='glorot_normal', kernel_regularizer=self.l2reg, data_format='channels_first')(inputs) out1 = tf.keras.layers.Activation('relu')(self.batch_norm_v2(conv1, scale=False)) conv2 = tf.keras.layers.Conv2D(channels, 3, use_bias=False, padding='same', kernel_initializer='glorot_normal', kernel_regularizer=self.l2reg, data_format='channels_first')(out1) - out2 = self.squeeze_excitation_v2(self.batch_norm_v2(conv1, scale=True), channels) + out2 = self.squeeze_excitation_v2(self.batch_norm_v2(conv2, scale=True), channels) return tf.keras.layers.Activation('relu')(tf.keras.layers.add([inputs, out2])) From d3f95eb8a20b5fc262d3c3767de172a8bc60c531 Mon Sep 17 00:00:00 2001 From: Tilps Date: Mon, 9 Dec 2019 00:00:03 +1100 Subject: [PATCH 12/39] Constant policy map shouldn't be a variable, or it gets saved. --- tf/tfprocess.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tf/tfprocess.py b/tf/tfprocess.py index 13f5a27e..36a8b268 100644 --- a/tf/tfprocess.py +++ b/tf/tfprocess.py @@ -49,8 +49,7 @@ def call(self, inputs): class ApplyPolicyMap(tf.keras.layers.Layer): def __init__(self, **kwargs): super(ApplyPolicyMap, self).__init__(**kwargs) - fc1_init = tf.constant(lc0_az_policy_map.make_map()) - self.fc1 = tf.Variable(fc1_init, trainable=False) + self.fc1 = tf.constant(lc0_az_policy_map.make_map()) def call(self, inputs): h_conv_pol_flat = tf.reshape(inputs, [-1, 80*8*8]) From c90a9c49fb1aed38f92bf68f69a257d3e554b316 Mon Sep 17 00:00:00 2001 From: Tilps Date: Mon, 9 Dec 2019 20:22:41 +1100 Subject: [PATCH 13/39] Add untested network saving no idea if its output is the right format yet. 
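The core of the exporter added here is a layout conversion: TF stores
convolution kernels as [filter_h, filter_w, in_ch, out_ch] (HWIO) while the
Leela/cuDNN format expects [out, in, h, w] (OIHW), and dense kernels go from
[in, out] to [out, in]. A standalone sketch of that reordering (helper name
invented for illustration):

    import tensorflow as tf

    def to_leela_layout(w):
        if w.shape.ndims == 4:
            # Convolution kernel: HWIO -> OIHW.
            return tf.transpose(w, perm=[3, 2, 0, 1])
        if w.shape.ndims == 2:
            # Dense kernel: [in, out] -> [out, in].
            return tf.transpose(w, perm=[1, 0])
        # Biases and batch-norm vectors keep their shape.
        return w
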
--- tf/tfprocess.py | 82 ++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 77 insertions(+), 5 deletions(-) diff --git a/tf/tfprocess.py b/tf/tfprocess.py index 36a8b268..dca5a453 100644 --- a/tf/tfprocess.py +++ b/tf/tfprocess.py @@ -441,7 +441,6 @@ def restore_v2(self): print("Restoring from {0}".format(self.manager.latest_checkpoint)) self.checkpoint.restore(self.manager.latest_checkpoint) - def restore(self, file): print("Restoring from {0}".format(file)) self.saver.restore(self.session, file) @@ -596,9 +595,10 @@ def process_v2(self, batch_size, test_batches, batch_splits=1): 'checkpoint_steps' in self.cfg['training'] and steps % self.cfg['training']['checkpoint_steps'] == 0): self.manager.save() print("Model saved in file: {}".format(self.manager.latest_checkpoint)) - leela_path = self.manager.latest_checkpoint + "-" + str(steps) + evaled_steps = steps.numpy() + leela_path = self.manager.latest_checkpoint + "-" + str(evaled_steps) #swa_path = path + "-swa-" + str(steps) - self.net.pb.training_params.training_steps = steps + self.net.pb.training_params.training_steps = evaled_steps self.save_leelaz_weights_v2(leela_path) #print("Weights saved in file: {}".format(leela_path)) #if self.swa_enabled: @@ -940,6 +940,14 @@ def snap_restore(self): # Restore variables in the current graph from the snapshot. self.session.run(self.snap_restore_op) + def save_swa_weights_v2(self, filename): + backup = [w.read_value() for w in self.model.weights] + for (swa, w) in zip(self.swa_weights, self.model.weights): + w.assign(swa.read_value()) + self.save_leelaz_weights_v2(self, filename) + for (old, w) in zip(backup, self.model.weights): + w.assign(old) + def save_swa_weights(self, filename): self.snap_save() self.session.run(self.swa_load_op) @@ -947,8 +955,72 @@ def save_swa_weights(self, filename): self.snap_restore() def save_leelaz_weights_v2(self, filename): - for w in self.model.weights: - tf.print(w.name) + all_tensors = [] + all_weights = [] + last_was_gamma = False + for weights in self.model.weights: + work_weights = None + if weights.shape.ndims == 4: + # Convolution weights need a transpose + # + # TF (kYXInputOutput) + # [filter_height, filter_width, in_channels, out_channels] + # + # Leela/cuDNN/Caffe (kOutputInputYX) + # [output, input, filter_size, filter_size] + work_weights = tf.transpose(a=weights, perm=[3, 2, 0, 1]) + elif weights.shape.ndims == 2: + # Fully connected layers are [in, out] in TF + # + # [out, in] in Leela + # + work_weights = tf.transpose(a=weights, perm=[1, 0]) + else: + # Biases, batchnorm etc + # pb expects every batch norm to have gammas, but not all of our + # batch norms have gammas, so manually add pretend gammas. + if 'beta:' in weights.name and not last_was_gamma: + all_tensors.append(tf.ones_like(weights)) + work_weights = weights.read_value() + all_tensors.append(work_weights) + last_was_gamma = 'gamma:' in weights.name + + # HACK: model weights ordering is some kind of breadth first traversal, + # but pb expects a specific ordering which BFT is not a match for once + # we get to the heads. Apply manual permutation. + # This is fragile and at minimum should have some checks to ensure it isn't breaking things. + #TODO: also support classic policy head as it has a different set of layers and hence changes the permutation. 
+ permuted_tensors = [w for w in all_tensors] + permuted_tensors[-5] = all_tensors[-10] + permuted_tensors[-6] = all_tensors[-11] + permuted_tensors[-7] = all_tensors[-12] + permuted_tensors[-8] = all_tensors[-14] + permuted_tensors[-9] = all_tensors[-5] + permuted_tensors[-10] = all_tensors[-6] + permuted_tensors[-11] = all_tensors[-7] + permuted_tensors[-12] = all_tensors[-8] + permuted_tensors[-13] = all_tensors[-9] + permuted_tensors[-14] = all_tensors[-13] + all_tensors = permuted_tensors + + for e, nparray in enumerate(all_tensors): + # Rescale rule50 related weights as clients do not normalize the input. + if e == 0: + num_inputs = 112 + # 50 move rule is the 110th input, or 109 starting from 0. + rule50_input = 109 + wt_flt = [] + for i, weight in enumerate(np.ravel(nparray)): + if (i % (num_inputs*9))//9 == rule50_input: + wt_flt.append(weight/99) + else: + wt_flt.append(weight) + else: + wt_flt = [wt for wt in np.ravel(nparray)] + all_weights.append(wt_flt) + + self.net.fill_net(all_weights) + self.net.save_proto(filename) def save_leelaz_weights(self, filename): all_weights = [] From 61158e22459be00646a200d5a4402924a925a823 Mon Sep 17 00:00:00 2001 From: Tilps Date: Mon, 9 Dec 2019 21:37:24 +1100 Subject: [PATCH 14/39] Fix permutation which didn't take into account the inserted gammas. --- tf/tfprocess.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/tf/tfprocess.py b/tf/tfprocess.py index dca5a453..633080e7 100644 --- a/tf/tfprocess.py +++ b/tf/tfprocess.py @@ -597,7 +597,7 @@ def process_v2(self, batch_size, test_batches, batch_splits=1): print("Model saved in file: {}".format(self.manager.latest_checkpoint)) evaled_steps = steps.numpy() leela_path = self.manager.latest_checkpoint + "-" + str(evaled_steps) - #swa_path = path + "-swa-" + str(steps) + #swa_path = path + "-swa-" + str(evaled_steps) self.net.pb.training_params.training_steps = evaled_steps self.save_leelaz_weights_v2(leela_path) #print("Weights saved in file: {}".format(leela_path)) @@ -991,16 +991,18 @@ def save_leelaz_weights_v2(self, filename): # This is fragile and at minimum should have some checks to ensure it isn't breaking things. #TODO: also support classic policy head as it has a different set of layers and hence changes the permutation. permuted_tensors = [w for w in all_tensors] - permuted_tensors[-5] = all_tensors[-10] - permuted_tensors[-6] = all_tensors[-11] - permuted_tensors[-7] = all_tensors[-12] + permuted_tensors[-5] = all_tensors[-11] + permuted_tensors[-6] = all_tensors[-12] + permuted_tensors[-7] = all_tensors[-13] permuted_tensors[-8] = all_tensors[-14] - permuted_tensors[-9] = all_tensors[-5] - permuted_tensors[-10] = all_tensors[-6] - permuted_tensors[-11] = all_tensors[-7] - permuted_tensors[-12] = all_tensors[-8] - permuted_tensors[-13] = all_tensors[-9] - permuted_tensors[-14] = all_tensors[-13] + permuted_tensors[-9] = all_tensors[-16] + permuted_tensors[-10] = all_tensors[-5] + permuted_tensors[-11] = all_tensors[-6] + permuted_tensors[-12] = all_tensors[-7] + permuted_tensors[-13] = all_tensors[-8] + permuted_tensors[-14] = all_tensors[-9] + permuted_tensors[-15] = all_tensors[-10] + permuted_tensors[-16] = all_tensors[-15] all_tensors = permuted_tensors for e, nparray in enumerate(all_tensors): From 8681df6c93e9640dea47c6c7af52e23fa9ead25c Mon Sep 17 00:00:00 2001 From: Tilps Date: Mon, 9 Dec 2019 22:02:58 +1100 Subject: [PATCH 15/39] GlobalAveragePooling2D needs to know the data format. 
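Why the one-line change matters: the network runs NCHW throughout, but
GlobalAveragePooling2D defaults to channels_last, so without an explicit
data_format the squeeze-excitation pooling averages over the wrong axes. A
quick shape check illustrating the failure mode:

    import tensorflow as tf

    x = tf.zeros([1, 64, 8, 8])  # NCHW: 64 channels on an 8x8 board
    ok = tf.keras.layers.GlobalAveragePooling2D(data_format='channels_first')(x)
    print(ok.shape)   # (1, 64): one mean per channel, as intended
    bad = tf.keras.layers.GlobalAveragePooling2D()(x)  # channels_last default
    print(bad.shape)  # (1, 8): the last board dimension is mistaken for channels
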
--- tf/tfprocess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tf/tfprocess.py b/tf/tfprocess.py index 633080e7..2735e8db 100644 --- a/tf/tfprocess.py +++ b/tf/tfprocess.py @@ -1142,7 +1142,7 @@ def batch_norm(self, net, scope, scale=False): def squeeze_excitation_v2(self, inputs, channels): - pooled = tf.keras.layers.GlobalAveragePooling2D()(inputs) + pooled = tf.keras.layers.GlobalAveragePooling2D(data_format='channels_first')(inputs) squeezed = tf.keras.layers.Activation('relu')(tf.keras.layers.Dense(channels // self.SE_ratio, kernel_initializer='glorot_normal', kernel_regularizer=self.l2reg)(pooled)) excited = tf.keras.layers.Dense(2 * channels, kernel_initializer='glorot_normal', kernel_regularizer=self.l2reg)(squeezed) return ApplySqueezeExcitation()([inputs, excited]) From 215445d41e10a3790741040ea3db68a58e2bd99a Mon Sep 17 00:00:00 2001 From: Tilps Date: Tue, 10 Dec 2019 10:17:06 +1100 Subject: [PATCH 16/39] Use tf.function on the inner loop of training for massive speed up --- tf/tfprocess.py | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/tf/tfprocess.py b/tf/tfprocess.py index 2735e8db..02ded6ad 100644 --- a/tf/tfprocess.py +++ b/tf/tfprocess.py @@ -461,6 +461,26 @@ def process_loop(self, batch_size, test_batches, batch_splits=1): for _ in range(steps % total_steps, total_steps): self.process(batch_size, test_batches, batch_splits=batch_splits) + @tf.function() + def process_inner_loop(self): + print('tracing inner loop!') + x, y, z, q = next(self.train_iter) + with tf.GradientTape() as tape: + policy, value = self.model(x) + policy_loss = self.policy_loss_fn(y, policy) + reg_term = sum(self.model.losses) + if self.wdl: + value_loss = self.value_loss_fn(self.qMix(z, q), value) + total_loss = self.lossMix(policy_loss, value_loss) + reg_term + else: + mse_loss = self.mse_loss_fn(self.qMix(z, q), value) + total_loss = self.lossMix(policy_loss, mse_loss) + reg_term + if self.wdl: + mse_loss = self.mse_loss_fn(self.qMix(z, q), value) + else: + value_loss = self.value_loss_fn(self.qMix(z, q), value) + return policy_loss, value_loss, mse_loss, reg_term, tape.gradient(total_loss, self.model.trainable_weights) + def process_v2(self, batch_size, test_batches, batch_splits=1): if not self.time_start: self.time_start = time.time() @@ -505,25 +525,10 @@ def process_v2(self, batch_size, test_batches, batch_splits=1): # Run training for this batch grads = None for _ in range(batch_splits): - x, y, z, q = next(self.train_iter) - with tf.GradientTape() as tape: - policy, value = self.model(x) - policy_loss = self.policy_loss_fn(y, policy) - reg_term = sum(self.model.losses) - if self.wdl: - value_loss = self.value_loss_fn(self.qMix(z, q), value) - total_loss = self.lossMix(policy_loss, value_loss) + reg_term - else: - mse_loss = self.mse_loss_fn(self.qMix(z, q), value) - total_loss = self.lossMix(policy_loss, mse_loss) + reg_term - if self.wdl: - mse_loss = self.mse_loss_fn(self.qMix(z, q), value) - else: - value_loss = self.value_loss_fn(self.qMix(z, q), value) + policy_loss, value_loss, mse_loss, reg_term, new_grads = self.process_inner_loop() if not grads: - grads = tape.gradient(total_loss, self.model.trainable_weights) + grads = new_grads else: - new_grads = tape.gradient(total_loss, self.model.trainable_weights) grads = [tf.math.add(a, b) for (a, b) in zip(grads, new_grads)] # Keep running averages # Google's paper scales MSE by 1/4 to a [0, 1] range, so do the same to From 
bdec90303ededf7c613db79a2b3f3f76087b10f7 Mon Sep 17 00:00:00 2001 From: Tilps Date: Tue, 10 Dec 2019 10:56:26 +1100 Subject: [PATCH 17/39] Extract test summary inner loop to tf.function for a bit more performance --- tf/tfprocess.py | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/tf/tfprocess.py b/tf/tfprocess.py index 02ded6ad..54d7fc58 100644 --- a/tf/tfprocess.py +++ b/tf/tfprocess.py @@ -756,6 +756,20 @@ def calculate_swa_summaries(self, test_batches, steps): self.test_writer = true_test_writer self.snap_restore() + @tf.function() + def calculate_test_summaries_inner_loop(self): + print('tracing summaries inner loop!') + x, y, z, q = next(self.test_iter) + policy, value = self.model(x) + policy_loss = self.policy_loss_fn(y, policy) + if self.wdl: + value_loss = self.value_loss_fn(self.qMix(z, q), value) + mse_loss = self.mse_loss_fn(self.qMix(z, q), value) + else: + value_loss = self.value_loss_fn(self.qMix(z, q), value) + mse_loss = self.mse_loss_fn(self.qMix(z, q), value) + return policy_loss, value_loss, mse_loss + def calculate_test_summaries_v2(self, test_batches, steps): sum_policy_accuracy = 0 sum_value_accuracy = 0 @@ -763,18 +777,7 @@ def calculate_test_summaries_v2(self, test_batches, steps): sum_policy = 0 sum_value = 0 for _ in range(0, test_batches): - x, y, z, q = next(self.test_iter) - policy, value = self.model(x) - policy_loss = self.policy_loss_fn(y, policy) - reg_term = sum(self.model.losses) - if self.wdl: - value_loss = self.value_loss_fn(self.qMix(z, q), value) - mse_loss = self.mse_loss_fn(self.qMix(z, q), value) - total_loss = self.lossMix(policy_loss, value_loss) + reg_term - else: - value_loss = self.value_loss_fn(self.qMix(z, q), value) - mse_loss = self.mse_loss_fn(self.qMix(z, q), value) - total_loss = self.lossMix(policy_loss, mse_loss) + reg_term + policy_loss, value_loss, mse_loss = self.calculate_test_summaries_inner_loop() #sum_policy_accuracy += test_policy_accuracy sum_mse += mse_loss sum_policy += policy_loss From 0b173560d4877e8251426576bd721555d60c8118 Mon Sep 17 00:00:00 2001 From: Tilps Date: Wed, 11 Dec 2019 11:08:47 +1100 Subject: [PATCH 18/39] Basic tensorboard summary data writting and some other cleanup related to saving. 
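For orientation, the TF2 summary API adopted below replaces the v1 FileWriter
and Summary protos: create a writer once, log scalars under its as_default()
scope with an explicit step, then flush. A minimal sketch of the pattern (log
path invented for the example):

    import tensorflow as tf

    writer = tf.summary.create_file_writer('leelalogs/example-train')
    with writer.as_default():
        tf.summary.scalar('Policy Loss', 0.123, step=100)
        tf.summary.scalar('LR', 0.01, step=100)
    writer.flush()
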
--- tf/tfprocess.py | 47 +++++++++++++++++++++++++++-------------------- tf/train.py | 10 +++++----- 2 files changed, 32 insertions(+), 25 deletions(-) diff --git a/tf/tfprocess.py b/tf/tfprocess.py index 54d7fc58..359c9be5 100644 --- a/tf/tfprocess.py +++ b/tf/tfprocess.py @@ -136,7 +136,7 @@ def __init__(self, cfg): gpus = tf.config.experimental.list_physical_devices('GPU') tf.config.experimental.set_visible_devices(gpus[self.cfg['gpu']], 'GPU') - self.global_step = tf.Variable(0, name='global_step', trainable=False) + self.global_step = tf.Variable(0, name='global_step', trainable=False, dtype=tf.int64) def init_v2(self, train_dataset, test_dataset): self.l2reg = tf.keras.regularizers.l2(l=0.5 * (0.0001)) @@ -242,6 +242,13 @@ def mse_loss(target, output): self.cfg['training']['lr_boundaries'].sort() self.warmup_steps = self.cfg['training'].get('warmup_steps', 0) self.lr = self.cfg['training']['lr_values'][0] + self.test_writer = tf.summary.create_file_writer( + os.path.join(os.getcwd(), "leelalogs/{}-test".format(self.cfg['name']))) + self.train_writer = tf.summary.create_file_writer( + os.path.join(os.getcwd(), "leelalogs/{}-train".format(self.cfg['name']))) + if self.swa_enabled: + self.swa_writer = tf.summary.create_file_writer( + os.path.join(os.getcwd(), "leelalogs/{}-swa-test".format(self.cfg['name']))) def init_net(self, next_batch): self.x = next_batch[0] # tf.placeholder(tf.float32, [None, 112, 8*8]) @@ -570,16 +577,14 @@ def process_v2(self, batch_size, test_batches, batch_splits=1): #after_weights = self.session.run(self.weights) #update_ratio_summaries = self.compute_update_ratio( # before_weights, after_weights) - - #train_summaries = tf.compat.v1.Summary(value=[ - # tf.compat.v1.Summary.Value(tag="Policy Loss", simple_value=avg_policy_loss), - # tf.compat.v1.Summary.Value(tag="Value Loss", simple_value=avg_value_loss), - # tf.compat.v1.Summary.Value(tag="Reg term", simple_value=avg_reg_term), - # tf.compat.v1.Summary.Value(tag="LR", simple_value=self.lr), - # tf.compat.v1.Summary.Value(tag="Gradient norm", - # simple_value=grad_norm / batch_splits), - # tf.compat.v1.Summary.Value(tag="MSE Loss", simple_value=avg_mse_loss)]) - #self.train_writer.add_summary(train_summaries, steps) + with self.train_writer.as_default(): + tf.summary.scalar("Policy Loss", avg_policy_loss, step=steps) + tf.summary.scalar("Value Loss", avg_value_loss, step=steps) + tf.summary.scalar("Reg term", avg_reg_term, step=steps) + tf.summary.scalar("LR", self.lr, step=steps) + tf.summary.scalar("Gradient norm", grad_norm / batch_splits, step=steps) + tf.summary.scalar("MSE Loss", avg_mse_loss, step=steps) + self.train_writer.flush() #self.train_writer.add_summary(update_ratio_summaries, steps) self.time_start = time_end self.last_steps = steps @@ -602,13 +607,13 @@ def process_v2(self, batch_size, test_batches, batch_splits=1): print("Model saved in file: {}".format(self.manager.latest_checkpoint)) evaled_steps = steps.numpy() leela_path = self.manager.latest_checkpoint + "-" + str(evaled_steps) - #swa_path = path + "-swa-" + str(evaled_steps) + swa_path = self.manager.latest_checkpoint + "-swa-" + str(evaled_steps) self.net.pb.training_params.training_steps = evaled_steps self.save_leelaz_weights_v2(leela_path) - #print("Weights saved in file: {}".format(leela_path)) - #if self.swa_enabled: - # self.save_swa_weights(swa_path) - # print("SWA Weights saved in file: {}".format(swa_path)) + print("Weights saved in file: {}".format(leela_path)) + if self.swa_enabled: + self.save_swa_weights(swa_path) + 
print("SWA Weights saved in file: {}".format(swa_path)) def process(self, batch_size, test_batches, batch_splits=1): if not self.time_start: @@ -798,18 +803,20 @@ def calculate_test_summaries_v2(self, test_batches, steps): self.net.pb.training_params.policy_loss = sum_policy # TODO store value and value accuracy in pb #self.net.pb.training_params.accuracy = sum_policy_accuracy + with self.test_writer.as_default(): + tf.summary.scalar("Policy Loss", sum_policy, step=steps) + tf.summary.scalar("Value Loss", sum_value, step=steps) + tf.summary.scalar("MSE Loss", sum_mse, step=steps) + self.test_writer.flush() + #if self.wdl: # test_summaries = tf.compat.v1.Summary(value=[ # tf.compat.v1.Summary.Value(tag="Policy Accuracy", simple_value=sum_policy_accuracy), # tf.compat.v1.Summary.Value(tag="Value Accuracy", simple_value=sum_value_accuracy), - # tf.compat.v1.Summary.Value(tag="Policy Loss", simple_value=sum_policy), - # tf.compat.v1.Summary.Value(tag="Value Loss", simple_value=sum_value), - # tf.compat.v1.Summary.Value(tag="MSE Loss", simple_value=sum_mse)]).SerializeToString() #else: # test_summaries = tf.compat.v1.Summary(value=[ # tf.compat.v1.Summary.Value(tag="Policy Accuracy", simple_value=sum_policy_accuracy), # tf.compat.v1.Summary.Value(tag="Policy Loss", simple_value=sum_policy), - # tf.compat.v1.Summary.Value(tag="MSE Loss", simple_value=sum_mse)]).SerializeToString() #test_summaries = tf.compat.v1.summary.merge( # [test_summaries] + self.histograms).eval(session=self.session) #self.test_writer.add_summary(test_summaries, steps) diff --git a/tf/train.py b/tf/train.py index b9a7db0c..a4c98462 100755 --- a/tf/train.py +++ b/tf/train.py @@ -156,11 +156,11 @@ def main(cmd): #tfprocess.process_loop(total_batch_size, num_evals, batch_splits=batch_splits) tfprocess.process_loop_v2(total_batch_size, num_evals, batch_splits=batch_splits) - #if cmd.output is not None: - # if cfg['training'].get('swa_output', False): - # tfprocess.save_swa_weights(cmd.output) - # else: - # tfprocess.save_leelaz_weights(cmd.output) + if cmd.output is not None: + if cfg['training'].get('swa_output', False): + tfprocess.save_swa_weights_v2(cmd.output) + else: + tfprocess.save_leelaz_weights_v2(cmd.output) #tfprocess.session.close() train_parser.shutdown() From 43cdaf24f17a9cd705f8565ac63a0ee44fda6621 Mon Sep 17 00:00:00 2001 From: Tilps Date: Wed, 11 Dec 2019 11:32:23 +1100 Subject: [PATCH 19/39] Remove some v1 code which I've done a second pass checking conversion. 
--- tf/tfprocess.py | 220 +----------------------------------------------- 1 file changed, 2 insertions(+), 218 deletions(-) diff --git a/tf/tfprocess.py b/tf/tfprocess.py index 359c9be5..25f32877 100644 --- a/tf/tfprocess.py +++ b/tf/tfprocess.py @@ -1085,19 +1085,6 @@ def save_leelaz_weights(self, filename): self.net.fill_net(all_weights) self.net.save_proto(filename) - def get_batchnorm_key(self): - result = "bn" + str(self.batch_norm_count) - self.batch_norm_count += 1 - return result - - def add_weights(self, var): - if var.name[-11:] == "fp16_cast:0": - name = var.name[:-12] + ":0" - var = tf.compat.v1.get_default_graph().get_tensor_by_name(name) - # All trainable variables should be stored as fp32 - assert var.dtype.base_dtype == tf.float32 - self.weights.append(var) - def batch_norm_v2(self, input, scale=False): if self.renorm_enabled: clipping = { @@ -1114,108 +1101,18 @@ def batch_norm_v2(self, input, scale=False): epsilon=1e-5, axis=1, fused=False, center=True, scale=scale, virtual_batch_size=64)(input) - - def batch_norm(self, net, scope, scale=False): - # The weights are internal to the batchnorm layer, so apply - # a unique scope that we can store, and use to look them back up - # later on. - - with tf.compat.v1.variable_scope(scope, custom_getter=float32_variable_storage_getter): - if self.renorm_enabled: - clipping = { - "rmin": 1.0/self.renorm_max_r, - "rmax": self.renorm_max_r, - "dmax": self.renorm_max_d - } - # Renorm has issues with fp16, cast to fp32. - net = tf.compat.v1.layers.batch_normalization( - tf.cast(net, tf.float32), epsilon=1e-5, axis=1, fused=True, - center=True, scale=scale, - renorm=True, renorm_clipping=clipping, - renorm_momentum=self.renorm_momentum, - training=self.training) - net = tf.cast(net, self.model_dtype) - else: - # Virtual batch doesn't work with fp16 - virtual_batch = 64 if self.model_dtype == tf.float32 else None - net = tf.compat.v1.layers.batch_normalization( - net, epsilon=1e-5, axis=1, fused=True, - center=True, scale=scale, - virtual_batch_size=virtual_batch, - training=self.training) - - for v in ['gamma', 'beta', 'moving_mean', 'moving_variance' ]: - if v == 'gamma' and not scale: - var = tf.Variable(tf.ones(shape=[net.shape[1]]), - name=scope + '/fixed_gamma', trainable=False, - dtype=tf.float32) - else: - name = "fp32_storage/" + scope + '/batch_normalization/' + v + ':0' - var = tf.compat.v1.get_default_graph().get_tensor_by_name(name) - self.add_weights(var) - return net - - def squeeze_excitation_v2(self, inputs, channels): + assert channels % self.SE_ratio == 0 + pooled = tf.keras.layers.GlobalAveragePooling2D(data_format='channels_first')(inputs) squeezed = tf.keras.layers.Activation('relu')(tf.keras.layers.Dense(channels // self.SE_ratio, kernel_initializer='glorot_normal', kernel_regularizer=self.l2reg)(pooled)) excited = tf.keras.layers.Dense(2 * channels, kernel_initializer='glorot_normal', kernel_regularizer=self.l2reg)(squeezed) return ApplySqueezeExcitation()([inputs, excited]) - def squeeze_excitation(self, x, channels, ratio): - - assert channels % ratio == 0 - - # NCHW format reduced to NC - net = tf.reduce_mean(input_tensor=x, axis=[2, 3]) - - W_fc1 = weight_variable([channels, channels // ratio], name='se_fc1_w', - dtype=self.model_dtype) - b_fc1 = bias_variable([channels // ratio], name='se_fc1_b', - dtype=self.model_dtype) - self.add_weights(W_fc1) - self.add_weights(b_fc1) - - net = tf.nn.relu(tf.add(tf.matmul(net, W_fc1), b_fc1)) - - W_fc2 = weight_variable( - [channels // ratio, 2 * channels], 
name='se_fc2_w', - dtype=self.model_dtype) - b_fc2 = bias_variable([2 * channels], name='se_fc2_b', - dtype=self.model_dtype) - self.add_weights(W_fc2) - self.add_weights(b_fc2) - - net = tf.add(tf.matmul(net, W_fc2), b_fc2) - net = tf.reshape(net, [-1, 2 * channels, 1, 1]) - - # Split to scale and bias - gammas, betas = tf.split(net, 2, axis=1) - - out = tf.nn.sigmoid(gammas) * x + betas - - return out - def conv_block_v2(self, inputs, filter_size, output_channels, bn_scale=False): conv = tf.keras.layers.Conv2D(output_channels, filter_size, use_bias=False, padding='same', kernel_initializer='glorot_normal', kernel_regularizer=self.l2reg, data_format='channels_first')(inputs) return tf.keras.layers.Activation('relu')(self.batch_norm_v2(conv, scale=bn_scale)) - def conv_block(self, inputs, filter_size, input_channels, output_channels, bn_scale=False): - # The weights are internal to the batchnorm layer, so apply - # a unique scope that we can store, and use to look them back up - # later on. - weight_key = self.get_batchnorm_key() - conv_key = weight_key + "/conv_weight" - W_conv = weight_variable([filter_size, filter_size, - input_channels, output_channels], name=conv_key, - dtype=self.model_dtype) - - self.add_weights(W_conv) - h_bn = self.batch_norm(conv2d(inputs, W_conv), weight_key, scale=bn_scale) - h_conv = tf.nn.relu(h_bn) - - return h_conv - def residual_block_v2(self, inputs, channels): conv1 = tf.keras.layers.Conv2D(channels, 3, use_bias=False, padding='same', kernel_initializer='glorot_normal', kernel_regularizer=self.l2reg, data_format='channels_first')(inputs) out1 = tf.keras.layers.Activation('relu')(self.batch_norm_v2(conv1, scale=False)) @@ -1223,34 +1120,6 @@ def residual_block_v2(self, inputs, channels): out2 = self.squeeze_excitation_v2(self.batch_norm_v2(conv2, scale=True), channels) return tf.keras.layers.Activation('relu')(tf.keras.layers.add([inputs, out2])) - - def residual_block(self, inputs, channels): - # First convnet - orig = tf.identity(inputs) - weight_key_1 = self.get_batchnorm_key() - conv_key_1 = weight_key_1 + "/conv_weight" - W_conv_1 = weight_variable([3, 3, channels, channels], name=conv_key_1, - dtype=self.model_dtype) - - # Second convnet - weight_key_2 = self.get_batchnorm_key() - conv_key_2 = weight_key_2 + "/conv_weight" - W_conv_2 = weight_variable([3, 3, channels, channels], name=conv_key_2, - dtype=self.model_dtype) - - self.add_weights(W_conv_1) - h_bn1 = self.batch_norm(conv2d(inputs, W_conv_1), weight_key_1, scale=False) - h_out_1 = tf.nn.relu(h_bn1) - - self.add_weights(W_conv_2) - h_bn2 = self.batch_norm(conv2d(h_out_1, W_conv_2), weight_key_2, scale=True) - - with tf.compat.v1.variable_scope(weight_key_2): - h_se = self.squeeze_excitation(h_bn2, channels, self.SE_ratio) - h_out_2 = tf.nn.relu(tf.add(h_se, orig)) - - return h_out_2 - def construct_net_v2(self, inputs): flow = self.conv_block_v2(inputs, filter_size=3, output_channels=self.RESIDUAL_FILTERS, bn_scale=True) for _ in range(0, self.RESIDUAL_BLOCKS): @@ -1278,88 +1147,3 @@ def construct_net_v2(self, inputs): h_fc3 = tf.keras.layers.Dense(1, kernel_initializer='glorot_normal', kernel_regularizer=self.l2reg, activation='tanh')(h_fc2) return h_fc1, h_fc3 - - def construct_net(self, planes): - # NCHW format - # batch, 112 input channels, 8 x 8 - x_planes = tf.reshape(planes, [-1, 112, 8, 8]) - x_planes = tf.cast(x_planes, dtype=self.model_dtype) - - # Input convolution - flow = self.conv_block(x_planes, filter_size=3, - input_channels=112, - output_channels=self.RESIDUAL_FILTERS, - 
bn_scale=True) - # Residual tower - for _ in range(0, self.RESIDUAL_BLOCKS): - flow = self.residual_block(flow, self.RESIDUAL_FILTERS) - - # Policy head - if self.POLICY_HEAD == pb.NetworkFormat.POLICY_CONVOLUTION: - conv_pol = self.conv_block(flow, filter_size=3, - input_channels=self.RESIDUAL_FILTERS, - output_channels=self.RESIDUAL_FILTERS) - W_pol_conv = weight_variable([3, 3, - self.RESIDUAL_FILTERS, 80], name='W_pol_conv2', - dtype=self.model_dtype) - b_pol_conv = bias_variable([80], name='b_pol_conv2', - dtype=self.model_dtype) - - self.add_weights(W_pol_conv) - tf.compat.v1.add_to_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES, b_pol_conv) - self.add_weights(b_pol_conv) - - conv_pol2 = tf.nn.bias_add( - conv2d(conv_pol, W_pol_conv), b_pol_conv, data_format='NCHW') - - h_conv_pol_flat = tf.reshape(conv_pol2, [-1, 80*8*8]) - fc1_init = tf.constant(lc0_az_policy_map.make_map(), dtype=self.model_dtype) - W_fc1 = tf.Variable(fc1_init, trainable=False, name="policy_map") - - h_fc1 = tf.matmul(h_conv_pol_flat, W_fc1, name='policy_head') - elif self.POLICY_HEAD == pb.NetworkFormat.POLICY_CLASSICAL: - conv_pol = self.conv_block(flow, filter_size=1, - input_channels=self.RESIDUAL_FILTERS, - output_channels=self.policy_channels) - h_conv_pol_flat = tf.reshape( - conv_pol, [-1, self.policy_channels*8*8]) - W_fc1 = weight_variable( - [self.policy_channels*8*8, 1858], name='fc1/weight', - dtype=self.model_dtype) - b_fc1 = bias_variable([1858], name='fc1/bias', - dtype=self.model_dtype) - self.add_weights(W_fc1) - tf.compat.v1.add_to_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES, b_fc1) - self.add_weights(b_fc1) - h_fc1 = tf.add(tf.matmul(h_conv_pol_flat, W_fc1), - b_fc1, name='policy_head') - else: - raise ValueError( - "Unknown policy head type {}".format(self.POLICY_HEAD)) - - # Value head - conv_val = self.conv_block(flow, filter_size=1, - input_channels=self.RESIDUAL_FILTERS, - output_channels=32) - h_conv_val_flat = tf.reshape(conv_val, [-1, 32*8*8]) - W_fc2 = weight_variable([32 * 8 * 8, 128], name='fc2/weight', - dtype=self.model_dtype) - b_fc2 = bias_variable([128], name='fc2/bias', dtype=self.model_dtype) - self.add_weights(W_fc2) - self.add_weights(b_fc2) - h_fc2 = tf.nn.relu(tf.add(tf.matmul(h_conv_val_flat, W_fc2), b_fc2)) - value_outputs = 3 if self.wdl else 1 - W_fc3 = weight_variable([128, value_outputs], name='fc3/weight', - dtype=self.model_dtype) - b_fc3 = bias_variable([value_outputs], name='fc3/bias', - dtype=self.model_dtype) - self.add_weights(W_fc3) - self.add_weights(b_fc3) - h_fc3 = tf.add(tf.matmul(h_fc2, W_fc3), b_fc3, name='value_head') - if not self.wdl: - h_fc3 = tf.nn.tanh(h_fc3) - else: - tf.compat.v1.add_to_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES, b_fc3) - - - return h_fc1, h_fc3 From 53c60575507cc13fd84746733e4c3810af4cd898 Mon Sep 17 00:00:00 2001 From: Tilps Date: Wed, 11 Dec 2019 12:04:43 +1100 Subject: [PATCH 20/39] Add a missing _v2. 
--- tf/tfprocess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tf/tfprocess.py b/tf/tfprocess.py index 25f32877..0e760e53 100644 --- a/tf/tfprocess.py +++ b/tf/tfprocess.py @@ -612,7 +612,7 @@ def process_v2(self, batch_size, test_batches, batch_splits=1): self.save_leelaz_weights_v2(leela_path) print("Weights saved in file: {}".format(leela_path)) if self.swa_enabled: - self.save_swa_weights(swa_path) + self.save_swa_weights_v2(swa_path) print("SWA Weights saved in file: {}".format(swa_path)) def process(self, batch_size, test_batches, batch_splits=1): From 6892b1c8f0bca60e103c69465f6cbf717e0628d1 Mon Sep 17 00:00:00 2001 From: Tilps Date: Wed, 11 Dec 2019 12:16:41 +1100 Subject: [PATCH 21/39] Fix bug in saving swa_weights that I hadn't tested before committing... --- tf/tfprocess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tf/tfprocess.py b/tf/tfprocess.py index 0e760e53..0267790e 100644 --- a/tf/tfprocess.py +++ b/tf/tfprocess.py @@ -959,7 +959,7 @@ def save_swa_weights_v2(self, filename): backup = [w.read_value() for w in self.model.weights] for (swa, w) in zip(self.swa_weights, self.model.weights): w.assign(swa.read_value()) - self.save_leelaz_weights_v2(self, filename) + self.save_leelaz_weights_v2(filename) for (old, w) in zip(backup, self.model.weights): w.assign(old) From c6109361f50b7fb73a90536c8a7a1c223fa4a3c9 Mon Sep 17 00:00:00 2001 From: Tilps Date: Wed, 11 Dec 2019 14:28:50 +1100 Subject: [PATCH 22/39] Add mixed precision support to tf2 version. --- tf/tfprocess.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/tf/tfprocess.py b/tf/tfprocess.py index 0267790e..ecc8c325 100644 --- a/tf/tfprocess.py +++ b/tf/tfprocess.py @@ -53,7 +53,7 @@ def __init__(self, **kwargs): def call(self, inputs): h_conv_pol_flat = tf.reshape(inputs, [-1, 80*8*8]) - return tf.matmul(h_conv_pol_flat, self.fc1) + return tf.matmul(h_conv_pol_flat, tf.cast(self.fc1, h_conv_pol_flat.dtype)) def bias_variable(shape, name=None, dtype=tf.float32): @@ -135,6 +135,9 @@ def __init__(self, cfg): self.session = tf.compat.v1.Session(config=config) gpus = tf.config.experimental.list_physical_devices('GPU') tf.config.experimental.set_visible_devices(gpus[self.cfg['gpu']], 'GPU') + if self.model_dtype == tf.float16: + tf.keras.mixed_precision.experimental.set_policy('mixed_float16') + self.global_step = tf.Variable(0, name='global_step', trainable=False, dtype=tf.int64) @@ -145,7 +148,7 @@ def init_v2(self, train_dataset, test_dataset): self.test_dataset = test_dataset self.test_iter = iter(test_dataset) self.init_net_v2() - self.checkpoint = tf.train.Checkpoint(optimizer=self.optimizer, model=self.model, global_step=self.global_step, swa_count=self.swa_count) + self.checkpoint = tf.train.Checkpoint(optimizer=self.orig_optimizer, model=self.model, global_step=self.global_step, swa_count=self.swa_count) self.checkpoint.listed = self.swa_weights self.manager = tf.train.CheckpointManager( self.checkpoint, directory=self.root_dir, max_to_keep=50, keep_checkpoint_every_n_hours=24) @@ -181,7 +184,11 @@ def init_net_v2(self): self.active_lr = 0.01 # TODO set up optimizers and loss functions. 
self.optimizer = tf.keras.optimizers.SGD(learning_rate=lambda: self.active_lr, momentum=0.9, nesterov=True) - def policy_loss(target, output): + self.orig_optimizer = self.optimizer + if self.loss_scale != 1: + self.optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(self.optimizer, self.loss_scale) + def policy_loss(target, output): + output = tf.cast(output, tf.float32) # Calculate loss on policy head if self.cfg['training'].get('mask_legal_moves'): # extract mask for legal moves from target policy @@ -209,12 +216,14 @@ def policy_loss(target, output): # Loss on value head if self.wdl: def value_loss(target, output): + output = tf.cast(output, tf.float32) value_cross_entropy = \ tf.nn.softmax_cross_entropy_with_logits(labels=tf.stop_gradient(target), logits=output) return tf.reduce_mean(input_tensor=value_cross_entropy) self.value_loss_fn = value_loss def mse_loss(target, output): + output = tf.cast(output, tf.float32) scalar_z_conv = tf.matmul(tf.nn.softmax(output), wdl) scalar_target = tf.matmul(target, wdl) return tf.reduce_mean(input_tensor=tf.math.squared_difference(scalar_target, scalar_z_conv)) @@ -224,6 +233,7 @@ def value_loss(target, output): return tf.constant(0) self.value_loss_fn = value_loss def mse_loss(target, output): + output = tf.cast(output, tf.float32) scalar_target = tf.matmul(target, wdl) return tf.reduce_mean(input_tensor=tf.math.squared_difference(scalar_target, output)) self.mse_loss_fn = mse_loss @@ -482,6 +492,8 @@ def process_inner_loop(self): else: mse_loss = self.mse_loss_fn(self.qMix(z, q), value) total_loss = self.lossMix(policy_loss, mse_loss) + reg_term + if self.loss_scale != 1: + total_loss = self.optimizer.get_scaled_loss(total_loss) if self.wdl: mse_loss = self.mse_loss_fn(self.qMix(z, q), value) else: @@ -548,6 +560,8 @@ def process_v2(self, batch_size, test_batches, batch_splits=1): self.avg_reg_term.append(reg_term) # Gradients of batch splits are summed, not averaged like usual, so need to scale lr accordingly to correct for this. self.active_lr = self.lr / batch_splits + if self.loss_scale != 1: + grads = self.optimizer.get_unscaled_gradients(grads) max_grad_norm = self.cfg['training'].get('max_grad_norm', 10000.0) * batch_splits grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) self.optimizer.apply_gradients(zip(grads, self.model.trainable_weights)) From 5a2a95b2a8ef37b815ee7b0a0d666fe48e43b122 Mon Sep 17 00:00:00 2001 From: Tilps Date: Wed, 11 Dec 2019 15:00:05 +1100 Subject: [PATCH 23/39] Renorm also doesn't actually support fused. --- tf/tfprocess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tf/tfprocess.py b/tf/tfprocess.py index ecc8c325..b7dc3509 100644 --- a/tf/tfprocess.py +++ b/tf/tfprocess.py @@ -1107,7 +1107,7 @@ def batch_norm_v2(self, input, scale=False): "dmax": self.renorm_max_d } return tf.keras.layers.BatchNormalization( - epsilon=1e-5, axis=1, fused=True, center=True, + epsilon=1e-5, axis=1, fused=False, center=True, scale=scale, renorm=True, renorm_clipping=clipping, renorm_momentum=self.renorm_momentum)(input) else: From 6a37e9f941fc249abe45bc6f154b19e830f3776a Mon Sep 17 00:00:00 2001 From: Tilps Date: Wed, 11 Dec 2019 19:42:38 +1100 Subject: [PATCH 24/39] Re-add accuracy reporting. 
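Accuracy here is top-1 agreement: the argmax of the network head matches the
argmax of the training target, averaged over the batch (the caller scales it
to a percentage). The metric as a standalone sketch:

    import tensorflow as tf

    def accuracy(target, output):
        output = tf.cast(output, tf.float32)
        match = tf.equal(tf.argmax(target, axis=1), tf.argmax(output, axis=1))
        return tf.reduce_mean(tf.cast(match, tf.float32))
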
--- tf/tfprocess.py | 44 +++++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/tf/tfprocess.py b/tf/tfprocess.py index b7dc3509..6a976843 100644 --- a/tf/tfprocess.py +++ b/tf/tfprocess.py @@ -182,7 +182,6 @@ def init_net_v2(self): self.swa_weights = [tf.Variable(w, trainable=False) for w in self.model.weights] self.active_lr = 0.01 - # TODO set up optimizers and loss functions. self.optimizer = tf.keras.optimizers.SGD(learning_rate=lambda: self.active_lr, momentum=0.9, nesterov=True) self.orig_optimizer = self.optimizer if self.loss_scale != 1: @@ -241,7 +240,12 @@ def mse_loss(target, output): pol_loss_w = self.cfg['training']['policy_loss_weight'] val_loss_w = self.cfg['training']['value_loss_weight'] self.lossMix = lambda policy, value: pol_loss_w * policy + val_loss_w * value - + + def accuracy(target, output): + output = tf.cast(output, tf.float32) + return tf.reduce_mean(tf.cast(tf.equal(tf.argmax(input=target, axis=1), tf.argmax(input=output, axis=1)), tf.float32)) + self.accuracy_fn = accuracy + self.avg_policy_loss = [] self.avg_value_loss = [] self.avg_mse_loss = [] @@ -780,14 +784,17 @@ def calculate_test_summaries_inner_loop(self): print('tracing summaries inner loop!') x, y, z, q = next(self.test_iter) policy, value = self.model(x) - policy_loss = self.policy_loss_fn(y, policy) + policy_loss = self.policy_loss_fn(y, policy) + policy_accuracy = self.accuracy_fn(y, policy) if self.wdl: value_loss = self.value_loss_fn(self.qMix(z, q), value) mse_loss = self.mse_loss_fn(self.qMix(z, q), value) + value_accuracy = self.accuracy_fn(self.qMix(z,q), value) else: value_loss = self.value_loss_fn(self.qMix(z, q), value) mse_loss = self.mse_loss_fn(self.qMix(z, q), value) - return policy_loss, value_loss, mse_loss + value_accuracy = tf.constant(0.) 
+ return policy_loss, value_loss, mse_loss, policy_accuracy, value_accuracy def calculate_test_summaries_v2(self, test_batches, steps): sum_policy_accuracy = 0 @@ -796,41 +803,36 @@ def calculate_test_summaries_v2(self, test_batches, steps): sum_policy = 0 sum_value = 0 for _ in range(0, test_batches): - policy_loss, value_loss, mse_loss = self.calculate_test_summaries_inner_loop() - #sum_policy_accuracy += test_policy_accuracy + policy_loss, value_loss, mse_loss, policy_accuracy, value_accuracy = self.calculate_test_summaries_inner_loop() + sum_policy_accuracy += policy_accuracy sum_mse += mse_loss sum_policy += policy_loss if self.wdl: - #sum_value_accuracy += test_value_accuracy + sum_value_accuracy += value_accuracy sum_value += value_loss - #sum_policy_accuracy /= test_batches - #sum_policy_accuracy *= 100 + sum_policy_accuracy /= test_batches + sum_policy_accuracy *= 100 sum_policy /= test_batches sum_value /= test_batches - #if self.wdl: - #sum_value_accuracy /= test_batches - #sum_value_accuracy *= 100 + if self.wdl: + sum_value_accuracy /= test_batches + sum_value_accuracy *= 100 # Additionally rescale to [0, 1] so divide by 4 sum_mse /= (4.0 * test_batches) self.net.pb.training_params.learning_rate = self.lr self.net.pb.training_params.mse_loss = sum_mse self.net.pb.training_params.policy_loss = sum_policy # TODO store value and value accuracy in pb - #self.net.pb.training_params.accuracy = sum_policy_accuracy + self.net.pb.training_params.accuracy = sum_policy_accuracy with self.test_writer.as_default(): tf.summary.scalar("Policy Loss", sum_policy, step=steps) tf.summary.scalar("Value Loss", sum_value, step=steps) tf.summary.scalar("MSE Loss", sum_mse, step=steps) + tf.summary.scalar("Policy Accuracy", sum_policy_accuracy, step=steps) + if self.wdl: + tf.summary.scalar("Value Accuracy", sum_value_accuracy, step=steps) self.test_writer.flush() - #if self.wdl: - # test_summaries = tf.compat.v1.Summary(value=[ - # tf.compat.v1.Summary.Value(tag="Policy Accuracy", simple_value=sum_policy_accuracy), - # tf.compat.v1.Summary.Value(tag="Value Accuracy", simple_value=sum_value_accuracy), - #else: - # test_summaries = tf.compat.v1.Summary(value=[ - # tf.compat.v1.Summary.Value(tag="Policy Accuracy", simple_value=sum_policy_accuracy), - # tf.compat.v1.Summary.Value(tag="Policy Loss", simple_value=sum_policy), #test_summaries = tf.compat.v1.summary.merge( # [test_summaries] + self.histograms).eval(session=self.session) #self.test_writer.add_summary(test_summaries, steps) From f2f32a2fb4cebd0ae09706d4938d4a6c7bae5de7 Mon Sep 17 00:00:00 2001 From: Tilps Date: Wed, 11 Dec 2019 19:52:44 +1100 Subject: [PATCH 25/39] Some more cleanup of pre-v2 code that is converted or close enough to no longer be useful, and a fix for swa summaries being written to the wrong file. 
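The "wrong file" part of this fix: calculate_test_summaries_v2 always logs to
self.test_writer, so evaluating the SWA weights has to temporarily swap in the
dedicated SWA writer (the swap existed earlier only as a commented-out line).
A sketch of the pattern inside calculate_swa_summaries_v2, assuming the writer
attributes created in init_net_v2:

    # Route SWA evaluation summaries to the SWA log, then restore.
    true_test_writer, self.test_writer = self.test_writer, self.swa_writer
    self.calculate_test_summaries_v2(test_batches, steps)
    self.test_writer = true_test_writer
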
--- tf/tfprocess.py | 454 +----------------------------------------------- 1 file changed, 2 insertions(+), 452 deletions(-) diff --git a/tf/tfprocess.py b/tf/tfprocess.py index 6a976843..6b5173a2 100644 --- a/tf/tfprocess.py +++ b/tf/tfprocess.py @@ -116,9 +116,6 @@ def __init__(self, cfg): self.net.set_valueformat(self.VALUE_HEAD) - # For exporting - self.weights = [] - self.swa_enabled = self.cfg['training'].get('swa', False) # Limit momentum of SWA exponential average to 1 - 1/(swa_max_n + 1) @@ -129,16 +126,11 @@ def __init__(self, cfg): self.renorm_max_d = self.cfg['training'].get('renorm_max_d', 0) self.renorm_momentum = self.cfg['training'].get('renorm_momentum', 0.99) - gpu_options = tf.compat.v1.GPUOptions(per_process_gpu_memory_fraction=0.90, - allow_growth=True, visible_device_list="{}".format(self.cfg['gpu'])) - config = tf.compat.v1.ConfigProto(gpu_options=gpu_options) - self.session = tf.compat.v1.Session(config=config) gpus = tf.config.experimental.list_physical_devices('GPU') tf.config.experimental.set_visible_devices(gpus[self.cfg['gpu']], 'GPU') if self.model_dtype == tf.float16: tf.keras.mixed_precision.experimental.set_policy('mixed_float16') - self.global_step = tf.Variable(0, name='global_step', trainable=False, dtype=tf.int64) def init_v2(self, train_dataset, test_dataset): @@ -153,23 +145,6 @@ def init_v2(self, train_dataset, test_dataset): self.manager = tf.train.CheckpointManager( self.checkpoint, directory=self.root_dir, max_to_keep=50, keep_checkpoint_every_n_hours=24) - def init(self, dataset, train_iterator, test_iterator): - self.training = tf.compat.v1.placeholder(tf.bool) - self.learning_rate = tf.compat.v1.placeholder(tf.float32) - # TF variables - self.handle = tf.compat.v1.placeholder(tf.string, shape=[]) - iterator = tf.compat.v1.data.Iterator.from_string_handle( - self.handle, tf.compat.v1.data.get_output_types(dataset), tf.compat.v1.data.get_output_shapes(dataset)) - self.next_batch = iterator.get_next() - self.train_handle = self.session.run(train_iterator.string_handle()) - self.test_handle = self.session.run(test_iterator.string_handle()) - self.l2reg = tf.keras.regularizers.l2(l=0.5 * (0.0001)) - - # This forces trainable variables to be stored as fp32 - with tf.compat.v1.variable_scope("fp32_storage", - custom_getter=float32_variable_storage_getter): - self.init_net(self.next_batch) - def init_net_v2(self): input_var = tf.keras.Input(shape=(112, 8*8)) x_planes = tf.keras.layers.Reshape([112, 8, 8])(input_var) @@ -264,156 +239,6 @@ def accuracy(target, output): self.swa_writer = tf.summary.create_file_writer( os.path.join(os.getcwd(), "leelalogs/{}-swa-test".format(self.cfg['name']))) - def init_net(self, next_batch): - self.x = next_batch[0] # tf.placeholder(tf.float32, [None, 112, 8*8]) - self.y_ = next_batch[1] # tf.placeholder(tf.float32, [None, 1858]) - self.z_ = next_batch[2] # tf.placeholder(tf.float32, [None, 3]) - self.q_ = next_batch[3] # tf.placeholder(tf.float32, [None, 3]) - self.batch_norm_count = 0 - self.y_conv, self.z_conv = self.construct_net(self.x) - - if self.model_dtype != tf.float32: - self.y_conv = tf.cast(self.y_conv, tf.float32) - self.z_conv = tf.cast(self.z_conv, tf.float32) - - # Calculate loss on policy head - if self.cfg['training'].get('mask_legal_moves'): - # extract mask for legal moves from target policy - move_is_legal = tf.greater_equal(self.y_, 0) - # replace logits of illegal moves with large negative value (so that it doesn't affect policy of legal moves) without gradient - illegal_filler = 
tf.zeros_like(self.y_conv) - 1.0e10 - self.y_conv = tf.compat.v1.where_v2(move_is_legal, self.y_conv, illegal_filler) - # y_ still has -1 on illegal moves, flush them to 0 - self.y_ = tf.nn.relu(self.y_) - - policy_cross_entropy = \ - tf.compat.v1.nn.softmax_cross_entropy_with_logits_v2(labels=tf.stop_gradient(self.y_), - logits=self.y_conv) - self.policy_loss = tf.reduce_mean(input_tensor=policy_cross_entropy) - - q_ratio = self.cfg['training'].get('q_ratio', 0) - assert 0 <= q_ratio <= 1 - target = self.q_ * q_ratio + self.z_ * (1 - q_ratio) - - # Linear conversion to scalar to compute MSE with, for comparison to old values - wdl = tf.expand_dims(tf.constant([1.0, 0.0, -1.0]), 1) - scalar_target = tf.matmul(target, wdl) - - # Loss on value head - if self.wdl: - value_cross_entropy = \ - tf.compat.v1.nn.softmax_cross_entropy_with_logits_v2(labels=tf.stop_gradient(target), - logits=self.z_conv) - self.value_loss = tf.reduce_mean(input_tensor=value_cross_entropy) - scalar_z_conv = tf.matmul(tf.nn.softmax(self.z_conv), wdl) - self.mse_loss = \ - tf.reduce_mean(input_tensor=tf.math.squared_difference(scalar_target, scalar_z_conv)) - else: - self.value_loss = tf.constant(0) - self.mse_loss = \ - tf.reduce_mean(input_tensor=tf.math.squared_difference(scalar_target, self.z_conv)) - - # Regularizer - reg_variables = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES) - penalties = [self.l2reg(w) for w in reg_variables] - self.reg_term = tf.math.add_n(penalties) - - if self.model_dtype != tf.float32: - self.reg_term = tf.cast(self.reg_term, tf.float32) - - # For training from a (smaller) dataset of strong players, you will - # want to reduce the factor in front of self.mse_loss here. - pol_loss_w = self.cfg['training']['policy_loss_weight'] - val_loss_w = self.cfg['training']['value_loss_weight'] - if self.wdl: - value_loss = self.value_loss - else: - value_loss = self.mse_loss - loss = pol_loss_w * self.policy_loss + \ - val_loss_w * value_loss + self.reg_term - - # Set adaptive learning rate during training - self.cfg['training']['lr_boundaries'].sort() - self.warmup_steps = self.cfg['training'].get('warmup_steps', 0) - self.lr = self.cfg['training']['lr_values'][0] - - # You need to change the learning rate here if you are training - # from a self-play training set, for example start with 0.005 instead. - opt_op = tf.compat.v1.train.MomentumOptimizer( - learning_rate=self.learning_rate, momentum=0.9, use_nesterov=True) - - opt_op = LossScalingOptimizer(opt_op, scale=self.loss_scale) - - # Do swa after we contruct the net - if self.swa_enabled: - # Count of networks accumulated into SWA - self.swa_count = tf.Variable(0., name='swa_count', trainable=False) - # Build the SWA variables and accumulators - accum = [] - load = [] - n = self.swa_count - for w in self.weights: - name = w.name.split(':')[0] - var = tf.Variable( - tf.zeros(shape=w.shape), name='swa/'+name, trainable=False) - accum.append( - tf.compat.v1.assign(var, var * (n / (n + 1.)) + tf.stop_gradient(w) * (1. / (n + 1.)))) - load.append(tf.compat.v1.assign(w, var)) - with tf.control_dependencies(accum): - self.swa_accum_op = tf.compat.v1.assign_add(n, 1.) - self.swa_load_op = tf.group(*load) - - # Accumulate (possibly multiple) gradient updates to simulate larger batch sizes than can be held in GPU memory. 
- gradient_accum = [tf.Variable(tf.zeros_like( - var.initialized_value()), trainable=False) for var in tf.compat.v1.trainable_variables()] - self.zero_op = [var.assign(tf.zeros_like(var)) - for var in gradient_accum] - - self.update_ops = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS) - with tf.control_dependencies(self.update_ops): - gradients = opt_op.compute_gradients(loss) - self.accum_op = [accum.assign_add( - gradient[0]) for accum, gradient in zip(gradient_accum, gradients)] - # gradients are num_batch_splits times higher due to accumulation by summing, so the norm will be too - max_grad_norm = self.cfg['training'].get( - 'max_grad_norm', 10000.0) * self.cfg['training'].get('num_batch_splits', 1) - gradient_accum, self.grad_norm = tf.clip_by_global_norm( - gradient_accum, max_grad_norm) - self.train_op = opt_op.apply_gradients( - [(accum, gradient[1]) for accum, gradient in zip(gradient_accum, gradients)], global_step=self.global_step) - - correct_policy_prediction = \ - tf.equal(tf.argmax(input=self.y_conv, axis=1), tf.argmax(input=self.y_, axis=1)) - correct_policy_prediction = tf.cast(correct_policy_prediction, tf.float32) - self.policy_accuracy = tf.reduce_mean(input_tensor=correct_policy_prediction) - correct_value_prediction = \ - tf.equal(tf.argmax(input=self.z_conv, axis=1), tf.argmax(input=self.z_, axis=1)) - correct_value_prediction = tf.cast(correct_value_prediction, tf.float32) - self.value_accuracy = tf.reduce_mean(input_tensor=correct_value_prediction) - - self.avg_policy_loss = [] - self.avg_value_loss = [] - self.avg_mse_loss = [] - self.avg_reg_term = [] - self.time_start = None - self.last_steps = None - - # Summary part - self.test_writer = tf.compat.v1.summary.FileWriter( - os.path.join(os.getcwd(), "leelalogs/{}-test".format(self.cfg['name']))) - self.train_writer = tf.compat.v1.summary.FileWriter( - os.path.join(os.getcwd(), "leelalogs/{}-train".format(self.cfg['name']))) - if self.swa_enabled: - self.swa_writer = tf.compat.v1.summary.FileWriter( - os.path.join(os.getcwd(), "leelalogs/{}-swa-test".format(self.cfg['name']))) - self.histograms = [tf.compat.v1.summary.histogram( - weight.name, weight) for weight in self.weights] - - self.init = tf.compat.v1.global_variables_initializer() - self.saver = tf.compat.v1.train.Saver() - - self.session.run(self.init) - def replace_weights(self, new_weights): all_evals = [] for e, weights in enumerate(self.weights): @@ -462,10 +287,6 @@ def restore_v2(self): print("Restoring from {0}".format(self.manager.latest_checkpoint)) self.checkpoint.restore(self.manager.latest_checkpoint) - def restore(self, file): - print("Restoring from {0}".format(file)) - self.saver.restore(self.session, file) - def process_loop_v2(self, batch_size, test_batches, batch_splits=1): # Get the initial steps value in case this is a resume from a step count # which is not a multiple of total_steps. @@ -474,14 +295,6 @@ def process_loop_v2(self, batch_size, test_batches, batch_splits=1): for _ in range(steps % total_steps, total_steps): self.process_v2(batch_size, test_batches, batch_splits=batch_splits) - def process_loop(self, batch_size, test_batches, batch_splits=1): - # Get the initial steps value in case this is a resume from a step count - # which is not a multiple of total_steps. 
- steps = tf.compat.v1.train.global_step(self.session, self.global_step) - total_steps = self.cfg['training']['total_steps'] - for _ in range(steps % total_steps, total_steps): - self.process(batch_size, test_batches, batch_splits=batch_splits) - @tf.function() def process_inner_loop(self): print('tracing inner loop!') @@ -633,152 +446,17 @@ def process_v2(self, batch_size, test_batches, batch_splits=1): self.save_swa_weights_v2(swa_path) print("SWA Weights saved in file: {}".format(swa_path)) - def process(self, batch_size, test_batches, batch_splits=1): - if not self.time_start: - self.time_start = time.time() - - # Get the initial steps value before we do a training step. - steps = tf.compat.v1.train.global_step(self.session, self.global_step) - if not self.last_steps: - self.last_steps = steps - - if self.swa_enabled: - # split half of test_batches between testing regular weights and SWA weights - test_batches //= 2 - - # Run test before first step to see delta since end of last run. - if steps % self.cfg['training']['total_steps'] == 0: - # Steps is given as one higher than current in order to avoid it - # being equal to the value the end of a run is stored against. - self.calculate_test_summaries(test_batches, steps + 1) - if self.swa_enabled: - self.calculate_swa_summaries(test_batches, steps + 1) - - # Make sure that ghost batch norm can be applied - if batch_size % 64 != 0: - # Adjust required batch size for batch splitting. - required_factor = 64 * \ - self.cfg['training'].get('num_batch_splits', 1) - raise ValueError( - 'batch_size must be a multiple of {}'.format(required_factor)) - - # Determine learning rate - lr_values = self.cfg['training']['lr_values'] - lr_boundaries = self.cfg['training']['lr_boundaries'] - steps_total = steps % self.cfg['training']['total_steps'] - self.lr = lr_values[bisect.bisect_right(lr_boundaries, steps_total)] - if self.warmup_steps > 0 and steps < self.warmup_steps: - self.lr = self.lr * (steps + 1) / self.warmup_steps - - # need to add 1 to steps because steps will be incremented after gradient update - if (steps + 1) % self.cfg['training']['train_avg_report_steps'] == 0 or (steps + 1) % self.cfg['training']['total_steps'] == 0: - before_weights = self.session.run(self.weights) - - # Run training for this batch - self.session.run(self.zero_op) - for _ in range(batch_splits): - policy_loss, value_loss, mse_loss, reg_term, _, _ = self.session.run( - [self.policy_loss, self.value_loss, self.mse_loss, self.reg_term, self.accum_op, - self.next_batch], - feed_dict={self.training: True, self.handle: self.train_handle}) - # Keep running averages - # Google's paper scales MSE by 1/4 to a [0, 1] range, so do the same to - # get comparable values. - mse_loss /= 4.0 - self.avg_policy_loss.append(policy_loss) - if self.wdl: - self.avg_value_loss.append(value_loss) - self.avg_mse_loss.append(mse_loss) - self.avg_reg_term.append(reg_term) - # Gradients of batch splits are summed, not averaged like usual, so need to scale lr accordingly to correct for this. - corrected_lr = self.lr / batch_splits - _, grad_norm = self.session.run([self.train_op, self.grad_norm], - feed_dict={self.learning_rate: corrected_lr, self.training: True, self.handle: self.train_handle}) - - # Update steps since training should have incremented it. 
- steps = tf.compat.v1.train.global_step(self.session, self.global_step) - - if steps % self.cfg['training']['train_avg_report_steps'] == 0 or steps % self.cfg['training']['total_steps'] == 0: - pol_loss_w = self.cfg['training']['policy_loss_weight'] - val_loss_w = self.cfg['training']['value_loss_weight'] - time_end = time.time() - speed = 0 - if self.time_start: - elapsed = time_end - self.time_start - steps_elapsed = steps - self.last_steps - speed = batch_size * (steps_elapsed / elapsed) - avg_policy_loss = np.mean(self.avg_policy_loss or [0]) - avg_value_loss = np.mean(self.avg_value_loss or [0]) - avg_mse_loss = np.mean(self.avg_mse_loss or [0]) - avg_reg_term = np.mean(self.avg_reg_term or [0]) - print("step {}, lr={:g} policy={:g} value={:g} mse={:g} reg={:g} total={:g} ({:g} pos/s)".format( - steps, self.lr, avg_policy_loss, avg_value_loss, avg_mse_loss, avg_reg_term, - pol_loss_w * avg_policy_loss + val_loss_w * avg_value_loss + avg_reg_term, - speed)) - - after_weights = self.session.run(self.weights) - update_ratio_summaries = self.compute_update_ratio( - before_weights, after_weights) - - train_summaries = tf.compat.v1.Summary(value=[ - tf.compat.v1.Summary.Value(tag="Policy Loss", simple_value=avg_policy_loss), - tf.compat.v1.Summary.Value(tag="Value Loss", simple_value=avg_value_loss), - tf.compat.v1.Summary.Value(tag="Reg term", simple_value=avg_reg_term), - tf.compat.v1.Summary.Value(tag="LR", simple_value=self.lr), - tf.compat.v1.Summary.Value(tag="Gradient norm", - simple_value=grad_norm / batch_splits), - tf.compat.v1.Summary.Value(tag="MSE Loss", simple_value=avg_mse_loss)]) - self.train_writer.add_summary(train_summaries, steps) - self.train_writer.add_summary(update_ratio_summaries, steps) - self.time_start = time_end - self.last_steps = steps - self.avg_policy_loss, self.avg_value_loss, self.avg_mse_loss, self.avg_reg_term = [], [], [], [] - - if self.swa_enabled and steps % self.cfg['training']['swa_steps'] == 0: - self.update_swa() - - # Calculate test values every 'test_steps', but also ensure there is - # one at the final step so the delta to the first step can be calculted. - if steps % self.cfg['training']['test_steps'] == 0 or steps % self.cfg['training']['total_steps'] == 0: - self.calculate_test_summaries(test_batches, steps) - if self.swa_enabled: - self.calculate_swa_summaries(test_batches, steps) - - # Save session and weights at end, and also optionally every 'checkpoint_steps'. 
- if steps % self.cfg['training']['total_steps'] == 0 or ( - 'checkpoint_steps' in self.cfg['training'] and steps % self.cfg['training']['checkpoint_steps'] == 0): - path = os.path.join(self.root_dir, self.cfg['name']) - save_path = self.saver.save(self.session, path, global_step=steps) - print("Model saved in file: {}".format(save_path)) - leela_path = path + "-" + str(steps) - swa_path = path + "-swa-" + str(steps) - self.net.pb.training_params.training_steps = steps - self.save_leelaz_weights(leela_path) - print("Weights saved in file: {}".format(leela_path)) - if self.swa_enabled: - self.save_swa_weights(swa_path) - print("SWA Weights saved in file: {}".format(swa_path)) - def calculate_swa_summaries_v2(self, test_batches, steps): backup = [w.read_value() for w in self.model.weights] for (swa, w) in zip(self.swa_weights, self.model.weights): w.assign(swa.read_value()) - #true_test_writer, self.test_writer = self.test_writer, self.swa_writer + true_test_writer, self.test_writer = self.test_writer, self.swa_writer print('swa', end=' ') self.calculate_test_summaries_v2(test_batches, steps) - #self.test_writer = true_test_writer + self.test_writer = true_test_writer for (old, w) in zip(backup, self.model.weights): w.assign(old) - def calculate_swa_summaries(self, test_batches, steps): - self.snap_save() - self.session.run(self.swa_load_op) - true_test_writer, self.test_writer = self.test_writer, self.swa_writer - print('swa', end=' ') - self.calculate_test_summaries(test_batches, steps) - self.test_writer = true_test_writer - self.snap_restore() - @tf.function() def calculate_test_summaries_inner_loop(self): print('tracing summaries inner loop!') @@ -839,56 +517,6 @@ def calculate_test_summaries_v2(self, test_batches, steps): print("step {}, policy={:g} value={:g} policy accuracy={:g}% value accuracy={:g}% mse={:g}".\ format(steps, sum_policy, sum_value, sum_policy_accuracy, sum_value_accuracy, sum_mse)) - def calculate_test_summaries(self, test_batches, steps): - sum_policy_accuracy = 0 - sum_value_accuracy = 0 - sum_mse = 0 - sum_policy = 0 - sum_value = 0 - for _ in range(0, test_batches): - test_policy, test_value, test_policy_accuracy, test_value_accuracy, test_mse, _ = self.session.run( - [self.policy_loss, self.value_loss, self.policy_accuracy, self.value_accuracy, self.mse_loss, - self.next_batch], - feed_dict={self.training: False, - self.handle: self.test_handle}) - sum_policy_accuracy += test_policy_accuracy - sum_mse += test_mse - sum_policy += test_policy - if self.wdl: - sum_value_accuracy += test_value_accuracy - sum_value += test_value - sum_policy_accuracy /= test_batches - sum_policy_accuracy *= 100 - sum_policy /= test_batches - sum_value /= test_batches - if self.wdl: - sum_value_accuracy /= test_batches - sum_value_accuracy *= 100 - # Additionally rescale to [0, 1] so divide by 4 - sum_mse /= (4.0 * test_batches) - self.net.pb.training_params.learning_rate = self.lr - self.net.pb.training_params.mse_loss = sum_mse - self.net.pb.training_params.policy_loss = sum_policy - # TODO store value and value accuracy in pb - self.net.pb.training_params.accuracy = sum_policy_accuracy - if self.wdl: - test_summaries = tf.compat.v1.Summary(value=[ - tf.compat.v1.Summary.Value(tag="Policy Accuracy", simple_value=sum_policy_accuracy), - tf.compat.v1.Summary.Value(tag="Value Accuracy", simple_value=sum_value_accuracy), - tf.compat.v1.Summary.Value(tag="Policy Loss", simple_value=sum_policy), - tf.compat.v1.Summary.Value(tag="Value Loss", simple_value=sum_value), - 
tf.compat.v1.Summary.Value(tag="MSE Loss", simple_value=sum_mse)]).SerializeToString() - else: - test_summaries = tf.compat.v1.Summary(value=[ - tf.compat.v1.Summary.Value(tag="Policy Accuracy", simple_value=sum_policy_accuracy), - tf.compat.v1.Summary.Value(tag="Policy Loss", simple_value=sum_policy), - tf.compat.v1.Summary.Value(tag="MSE Loss", simple_value=sum_mse)]).SerializeToString() - test_summaries = tf.compat.v1.summary.merge( - [test_summaries] + self.histograms).eval(session=self.session) - self.test_writer.add_summary(test_summaries, steps) - print("step {}, policy={:g} value={:g} policy accuracy={:g}% value accuracy={:g}% mse={:g}".\ - format(steps, sum_policy, sum_value, sum_policy_accuracy, sum_value_accuracy, sum_mse)) - def compute_update_ratio(self, before_weights, after_weights): """Compute the ratio of gradient norm to weight norm. @@ -945,32 +573,6 @@ def update_swa_v2(self): swa.assign(swa.read_value() * (num / (num + 1.)) + w.read_value() * (1. / (num + 1.))) self.swa_count.assign(min(num + 1., self.swa_max_n)) - def update_swa(self): - # Add the current weight vars to the running average. - num = self.session.run(self.swa_accum_op) - num = min(num, self.swa_max_n) - self.swa_count.load(float(num), self.session) - - def snap_save(self): - # Save a snapshot of all the variables in the current graph. - if not hasattr(self, 'snap_save_op'): - save_ops = [] - rest_ops = [] - for var in self.weights: - if isinstance(var, str): - var = tf.compat.v1.get_default_graph().get_tensor_by_name(var) - name = var.name.split(':')[0] - v = tf.Variable(var, name='save/'+name, trainable=False) - save_ops.append(tf.compat.v1.assign(v, var)) - rest_ops.append(tf.compat.v1.assign(var, v)) - self.snap_save_op = tf.group(*save_ops) - self.snap_restore_op = tf.group(*rest_ops) - self.session.run(self.snap_save_op) - - def snap_restore(self): - # Restore variables in the current graph from the snapshot. - self.session.run(self.snap_restore_op) - def save_swa_weights_v2(self, filename): backup = [w.read_value() for w in self.model.weights] for (swa, w) in zip(self.swa_weights, self.model.weights): @@ -979,12 +581,6 @@ def save_swa_weights_v2(self, filename): for (old, w) in zip(backup, self.model.weights): w.assign(old) - def save_swa_weights(self, filename): - self.snap_save() - self.session.run(self.swa_load_op) - self.save_leelaz_weights(filename) - self.snap_restore() - def save_leelaz_weights_v2(self, filename): all_tensors = [] all_weights = [] @@ -1055,52 +651,6 @@ def save_leelaz_weights_v2(self, filename): self.net.fill_net(all_weights) self.net.save_proto(filename) - def save_leelaz_weights(self, filename): - all_weights = [] - if not hasattr(self, 'pb_save_op'): - all_evals = [] - for weights in self.weights: - work_weights = None - if weights.shape.ndims == 4: - # Convolution weights need a transpose - # - # TF (kYXInputOutput) - # [filter_height, filter_width, in_channels, out_channels] - # - # Leela/cuDNN/Caffe (kOutputInputYX) - # [output, input, filter_size, filter_size] - work_weights = tf.transpose(a=weights, perm=[3, 2, 0, 1]) - elif weights.shape.ndims == 2: - # Fully connected layers are [in, out] in TF - # - # [out, in] in Leela - # - work_weights = tf.transpose(a=weights, perm=[1, 0]) - else: - # Biases, batchnorm etc - work_weights = weights - all_evals.append(work_weights) - self.pb_save_op = all_evals - nparrays = self.session.run(self.pb_save_op) - for e, nparray in enumerate(nparrays): - # Rescale rule50 related weights as clients do not normalize the input. 
- if e == 0: - num_inputs = 112 - # 50 move rule is the 110th input, or 109 starting from 0. - rule50_input = 109 - wt_flt = [] - for i, weight in enumerate(np.ravel(nparray)): - if (i % (num_inputs*9))//9 == rule50_input: - wt_flt.append(weight/99) - else: - wt_flt.append(weight) - else: - wt_flt = [wt for wt in np.ravel(nparray)] - all_weights.append(wt_flt) - - self.net.fill_net(all_weights) - self.net.save_proto(filename) - def batch_norm_v2(self, input, scale=False): if self.renorm_enabled: clipping = { From 7cbe7754c522424bf6cafac5bc7ccb86a1470932 Mon Sep 17 00:00:00 2001 From: Tilps Date: Wed, 11 Dec 2019 20:50:53 +1100 Subject: [PATCH 26/39] Readd basic update ratios. --- tf/tfprocess.py | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/tf/tfprocess.py b/tf/tfprocess.py index 6b5173a2..542d5444 100644 --- a/tf/tfprocess.py +++ b/tf/tfprocess.py @@ -355,8 +355,9 @@ def process_v2(self, batch_size, test_batches, batch_splits=1): self.lr = self.lr * tf.cast(steps + 1, tf.float32) / self.warmup_steps # need to add 1 to steps because steps will be incremented after gradient update - #if (steps + 1) % self.cfg['training']['train_avg_report_steps'] == 0 or (steps + 1) % self.cfg['training']['total_steps'] == 0: - # before_weights = self.session.run(self.weights) + if (steps + 1) % self.cfg['training']['train_avg_report_steps'] == 0 or (steps + 1) % self.cfg['training']['total_steps'] == 0: + before_weights = [w.read_value() for w in self.model.weights] + # Run training for this batch grads = None @@ -405,9 +406,7 @@ def process_v2(self, batch_size, test_batches, batch_splits=1): pol_loss_w * avg_policy_loss + val_loss_w * avg_value_loss + avg_reg_term, speed)) - #after_weights = self.session.run(self.weights) - #update_ratio_summaries = self.compute_update_ratio( - # before_weights, after_weights) + after_weights = [w.read_value() for w in self.model.weights] with self.train_writer.as_default(): tf.summary.scalar("Policy Loss", avg_policy_loss, step=steps) tf.summary.scalar("Value Loss", avg_value_loss, step=steps) @@ -415,8 +414,9 @@ def process_v2(self, batch_size, test_batches, batch_splits=1): tf.summary.scalar("LR", self.lr, step=steps) tf.summary.scalar("Gradient norm", grad_norm / batch_splits, step=steps) tf.summary.scalar("MSE Loss", avg_mse_loss, step=steps) + self.compute_update_ratio_v2( + before_weights, after_weights, steps) self.train_writer.flush() - #self.train_writer.add_summary(update_ratio_summaries, steps) self.time_start = time_end self.last_steps = steps self.avg_policy_loss, self.avg_value_loss, self.avg_mse_loss, self.avg_reg_term = [], [], [], [] @@ -517,6 +517,22 @@ def calculate_test_summaries_v2(self, test_batches, steps): print("step {}, policy={:g} value={:g} policy accuracy={:g}% value accuracy={:g}% mse={:g}".\ format(steps, sum_policy, sum_value, sum_policy_accuracy, sum_value_accuracy, sum_mse)) + def compute_update_ratio_v2(self, before_weights, after_weights, steps): + """Compute the ratio of gradient norm to weight norm. 
+ + Adapted from https://github.com/tensorflow/minigo/blob/c923cd5b11f7d417c9541ad61414bf175a84dc31/dual_net.py#L567 + """ + deltas = [after - before for after, + before in zip(after_weights, before_weights)] + delta_norms = [np.linalg.norm(d.numpy().ravel()) for d in deltas] + weight_norms = [np.linalg.norm(w.numpy().ravel()) for w in before_weights] + ratios = [(tensor.name, d / w) for d, w, tensor in zip(delta_norms, weight_norms, self.model.weights) if not 'moving' in tensor.name and w != 0.] + for name, ratio in ratios: + tf.summary.scalar('update_ratios/' + name, ratio, step=steps) + #ratios = np.log10([r for (_, r) in ratios if 0 < r < np.inf]) + #all_summaries.append(self.log_histogram('update_ratios_log10', ratios)) + #return tf.compat.v1.Summary(value=all_summaries) + def compute_update_ratio(self, before_weights, after_weights): """Compute the ratio of gradient norm to weight norm. From 47208df63494f6f5d1d62261d838605a99bddcc8 Mon Sep 17 00:00:00 2001 From: Tilps Date: Wed, 11 Dec 2019 21:02:19 +1100 Subject: [PATCH 27/39] Small performance optimization to offset the cost of update ratios. --- tf/tfprocess.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tf/tfprocess.py b/tf/tfprocess.py index 542d5444..6214cf2a 100644 --- a/tf/tfprocess.py +++ b/tf/tfprocess.py @@ -295,6 +295,10 @@ def process_loop_v2(self, batch_size, test_batches, batch_splits=1): for _ in range(steps % total_steps, total_steps): self.process_v2(batch_size, test_batches, batch_splits=batch_splits) + @tf.function() + def read_weights(self): + return [w.read_value() for w in self.model.weights] + @tf.function() def process_inner_loop(self): print('tracing inner loop!') @@ -356,7 +360,7 @@ def process_v2(self, batch_size, test_batches, batch_splits=1): # need to add 1 to steps because steps will be incremented after gradient update if (steps + 1) % self.cfg['training']['train_avg_report_steps'] == 0 or (steps + 1) % self.cfg['training']['total_steps'] == 0: - before_weights = [w.read_value() for w in self.model.weights] + before_weights = self.read_weights() # Run training for this batch @@ -406,7 +410,7 @@ def process_v2(self, batch_size, test_batches, batch_splits=1): pol_loss_w * avg_policy_loss + val_loss_w * avg_value_loss + avg_reg_term, speed)) - after_weights = [w.read_value() for w in self.model.weights] + after_weights = self.read_weights() with self.train_writer.as_default(): tf.summary.scalar("Policy Loss", avg_policy_loss, step=steps) tf.summary.scalar("Value Loss", avg_value_loss, step=steps) @@ -447,7 +451,7 @@ def process_v2(self, batch_size, test_batches, batch_splits=1): print("SWA Weights saved in file: {}".format(swa_path)) def calculate_swa_summaries_v2(self, test_batches, steps): - backup = [w.read_value() for w in self.model.weights] + backup = self.read_weights() for (swa, w) in zip(self.swa_weights, self.model.weights): w.assign(swa.read_value()) true_test_writer, self.test_writer = self.test_writer, self.swa_writer @@ -590,7 +594,7 @@ def update_swa_v2(self): self.swa_count.assign(min(num + 1., self.swa_max_n)) def save_swa_weights_v2(self, filename): - backup = [w.read_value() for w in self.model.weights] + backup = self.read_weights() for (swa, w) in zip(self.swa_weights, self.model.weights): w.assign(swa.read_value()) self.save_leelaz_weights_v2(filename) From d22b9b8f1d77231b2c4954073d37332a1a369b20 Mon Sep 17 00:00:00 2001 From: Tilps Date: Wed, 11 Dec 2019 23:53:39 +1100 Subject: [PATCH 28/39] Optimize compute update ratio since adding 
update_ratio_log10 had a noticeable performance impact. --- tf/tfprocess.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tf/tfprocess.py b/tf/tfprocess.py index 6214cf2a..0c5da63f 100644 --- a/tf/tfprocess.py +++ b/tf/tfprocess.py @@ -521,6 +521,7 @@ def calculate_test_summaries_v2(self, test_batches, steps): print("step {}, policy={:g} value={:g} policy accuracy={:g}% value accuracy={:g}% mse={:g}".\ format(steps, sum_policy, sum_value, sum_policy_accuracy, sum_value_accuracy, sum_mse)) + @tf.function() def compute_update_ratio_v2(self, before_weights, after_weights, steps): """Compute the ratio of gradient norm to weight norm. @@ -528,14 +529,14 @@ def compute_update_ratio_v2(self, before_weights, after_weights, steps): """ deltas = [after - before for after, before in zip(after_weights, before_weights)] - delta_norms = [np.linalg.norm(d.numpy().ravel()) for d in deltas] - weight_norms = [np.linalg.norm(w.numpy().ravel()) for w in before_weights] - ratios = [(tensor.name, d / w) for d, w, tensor in zip(delta_norms, weight_norms, self.model.weights) if not 'moving' in tensor.name and w != 0.] + delta_norms = [tf.math.reduce_euclidean_norm(d) for d in deltas] + weight_norms = [tf.math.reduce_euclidean_norm(w) for w in before_weights] + ratios = [(tensor.name, tf.cond(w != 0., lambda: d / w, lambda: -1.)) for d, w, tensor in zip(delta_norms, weight_norms, self.model.weights) if not 'moving' in tensor.name] for name, ratio in ratios: tf.summary.scalar('update_ratios/' + name, ratio, step=steps) - #ratios = np.log10([r for (_, r) in ratios if 0 < r < np.inf]) - #all_summaries.append(self.log_histogram('update_ratios_log10', ratios)) - #return tf.compat.v1.Summary(value=all_summaries) + # Filtering is hard, so just push infinities/NaNs to an unreasonably large value. + ratios = [tf.cond(r > 0, lambda: tf.math.log(r) / 2.30258509299, lambda: 200.) for (_, r) in ratios] + tf.summary.histogram('update_ratios_log10', tf.stack(ratios), buckets=1000, step=steps) def compute_update_ratio(self, before_weights, after_weights): """Compute the ratio of gradient norm to weight norm. From ef0636550c9238ed5fc5495f7c277fd5b9a17722 Mon Sep 17 00:00:00 2001 From: Tilps Date: Thu, 12 Dec 2019 14:45:36 +1100 Subject: [PATCH 29/39] Some more cleanup, add weight histograms back. 
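
For reference, TF2 summaries are written imperatively against a file writer,
which is why the old merged-summary and histogram plumbing can go. A minimal
sketch of the pattern this patch adopts (the writer path, `model` and
`global_step` names are illustrative, not from the repo):

    writer = tf.summary.create_file_writer("leelalogs/example-test")
    with writer.as_default():
        for w in model.weights:
            # One histogram per variable, tagged with the variable name.
            tf.summary.histogram(w.name, w, buckets=1000, step=global_step)
    writer.flush()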
--- tf/tfprocess.py | 63 ++----------------------------------------------- 1 file changed, 2 insertions(+), 61 deletions(-) diff --git a/tf/tfprocess.py b/tf/tfprocess.py index 0c5da63f..3bcae0ff 100644 --- a/tf/tfprocess.py +++ b/tf/tfprocess.py @@ -55,14 +55,6 @@ def call(self, inputs): h_conv_pol_flat = tf.reshape(inputs, [-1, 80*8*8]) return tf.matmul(h_conv_pol_flat, tf.cast(self.fc1, h_conv_pol_flat.dtype)) - -def bias_variable(shape, name=None, dtype=tf.float32): - return tf.Variable(tf.compat.v1.zeros_initializer()(shape, dtype), name=name) - -def conv2d(x, W): - return tf.nn.conv2d(input=x, filters=W, data_format='NCHW', - strides=[1, 1, 1, 1], padding='SAME') - class TFProcess: def __init__(self, cfg): self.cfg = cfg @@ -513,11 +505,10 @@ def calculate_test_summaries_v2(self, test_batches, steps): tf.summary.scalar("Policy Accuracy", sum_policy_accuracy, step=steps) if self.wdl: tf.summary.scalar("Value Accuracy", sum_value_accuracy, step=steps) + for w in self.model.weights: + tf.summary.histogram(w.name, w, buckets=1000, step=steps) self.test_writer.flush() - #test_summaries = tf.compat.v1.summary.merge( - # [test_summaries] + self.histograms).eval(session=self.session) - #self.test_writer.add_summary(test_summaries, steps) print("step {}, policy={:g} value={:g} policy accuracy={:g}% value accuracy={:g}% mse={:g}".\ format(steps, sum_policy, sum_value, sum_policy_accuracy, sum_value_accuracy, sum_mse)) @@ -538,56 +529,6 @@ def compute_update_ratio_v2(self, before_weights, after_weights, steps): ratios = [tf.cond(r > 0, lambda: tf.math.log(r) / 2.30258509299, lambda: 200.) for (_, r) in ratios] tf.summary.histogram('update_ratios_log10', tf.stack(ratios), buckets=1000, step=steps) - def compute_update_ratio(self, before_weights, after_weights): - """Compute the ratio of gradient norm to weight norm. - - Adapted from https://github.com/tensorflow/minigo/blob/c923cd5b11f7d417c9541ad61414bf175a84dc31/dual_net.py#L567 - """ - deltas = [after - before for after, - before in zip(after_weights, before_weights)] - delta_norms = [np.linalg.norm(d.ravel()) for d in deltas] - weight_norms = [np.linalg.norm(w.ravel()) for w in before_weights] - ratios = [(tensor.name, d / w) for d, w, tensor in zip(delta_norms, weight_norms, self.weights) if not 'moving' in tensor.name] - all_summaries = [ - tf.compat.v1.Summary.Value(tag='update_ratios/' + - name, simple_value=ratio) - for name, ratio in ratios] - ratios = np.log10([r for (_, r) in ratios if 0 < r < np.inf]) - all_summaries.append(self.log_histogram('update_ratios_log10', ratios)) - return tf.compat.v1.Summary(value=all_summaries) - - def log_histogram(self, tag, values, bins=1000): - """Logs the histogram of a list/vector of values. 
- - From https://gist.github.com/gyglim/1f8dfb1b5c82627ae3efcfbbadb9f514 - """ - # Convert to a numpy array - values = np.array(values) - - # Create histogram using numpy - counts, bin_edges = np.histogram(values, bins=bins) - - # Fill fields of histogram proto - hist = tf.compat.v1.HistogramProto() - hist.min = float(np.min(values)) - hist.max = float(np.max(values)) - hist.num = int(np.prod(values.shape)) - hist.sum = float(np.sum(values)) - hist.sum_squares = float(np.sum(values**2)) - - # Requires equal number as bins, where the first goes from -DBL_MAX to bin_edges[1] - # See https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/summary.proto#L30 - # Thus, we drop the start of the first bin - bin_edges = bin_edges[1:] - - # Add bin edges and counts - for edge in bin_edges: - hist.bucket_limit.append(edge) - for c in counts: - hist.bucket.append(c) - - return tf.compat.v1.Summary.Value(tag=tag, histo=hist) - def update_swa_v2(self): num = self.swa_count.read_value() for (w, swa) in zip(self.model.weights, self.swa_weights): From 753b91897beca3ac362d86f1d78993feda8c3103 Mon Sep 17 00:00:00 2001 From: Tilps Date: Thu, 12 Dec 2019 16:05:51 +1100 Subject: [PATCH 30/39] Fix net saving for renorm mode. --- tf/tfprocess.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tf/tfprocess.py b/tf/tfprocess.py index 3bcae0ff..1491c7af 100644 --- a/tf/tfprocess.py +++ b/tf/tfprocess.py @@ -565,6 +565,16 @@ def save_leelaz_weights_v2(self, filename): # work_weights = tf.transpose(a=weights, perm=[1, 0]) else: + # batch renorm has extra weights, but we don't know what to do with them. + if 'renorm' in weights.name: + continue + # renorm has variance, but it is not the primary source of truth + if 'variance:' in weights.name and self.renorm_enabled: + continue + # Renorm has moving stddev not variance, undo the transform to make it compatible. + if 'stddev:' in weights.name: + all_tensors.append(tf.math.square(weights) - 1e-5) + continue # Biases, batchnorm etc # pb expects every batch norm to have gammas, but not all of our # batch norms have gammas, so manually add pretend gammas. From 72f7cb30d4844b006b35dd0f7b6934eab3c24a87 Mon Sep 17 00:00:00 2001 From: Tilps Date: Thu, 12 Dec 2019 17:32:55 +1100 Subject: [PATCH 31/39] Add net_to_model and update_steps support. Remove more unsupported code, including upgrade.py which doesn't have an obvious transition. 
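
Note on the mixprec.py removal: the fp32 variable-storage getter and
LossScalingOptimizer were TF1 graph-mode workarounds. Under the
'mixed_float16' Keras policy already set in __init__, variables are stored in
float32 while computation runs in float16, so the custom getter has nothing
left to do; if dynamic loss scaling is wanted it would presumably come from
the Keras mixed-precision LossScaleOptimizer rather than this hand-rolled
wrapper. A minimal sketch of the policy behaviour (the layer and shapes here
are illustrative):

    tf.keras.mixed_precision.experimental.set_policy('mixed_float16')
    layer = tf.keras.layers.Dense(8)
    y = layer(tf.zeros([1, 4]))
    # y.dtype == tf.float16 (compute dtype)
    # layer.kernel.dtype == tf.float32 (storage dtype)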
--- tf/mixprec.py | 48 ------------------------ tf/net_to_model.py | 19 +++------- tf/tfprocess.py | 92 ++++++++++++++++++++++++++++++++++++---------- tf/update_steps.py | 21 ++--------- tf/upgrade.py | 73 ------------------------------------ 5 files changed, 81 insertions(+), 172 deletions(-) delete mode 100644 tf/mixprec.py delete mode 100644 tf/upgrade.py diff --git a/tf/mixprec.py b/tf/mixprec.py deleted file mode 100644 index cf161ac0..00000000 --- a/tf/mixprec.py +++ /dev/null @@ -1,48 +0,0 @@ -import tensorflow as tf - - -def float32_variable_storage_getter(getter, name, shape=None, dtype=None, - initializer=None, regularizer=None, - trainable=True, - *args, **kwargs): - """Custom variable getter that forces trainable variables to be stored in - float32 precision and then casts them to the training precision.""" - storage_dtype = tf.float32 if trainable else dtype - variable = getter(name, shape, dtype=storage_dtype, - initializer=initializer, - regularizer=regularizer, - trainable=trainable, - *args, **kwargs) - if trainable and dtype != tf.float32: - cast_name = name + '/fp16_cast' - try: - cast_variable = tf.compat.v1.get_default_graph().get_tensor_by_name( - cast_name + ':0') - except KeyError: - cast_variable = tf.cast(variable, dtype, name=cast_name) - cast_variable._ref = variable._ref - variable = cast_variable - return variable - - -class LossScalingOptimizer(tf.compat.v1.train.Optimizer): - """An optimizer that scales loss and un-scales gradients.""" - - def __init__(self, optimizer, - scale=None, - name="LossScalingOptimizer", - use_locking=False): - super(LossScalingOptimizer, self).__init__( - name=name, use_locking=use_locking) - self._optimizer = optimizer - self._scale = float(scale) if scale is not None else 1.0 - - def compute_gradients(self, loss, var_list=None, *args, **kwargs): - if self._scale != 1.0: - loss = tf.scalar_mul(self._scale, loss) - gradvar = self._optimizer.compute_gradients(loss, var_list, *args, **kwargs) - gradvar = [(tf.scalar_mul(1. 
/ self._scale, g), v) for g, v in gradvar] - return gradvar - - def apply_gradients(self, *args, **kwargs): - return self._optimizer.apply_gradients(*args, **kwargs) diff --git a/tf/net_to_model.py b/tf/net_to_model.py index 8d3190d6..1d3059b0 100755 --- a/tf/net_to_model.py +++ b/tf/net_to_model.py @@ -27,22 +27,13 @@ raise ValueError("Number of blocks in YAML doesn't match the network") weights = net.get_weights() -x = [ - tf.compat.v1.placeholder(tf.float32, [None, 112, 8*8]), - tf.compat.v1.placeholder(tf.float32, [None, 1858]), - tf.compat.v1.placeholder(tf.float32, [None, 3]), - tf.compat.v1.placeholder(tf.float32, [None, 3]), - ] - tfp = tfprocess.TFProcess(cfg) -tfp.init_net(x) -tfp.replace_weights(weights) -update_global_step = tfp.global_step.assign(START_FROM) -tfp.session.run(update_global_step) +tfp.init_net_v2() +tfp.replace_weights_v2(weights) +tfp.global_step.assign(START_FROM) root_dir = os.path.join(cfg['training']['path'], cfg['name']) if not os.path.exists(root_dir): os.makedirs(root_dir) -path = os.path.join(root_dir, cfg['name']) -save_path = tfp.saver.save(tfp.session, path, global_step=START_FROM) -print("Wrote model to {}".format(root_dir)) +tfp.manager.save() +print("Wrote model to {}".format(tfp.manager.latest_checkpoint)) diff --git a/tf/tfprocess.py b/tf/tfprocess.py index 1491c7af..1d9a44ea 100644 --- a/tf/tfprocess.py +++ b/tf/tfprocess.py @@ -24,7 +24,6 @@ import bisect import lc0_az_policy_map import proto.net_pb2 as pb -from mixprec import float32_variable_storage_getter, LossScalingOptimizer from net import Net @@ -126,18 +125,14 @@ def __init__(self, cfg): self.global_step = tf.Variable(0, name='global_step', trainable=False, dtype=tf.int64) def init_v2(self, train_dataset, test_dataset): - self.l2reg = tf.keras.regularizers.l2(l=0.5 * (0.0001)) self.train_dataset = train_dataset self.train_iter = iter(train_dataset) self.test_dataset = test_dataset self.test_iter = iter(test_dataset) self.init_net_v2() - self.checkpoint = tf.train.Checkpoint(optimizer=self.orig_optimizer, model=self.model, global_step=self.global_step, swa_count=self.swa_count) - self.checkpoint.listed = self.swa_weights - self.manager = tf.train.CheckpointManager( - self.checkpoint, directory=self.root_dir, max_to_keep=50, keep_checkpoint_every_n_hours=24) def init_net_v2(self): + self.l2reg = tf.keras.regularizers.l2(l=0.5 * (0.0001)) input_var = tf.keras.Input(shape=(112, 8*8)) x_planes = tf.keras.layers.Reshape([112, 8, 8])(input_var) self.model = tf.keras.Model(inputs=input_var, outputs=self.construct_net_v2(x_planes)) @@ -230,19 +225,58 @@ def accuracy(target, output): if self.swa_enabled: self.swa_writer = tf.summary.create_file_writer( os.path.join(os.getcwd(), "leelalogs/{}-swa-test".format(self.cfg['name']))) + self.checkpoint = tf.train.Checkpoint(optimizer=self.orig_optimizer, model=self.model, global_step=self.global_step, swa_count=self.swa_count) + self.checkpoint.listed = self.swa_weights + self.manager = tf.train.CheckpointManager( + self.checkpoint, directory=self.root_dir, max_to_keep=50, keep_checkpoint_every_n_hours=24) + + def replace_weights_v2(self, new_weights_orig): + new_weights = [w for w in new_weights_orig] + # self.model.weights ordering doesn't match up nicely, so first shuffle the new weights to match up. 
+        # input order is (for convolutional policy):
+        # policy conv
+        # policy bn * 4
+        # policy raw conv and bias
+        # value conv
+        # value bn * 4
+        # value dense with bias
+        # value dense with bias
+        #
+        # output order is (for convolutional policy):
+        # value conv
+        # policy conv
+        # value bn * 4
+        # policy bn * 4
+        # policy raw conv and bias
+        # value dense with bias
+        # value dense with bias
+        new_weights[-5] = new_weights_orig[-10]
+        new_weights[-6] = new_weights_orig[-11]
+        new_weights[-7] = new_weights_orig[-12]
+        new_weights[-8] = new_weights_orig[-13]
+        new_weights[-9] = new_weights_orig[-14]
+        new_weights[-10] = new_weights_orig[-15]
+        new_weights[-11] = new_weights_orig[-5]
+        new_weights[-12] = new_weights_orig[-6]
+        new_weights[-13] = new_weights_orig[-7]
+        new_weights[-14] = new_weights_orig[-8]
+        new_weights[-15] = new_weights_orig[-16]
+        new_weights[-16] = new_weights_orig[-9]
 
-    def replace_weights(self, new_weights):
         all_evals = []
-        for e, weights in enumerate(self.weights):
+        offset = 0
+        last_was_gamma = False
+        for e, weights in enumerate(self.model.weights):
+            source_idx = e+offset
             if weights.shape.ndims == 4:
                 # Rescale rule50 related weights as clients do not normalize the input.
                 if e == 0:
                     num_inputs = 112
                     # 50 move rule is the 110th input, or 109 starting from 0.
                     rule50_input = 109
-                for i in range(len(new_weights[e])):
+                for i in range(len(new_weights[source_idx])):
                     if (i % (num_inputs*9))//9 == rule50_input:
-                        new_weights[e][i] = new_weights[e][i]*99
+                        new_weights[source_idx][i] = new_weights[source_idx][i]*99
 
                 # Convolution weights need a transpose
                 #
                 # TF (kYXInputOutput)
                 # [filter_height, filter_width, in_channels, out_channels]
                 #
                 # Leela/cuDNN/Caffe (kOutputInputYX)
                 # [output, input, filter_size, filter_size]
                 s = weights.shape.as_list()
                 shape = [s[i] for i in [3, 2, 0, 1]]
-                new_weight = tf.constant(new_weights[e], shape=shape)
-                all_evals.append(weights.assign(
-                    tf.transpose(a=new_weight, perm=[2, 3, 1, 0])))
+                new_weight = tf.constant(new_weights[source_idx], shape=shape)
+                weights.assign(
+                    tf.transpose(a=new_weight, perm=[2, 3, 1, 0]))
             elif weights.shape.ndims == 2:
                 # Fully connected layers are [in, out] in TF
                 #
                 # [out, in] in Leela
                 #
                 s = weights.shape.as_list()
                 shape = [s[i] for i in [1, 0]]
-                new_weight = tf.constant(new_weights[e], shape=shape)
-                all_evals.append(weights.assign(
-                    tf.transpose(a=new_weight, perm=[1, 0])))
+                new_weight = tf.constant(new_weights[source_idx], shape=shape)
+                weights.assign(
+                    tf.transpose(a=new_weight, perm=[1, 0]))
             else:
+                # Can't populate renorm weights, but the current new_weight will be needed elsewhere.
+                if 'renorm' in weights.name:
+                    offset-=1
+                    continue
+                # betas without gammas need to skip the gamma in the input.
+                if 'beta:' in weights.name and not last_was_gamma:
+                    source_idx+=1
+                    offset+=1
                 # Biases, batchnorm etc
-                new_weight = tf.constant(new_weights[e], shape=weights.shape)
-                all_evals.append(tf.compat.v1.assign(weights, new_weight))
-        self.session.run(all_evals)
+                new_weight = tf.constant(new_weights[source_idx], shape=weights.shape)
+                if 'stddev:' in weights.name:
+                    weights.assign(tf.math.sqrt(new_weight + 1e-5))
+                else:
+                    weights.assign(new_weight)
+                # need to use the variance to also populate the stddev for renorm, so adjust offset.
+                if 'variance:' in weights.name and self.renorm_enabled:
+                    offset-=1
+            last_was_gamma = 'gamma:' in weights.name
+        # Replace the SWA weights as well, ensuring swa accumulation is reset.
+ if self.swa_enabled: + self.swa_count.assign(tf.constant(0.)) + self.update_swa_v2() # This should result in identical file to the starting one - # self.save_leelaz_weights('restored.txt') + # self.save_leelaz_weights_v2('restored.pb.gz') def restore_v2(self): if self.manager.latest_checkpoint is not None: diff --git a/tf/update_steps.py b/tf/update_steps.py index 49f357f2..f4740bce 100644 --- a/tf/update_steps.py +++ b/tf/update_steps.py @@ -16,28 +16,15 @@ def main(cmd): if not os.path.exists(root_dir): os.makedirs(root_dir) - x = [ - tf.compat.v1.placeholder(tf.float32, [None, 112, 8*8]), - tf.compat.v1.placeholder(tf.float32, [None, 1858]), - tf.compat.v1.placeholder(tf.float32, [None, 3]), - tf.compat.v1.placeholder(tf.float32, [None, 3]), - ] - tfprocess = TFProcess(cfg) - tfprocess.init_net(x) + tfprocess.init_net_v2() - if os.path.exists(os.path.join(root_dir, 'checkpoint')): - cp = tf.train.latest_checkpoint(root_dir) - tfprocess.restore(cp) + tfprocess.restore_v2() START_FROM = cmd.start - update_global_step = tfprocess.global_step.assign(START_FROM) - tfprocess.session.run(update_global_step) - path = os.path.join(root_dir, cfg['name']) - save_path = tfprocess.saver.save(tfprocess.session, path, global_step=START_FROM) - - tfprocess.session.close() + tfprocess.global_step.assign(START_FROM) + tfprocess.manager.save() if __name__ == "__main__": argparser = argparse.ArgumentParser(description=\ diff --git a/tf/upgrade.py b/tf/upgrade.py deleted file mode 100644 index 49e3c11d..00000000 --- a/tf/upgrade.py +++ /dev/null @@ -1,73 +0,0 @@ -#!/usr/bin/env python3 -import argparse -import os -import yaml -import sys -import tensorflow as tf -from tfprocess import TFProcess - -START_FROM = 0 - -def main(cmd): - cfg = yaml.safe_load(cmd.cfg.read()) - print(yaml.dump(cfg, default_flow_style=False)) - - root_dir = os.path.join(cfg['training']['path'], cfg['name']) - if not os.path.exists(root_dir): - os.makedirs(root_dir) - - x = [ - tf.compat.v1.placeholder(tf.float32, [None, 112, 8*8]), - tf.compat.v1.placeholder(tf.float32, [None, 1858]), - tf.compat.v1.placeholder(tf.float32, [None, 3]), - tf.compat.v1.placeholder(tf.float32, [None, 3]), - ] - - tfprocess = TFProcess(cfg) - tfprocess.init_net(x) - - if os.path.exists(os.path.join(root_dir, 'checkpoint')): - cp = tf.train.latest_checkpoint(root_dir) - reader = tf.compat.v1.train.NewCheckpointReader(cp) - saved_shapes = reader.get_variable_to_shape_map() - new_names = sorted( - [var.name.split(':')[0] for var in tf.compat.v1.global_variables() - if var.name.split(':')[0] not in saved_shapes]) - for saved_var_name in new_names: - print("New name {} will use default value".format(saved_var_name)) - var_names = sorted( - [(var.name, var.name.split(':')[0]) for var in tf.compat.v1.global_variables() - if var.name.split(':')[0] in saved_shapes]) - restore_vars = [] - restore_names = [] - for var_name, saved_var_name in var_names: - curr_var = tf.compat.v1.get_default_graph().get_tensor_by_name(var_name) - var_shape = curr_var.get_shape().as_list() - if var_shape == saved_shapes[saved_var_name]: - restore_vars.append(curr_var) - restore_names.append(saved_var_name) - else: - print("Dropping {} due to shape change".format(saved_var_name)) - legacy_names = sorted( - [name for name in saved_shapes.keys() - if name not in restore_names]) - for saved_var_name in legacy_names: - print("Dropping {} as no longer used".format(saved_var_name)) - opt_saver = tf.compat.v1.train.Saver(restore_vars) - opt_saver.restore(tfprocess.session, cp) - else: - 
print("No checkpoint to upgrade!") - exit(1) - - steps = tf.compat.v1.train.global_step(tfprocess.session, tfprocess.global_step) - path = os.path.join(root_dir, cfg['name']) - save_path = tfprocess.saver.save(tfprocess.session, path, global_step=steps) - tfprocess.session.close() - -if __name__ == "__main__": - argparser = argparse.ArgumentParser(description=\ - 'Convert current checkpoint to new training script or incompatible training parameters.') - argparser.add_argument('--cfg', type=argparse.FileType('r'), - help='yaml configuration with training parameters') - - main(argparser.parse_args()) From 9f7defc733c29a29240660c14ac2ffe37d5b969f Mon Sep 17 00:00:00 2001 From: Tilps Date: Thu, 12 Dec 2019 20:01:06 +1100 Subject: [PATCH 32/39] Some more cleanup. --- tf/train.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/tf/train.py b/tf/train.py index a4c98462..f15b7230 100755 --- a/tf/train.py +++ b/tf/train.py @@ -125,7 +125,6 @@ def main(cmd): train_parser.parse, output_types=(tf.string, tf.string, tf.string, tf.string)) train_dataset = train_dataset.map(ChunkParser.parse_function) train_dataset = train_dataset.prefetch(4) - #train_iterator = tf.compat.v1.data.make_one_shot_iterator(train_dataset) shuffle_size = int(shuffle_size*(1.0-train_ratio)) test_parser = ChunkParser(FileDataSrc(test_chunks), @@ -134,14 +133,9 @@ def main(cmd): test_parser.parse, output_types=(tf.string, tf.string, tf.string, tf.string)) test_dataset = test_dataset.map(ChunkParser.parse_function) test_dataset = test_dataset.prefetch(4) - #test_iterator = tf.compat.v1.data.make_one_shot_iterator(test_dataset) - #tfprocess.init(test_dataset, train_iterator, test_iterator) tfprocess.init_v2(train_dataset, test_dataset) - #if os.path.exists(os.path.join(root_dir, 'checkpoint')): - # cp = tf.train.latest_checkpoint(root_dir) - # tfprocess.restore(cp) tfprocess.restore_v2() # If number of test positions is not given @@ -153,7 +147,6 @@ def main(cmd): num_evals = max(1, num_evals // ChunkParser.BATCH_SIZE) print("Using {} evaluation batches".format(num_evals)) - #tfprocess.process_loop(total_batch_size, num_evals, batch_splits=batch_splits) tfprocess.process_loop_v2(total_batch_size, num_evals, batch_splits=batch_splits) if cmd.output is not None: @@ -162,7 +155,6 @@ def main(cmd): else: tfprocess.save_leelaz_weights_v2(cmd.output) - #tfprocess.session.close() train_parser.shutdown() test_parser.shutdown() From 7823f0d3e5c57f1e11af703733ff8d9f48978c01 Mon Sep 17 00:00:00 2001 From: Tilps Date: Thu, 12 Dec 2019 23:21:46 +1100 Subject: [PATCH 33/39] More cleanup. --- tf/tfprocess.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tf/tfprocess.py b/tf/tfprocess.py index 1d9a44ea..377f40ea 100644 --- a/tf/tfprocess.py +++ b/tf/tfprocess.py @@ -27,10 +27,6 @@ from net import Net -# Bias weights for layers not followed by BatchNorm -# We do not regularlize biases, so they are not -# added to the regularlizer collection - class ApplySqueezeExcitation(tf.keras.layers.Layer): def __init__(self, **kwargs): super(ApplySqueezeExcitation, self).__init__(**kwargs) From 5f4d8c44de7ebe4929e8c700dae90afb1cde5428 Mon Sep 17 00:00:00 2001 From: Tilps Date: Thu, 12 Dec 2019 23:23:21 +1100 Subject: [PATCH 34/39] Small fix needed to make script work in ubuntu. 
---
 tf/tfprocess.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tf/tfprocess.py b/tf/tfprocess.py
index 377f40ea..ce24fdd8 100644
--- a/tf/tfprocess.py
+++ b/tf/tfprocess.py
@@ -115,6 +115,7 @@ def __init__(self, cfg):
 
         gpus = tf.config.experimental.list_physical_devices('GPU')
         tf.config.experimental.set_visible_devices(gpus[self.cfg['gpu']], 'GPU')
+        tf.config.experimental.set_memory_growth(gpus[self.cfg['gpu']], True)
         if self.model_dtype == tf.float16:
             tf.keras.mixed_precision.experimental.set_policy('mixed_float16')
 

From da94d7635f85bca0ba46f3a9460d719a42225293 Mon Sep 17 00:00:00 2001
From: Tilps
Date: Fri, 13 Dec 2019 08:00:40 +1100
Subject: [PATCH 35/39] Explicitly add the training flags to model calls - as
 apparently needed.

---
 tf/tfprocess.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tf/tfprocess.py b/tf/tfprocess.py
index ce24fdd8..d5c5f2e1 100644
--- a/tf/tfprocess.py
+++ b/tf/tfprocess.py
@@ -345,7 +345,7 @@ def process_inner_loop(self):
         print('tracing inner loop!')
         x, y, z, q = next(self.train_iter)
         with tf.GradientTape() as tape:
-            policy, value = self.model(x)
+            policy, value = self.model(x, training=True)
             policy_loss = self.policy_loss_fn(y, policy)
             reg_term = sum(self.model.losses)
             if self.wdl:
@@ -506,7 +506,7 @@ def calculate_swa_summaries_v2(self, test_batches, steps):
     def calculate_test_summaries_inner_loop(self):
         print('tracing summaries inner loop!')
         x, y, z, q = next(self.test_iter)
-        policy, value = self.model(x)
+        policy, value = self.model(x, training=False)
         policy_loss = self.policy_loss_fn(y, policy)
         policy_accuracy = self.accuracy_fn(y, policy)
         if self.wdl:

From 20fe10a72e053648ae7da119c9545c090225b797 Mon Sep 17 00:00:00 2001
From: Tilps
Date: Fri, 13 Dec 2019 08:18:57 +1100
Subject: [PATCH 36/39] More cleanup.

---
 tf/tfprocess.py | 20 +++++++++-----------
 tf/train.py     |  1 -
 2 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/tf/tfprocess.py b/tf/tfprocess.py
index d5c5f2e1..ef141294 100644
--- a/tf/tfprocess.py
+++ b/tf/tfprocess.py
@@ -133,11 +133,11 @@ def init_net_v2(self):
         input_var = tf.keras.Input(shape=(112, 8*8))
         x_planes = tf.keras.layers.Reshape([112, 8, 8])(input_var)
         self.model = tf.keras.Model(inputs=input_var, outputs=self.construct_net_v2(x_planes))
-        self.swa_count = None
+        # swa_count initialized regardless to make checkpoint code simpler.
+ self.swa_count = tf.Variable(0., name='swa_count', trainable=False) self.swa_weights = None if self.swa_enabled: # Count of networks accumulated into SWA - self.swa_count = tf.Variable(0., name='swa_count', trainable=False) self.swa_weights = [tf.Variable(w, trainable=False) for w in self.model.weights] self.active_lr = 0.01 @@ -342,7 +342,6 @@ def read_weights(self): @tf.function() def process_inner_loop(self): - print('tracing inner loop!') x, y, z, q = next(self.train_iter) with tf.GradientTape() as tape: policy, value = self.model(x, training=True) @@ -504,7 +503,6 @@ def calculate_swa_summaries_v2(self, test_batches, steps): @tf.function() def calculate_test_summaries_inner_loop(self): - print('tracing summaries inner loop!') x, y, z, q = next(self.test_iter) policy, value = self.model(x, training=False) policy_loss = self.policy_loss_fn(y, policy) @@ -645,14 +643,14 @@ def save_leelaz_weights_v2(self, filename): permuted_tensors[-8] = all_tensors[-14] permuted_tensors[-9] = all_tensors[-16] permuted_tensors[-10] = all_tensors[-5] - permuted_tensors[-11] = all_tensors[-6] - permuted_tensors[-12] = all_tensors[-7] - permuted_tensors[-13] = all_tensors[-8] - permuted_tensors[-14] = all_tensors[-9] - permuted_tensors[-15] = all_tensors[-10] - permuted_tensors[-16] = all_tensors[-15] + permuted_tensors[-11] = all_tensors[-6] + permuted_tensors[-12] = all_tensors[-7] + permuted_tensors[-13] = all_tensors[-8] + permuted_tensors[-14] = all_tensors[-9] + permuted_tensors[-15] = all_tensors[-10] + permuted_tensors[-16] = all_tensors[-15] all_tensors = permuted_tensors - + for e, nparray in enumerate(all_tensors): # Rescale rule50 related weights as clients do not normalize the input. if e == 0: diff --git a/tf/train.py b/tf/train.py index f15b7230..ee76b390 100755 --- a/tf/train.py +++ b/tf/train.py @@ -25,7 +25,6 @@ import random import multiprocessing as mp import tensorflow as tf -#tf.compat.v1.disable_v2_behavior() from tfprocess import TFProcess from chunkparser import ChunkParser From 3ac88fa7aa0926c0d9a6e405e22e4919a107cd74 Mon Sep 17 00:00:00 2001 From: Tilps Date: Fri, 13 Dec 2019 16:44:53 +1100 Subject: [PATCH 37/39] Pull iterator advancement out of the tf.function loops, its not supposed to work there. Even if it was working just fine... this seems slightly faster. 
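
The motivation: a tf.function traces Python code once, so calling next() on
an eager iterator inside the traced body relies at best on special-case
capture. Driving the iterator from eager code and passing tensors in keeps
the traced function pure. The shape of the change, schematically (names
illustrative):

    @tf.function
    def train_step(x, y, z, q):          # traced; sees only tensors
        ...

    for _ in range(batch_splits):
        x, y, z, q = next(train_iter)    # iterator advanced eagerly
        train_step(x, y, z, q)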
--- tf/tfprocess.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tf/tfprocess.py b/tf/tfprocess.py index ef141294..a923feb9 100644 --- a/tf/tfprocess.py +++ b/tf/tfprocess.py @@ -341,8 +341,7 @@ def read_weights(self): return [w.read_value() for w in self.model.weights] @tf.function() - def process_inner_loop(self): - x, y, z, q = next(self.train_iter) + def process_inner_loop(self, x, y, z, q): with tf.GradientTape() as tape: policy, value = self.model(x, training=True) policy_loss = self.policy_loss_fn(y, policy) @@ -406,7 +405,8 @@ def process_v2(self, batch_size, test_batches, batch_splits=1): # Run training for this batch grads = None for _ in range(batch_splits): - policy_loss, value_loss, mse_loss, reg_term, new_grads = self.process_inner_loop() + x, y, z, q = next(self.train_iter) + policy_loss, value_loss, mse_loss, reg_term, new_grads = self.process_inner_loop(x, y, z, q) if not grads: grads = new_grads else: @@ -502,8 +502,7 @@ def calculate_swa_summaries_v2(self, test_batches, steps): w.assign(old) @tf.function() - def calculate_test_summaries_inner_loop(self): - x, y, z, q = next(self.test_iter) + def calculate_test_summaries_inner_loop(self, x, y, z, q): policy, value = self.model(x, training=False) policy_loss = self.policy_loss_fn(y, policy) policy_accuracy = self.accuracy_fn(y, policy) @@ -524,7 +523,8 @@ def calculate_test_summaries_v2(self, test_batches, steps): sum_policy = 0 sum_value = 0 for _ in range(0, test_batches): - policy_loss, value_loss, mse_loss, policy_accuracy, value_accuracy = self.calculate_test_summaries_inner_loop() + x, y, z, q = next(self.test_iter) + policy_loss, value_loss, mse_loss, policy_accuracy, value_accuracy = self.calculate_test_summaries_inner_loop(x, y, z, q) sum_policy_accuracy += policy_accuracy sum_mse += mse_loss sum_policy += policy_loss From 681ab7429f8411cc8a9e339cd3bcdbb00ae162f9 Mon Sep 17 00:00:00 2001 From: Tilps Date: Fri, 13 Dec 2019 17:43:28 +1100 Subject: [PATCH 38/39] Add experimental dataset loader. --- tf/train.py | 74 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 62 insertions(+), 12 deletions(-) diff --git a/tf/train.py b/tf/train.py index ee76b390..d452f03c 100755 --- a/tf/train.py +++ b/tf/train.py @@ -83,6 +83,43 @@ def next(self): except: print("failed to parse {}".format(filename)) +def extract_inputs_outputs(raw): + # first 4 bytes in each batch entry are boring. + # Next 7432 are easy, policy extraction. + policy = tf.io.decode_raw(tf.strings.substr(raw, 4, 7432), tf.float32) + # Next are 104 bit packed chess boards, they have to be expanded. + bit_planes = tf.expand_dims(tf.reshape(tf.io.decode_raw(tf.strings.substr(raw, 7436, 832), tf.uint8), [-1, 104, 8]), -1) + bit_planes = tf.bitwise.bitwise_and(tf.tile(bit_planes, [1, 1, 1, 8]), [128, 64, 32, 16, 8, 4, 2, 1]) + bit_planes = tf.minimum(1., tf.cast(bit_planes, tf.float32)) + # Next 5 planes are 1 or 0 to indicate 8x8 of 1 or 0. + unit_planes = tf.expand_dims(tf.expand_dims(tf.io.decode_raw(tf.strings.substr(raw, 8268, 5), tf.uint8), -1), -1) + unit_planes = tf.cast(tf.tile(unit_planes, [1, 1, 8, 8]), tf.float32) + # rule50 count plane. + rule50_plane = tf.expand_dims(tf.expand_dims(tf.io.decode_raw(tf.strings.substr(raw, 8273, 1), tf.uint8), -1), -1) + rule50_plane = tf.cast(tf.tile(rule50_plane, [1, 1, 8, 8]), tf.float32) + rule50_plane = tf.divide(rule50_plane, 99.) 
+ # zero plane and one plane + zero_plane = tf.zeros_like(rule50_plane) + one_plane = tf.ones_like(rule50_plane) + inputs = tf.reshape(tf.concat([bit_planes, unit_planes, rule50_plane, zero_plane, one_plane], 1), [-1, 112, 64]) + + # winner is stored in one signed byte and needs to be converted to one hot. + winner = tf.cast(tf.io.decode_raw(tf.strings.substr(raw, 8275, 1), tf.int8), tf.float32) + winner = tf.tile(winner, [1,3]) + z = tf.cast(tf.equal(winner, [1., 0., -1.]), tf.float32) + + # Outcome distribution needs to be calculated from q and d. + best_q = tf.io.decode_raw(tf.strings.substr(raw, 8280, 4), tf.float32) + best_d = tf.io.decode_raw(tf.strings.substr(raw, 8288, 4), tf.float32) + best_q_w = 0.5 * (1.0 - best_d + best_q) + best_q_l = 0.5 * (1.0 - best_d - best_q) + + q = tf.concat([best_q_w, best_d, best_q_l], 1) + + return (inputs, policy, z, q) + +def sample(x): + return tf.math.equal(tf.random.uniform([], 0, SKIP-1, dtype=tf.int32), 0) def main(cmd): cfg = yaml.safe_load(cmd.cfg.read()) @@ -91,6 +128,7 @@ def main(cmd): num_chunks = cfg['dataset']['num_chunks'] allow_less = cfg['dataset'].get('allow_less_chunks', False) train_ratio = cfg['dataset']['train_ratio'] + experimental_parser = cfg['dataset'].get('experimental_v4_only_dataset', False) num_train = int(num_chunks*train_ratio) num_test = num_chunks - num_train if 'input_test' in cfg['dataset']: @@ -118,20 +156,32 @@ def main(cmd): os.makedirs(root_dir) tfprocess = TFProcess(cfg) - train_parser = ChunkParser(FileDataSrc(train_chunks), - shuffle_size=shuffle_size, sample=SKIP, batch_size=ChunkParser.BATCH_SIZE) - train_dataset = tf.data.Dataset.from_generator( - train_parser.parse, output_types=(tf.string, tf.string, tf.string, tf.string)) - train_dataset = train_dataset.map(ChunkParser.parse_function) - train_dataset = train_dataset.prefetch(4) + if experimental_parser: + train_dataset = tf.data.Dataset.from_tensor_slices(train_chunks).shuffle(len(train_chunks)).repeat()\ + .interleave(lambda x: tf.data.FixedLengthRecordDataset(x, 8292, compression_type='GZIP', num_parallel_reads=1).filter(sample), num_parallel_calls=tf.data.experimental.AUTOTUNE)\ + .shuffle(shuffle_size)\ + .batch(split_batch_size).map(extract_inputs_outputs).prefetch(4) + else: + train_parser = ChunkParser(FileDataSrc(train_chunks), + shuffle_size=shuffle_size, sample=SKIP, batch_size=ChunkParser.BATCH_SIZE) + train_dataset = tf.data.Dataset.from_generator( + train_parser.parse, output_types=(tf.string, tf.string, tf.string, tf.string)) + train_dataset = train_dataset.map(ChunkParser.parse_function) + train_dataset = train_dataset.prefetch(4) shuffle_size = int(shuffle_size*(1.0-train_ratio)) - test_parser = ChunkParser(FileDataSrc(test_chunks), - shuffle_size=shuffle_size, sample=SKIP, batch_size=ChunkParser.BATCH_SIZE) - test_dataset = tf.data.Dataset.from_generator( - test_parser.parse, output_types=(tf.string, tf.string, tf.string, tf.string)) - test_dataset = test_dataset.map(ChunkParser.parse_function) - test_dataset = test_dataset.prefetch(4) + if experimental_parser: + test_dataset = tf.data.Dataset.from_tensor_slices(test_chunks).shuffle(len(test_chunks)).repeat()\ + .interleave(lambda x: tf.data.FixedLengthRecordDataset(x, 8292, compression_type='GZIP', num_parallel_reads=1).filter(sample), num_parallel_calls=tf.data.experimental.AUTOTUNE)\ + .shuffle(shuffle_size)\ + .batch(split_batch_size).map(extract_inputs_outputs).prefetch(4) + else: + test_parser = ChunkParser(FileDataSrc(test_chunks), + shuffle_size=shuffle_size, sample=SKIP, 
batch_size=ChunkParser.BATCH_SIZE) + test_dataset = tf.data.Dataset.from_generator( + test_parser.parse, output_types=(tf.string, tf.string, tf.string, tf.string)) + test_dataset = test_dataset.map(ChunkParser.parse_function) + test_dataset = test_dataset.prefetch(4) tfprocess.init_v2(train_dataset, test_dataset) From 121545f93feae224293db8fad32b5752691c0bc2 Mon Sep 17 00:00:00 2001 From: Tilps Date: Fri, 13 Dec 2019 20:28:16 +1100 Subject: [PATCH 39/39] Some micro-optimizations. --- tf/tfprocess.py | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/tf/tfprocess.py b/tf/tfprocess.py index a923feb9..cc9d516e 100644 --- a/tf/tfprocess.py +++ b/tf/tfprocess.py @@ -360,6 +360,10 @@ def process_inner_loop(self, x, y, z, q): value_loss = self.value_loss_fn(self.qMix(z, q), value) return policy_loss, value_loss, mse_loss, reg_term, tape.gradient(total_loss, self.model.trainable_weights) + @tf.function() + def add_lists(self, x, y): + return [tf.math.add(a, b) for (a, b) in zip(x, y)] + def process_v2(self, batch_size, test_batches, batch_splits=1): if not self.time_start: self.time_start = time.time() @@ -410,7 +414,7 @@ def process_v2(self, batch_size, test_batches, batch_splits=1): if not grads: grads = new_grads else: - grads = [tf.math.add(a, b) for (a, b) in zip(grads, new_grads)] + grads = self.add_lists(grads, new_grads) # Keep running averages # Google's paper scales MSE by 1/4 to a [0, 1] range, so do the same to # get comparable values. @@ -490,16 +494,25 @@ def process_v2(self, batch_size, test_batches, batch_splits=1): self.save_swa_weights_v2(swa_path) print("SWA Weights saved in file: {}".format(swa_path)) - def calculate_swa_summaries_v2(self, test_batches, steps): + @tf.function() + def switch_to_swa(self): backup = self.read_weights() for (swa, w) in zip(self.swa_weights, self.model.weights): w.assign(swa.read_value()) + return backup + + @tf.function() + def restore_weights(self, backup): + for (old, w) in zip(backup, self.model.weights): + w.assign(old) + + def calculate_swa_summaries_v2(self, test_batches, steps): + backup = self.switch_to_swa() true_test_writer, self.test_writer = self.test_writer, self.swa_writer print('swa', end=' ') self.calculate_test_summaries_v2(test_batches, steps) self.test_writer = true_test_writer - for (old, w) in zip(backup, self.model.weights): - w.assign(old) + self.restore_weights(backup) @tf.function() def calculate_test_summaries_inner_loop(self, x, y, z, q): @@ -576,19 +589,17 @@ def compute_update_ratio_v2(self, before_weights, after_weights, steps): ratios = [tf.cond(r > 0, lambda: tf.math.log(r) / 2.30258509299, lambda: 200.) for (_, r) in ratios] tf.summary.histogram('update_ratios_log10', tf.stack(ratios), buckets=1000, step=steps) + @tf.function() def update_swa_v2(self): num = self.swa_count.read_value() for (w, swa) in zip(self.model.weights, self.swa_weights): swa.assign(swa.read_value() * (num / (num + 1.)) + w.read_value() * (1. / (num + 1.))) - self.swa_count.assign(min(num + 1., self.swa_max_n)) + self.swa_count.assign(tf.math.minimum(num + 1., self.swa_max_n)) def save_swa_weights_v2(self, filename): - backup = self.read_weights() - for (swa, w) in zip(self.swa_weights, self.model.weights): - w.assign(swa.read_value()) + backup = self.switch_to_swa() self.save_leelaz_weights_v2(filename) - for (old, w) in zip(backup, self.model.weights): - w.assign(old) + self.restore_weights(backup) def save_leelaz_weights_v2(self, filename): all_tensors = []
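
A closing note on the packed-board expansion in the experimental parser
(PATCH 38): each of the 104 bitboard planes arrives as 8 packed bytes, and
tf.bitwise.bitwise_and against the mask vector [128, 64, ..., 1] broadcasts
every byte against its eight bit positions. A self-contained sketch of the
trick, with an arbitrary example byte:

    import tensorflow as tf

    byte = tf.constant([[0b10100001]], tf.uint8)       # one packed bitboard row
    bits = tf.bitwise.bitwise_and(
        tf.tile(tf.expand_dims(byte, -1), [1, 1, 8]),  # repeat the byte 8 times
        [128, 64, 32, 16, 8, 4, 2, 1])                 # keep a different bit in each copy
    planes = tf.minimum(1., tf.cast(bits, tf.float32)) # squash 128/32/... down to 1.0
    # planes.numpy() -> [[[1., 0., 1., 0., 0., 0., 0., 1.]]]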