Merge pull request LeelaChessZero#3 from mooskagh/master
Merge commits from last week
mooskagh authored Jun 1, 2018
2 parents 5054269 + baf7a86 commit 42a9a61
Showing 9 changed files with 767 additions and 71 deletions.
674 changes: 674 additions & 0 deletions COPYING

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion meson.build
@@ -158,7 +158,7 @@ if get_option('build_backends')
if has_blas

blas_files = [
'src/neural/transforms.cc',
'src/neural/CL/transforms.cc',
'src/neural/network_blas.cc'
]

84 changes: 57 additions & 27 deletions src/mcts/search.cc
@@ -46,7 +46,9 @@ const char* Search::kFpuReductionStr = "First Play Urgency Reduction";
const char* Search::kCacheHistoryLengthStr =
"Length of history to include in cache";
const char* Search::kExtraVirtualLossStr = "Extra virtual loss";
const char* Search::KPolicySoftmaxTempStr = "Policy softmax temperature";
const char* Search::kPolicySoftmaxTempStr = "Policy softmax temperature";
const char* Search::kAllowedNodeCollisionsStr =
"Allowed node collisions, per batch";

namespace {
const int kSmartPruningToleranceNodes = 100;
@@ -70,7 +72,10 @@ void Search::PopulateUciParams(OptionsParser* options) {
"cache-history-length") = 7;
options->Add<FloatOption>(kExtraVirtualLossStr, 0.0, 100.0,
"extra-virtual-loss") = 0.0f;
options->Add<FloatOption>(KPolicySoftmaxTempStr, 0.1, 10.0, "policy-softmax-temp") = 1.0f;
options->Add<FloatOption>(kPolicySoftmaxTempStr, 0.1, 10.0,
"policy-softmax-temp") = 1.0f;
options->Add<IntOption>(kAllowedNodeCollisionsStr, 0, 1024,
"allowed-node-collisions") = 32;
}

Search::Search(const NodeTree& tree, Network* network,
@@ -98,7 +103,8 @@ Search::Search(const NodeTree& tree, Network* network,
kFpuReduction(options.Get<float>(kFpuReductionStr)),
kCacheHistoryLength(options.Get<int>(kCacheHistoryLengthStr)),
kExtraVirtualLoss(options.Get<float>(kExtraVirtualLossStr)),
KPolicySoftmaxTemp(options.Get<float>(KPolicySoftmaxTempStr)) {}
kPolicySoftmaxTemp(options.Get<float>(kPolicySoftmaxTempStr)),
kAllowedNodeCollisions(options.Get<int>(kAllowedNodeCollisionsStr)) {}

// Returns whether node was already in cache.
bool Search::AddNodeToCompute(Node* node, CachingComputation* computation,
@@ -160,22 +166,40 @@ void ApplyDirichletNoise(Node* node, float eps, double alpha) {
} // namespace

void Search::Worker() {
// Nodes to be extended / to have their counters updated.
std::vector<Node*> nodes_to_process;
// Nodes for which a collision happened. For those, only n_in_flight has to
// be rolled back.
std::vector<Node*> node_collisions;
PositionHistory history(played_history_);

// Exit check is at the end of the loop as at least one iteration is
// necessary.
while (true) {
nodes_to_process.clear();
node_collisions.clear();
auto computation = CachingComputation(network_->NewComputation(), cache_);

// Gather nodes to process in the current batch.
for (int i = 0; i < kMiniBatchSize; ++i) {
while (static_cast<int>(nodes_to_process.size()) < kMiniBatchSize) {
// Initialize position sequence with pre-move position.
history.Trim(played_history_.GetLength());
// If there's something to do without touching the slow neural net, do it.
if (i > 0 && computation.GetCacheMisses() == 0) break;
Node* node = PickNodeToExtend(root_node_, &history);
if (!nodes_to_process.empty() && computation.GetCacheMisses() == 0) break;
// Returns a <Node, whether it's computable> pair. The node is not
// computable if there is a collision.
auto node_and_computable = PickNodeToExtend(root_node_, &history);
Node* node = node_and_computable.first;
const bool computable = node_and_computable.second;

// If there is a collision, add the node to a vector to undo its virtual loss later.
if (!computable) {
node_collisions.emplace_back(node);
if (static_cast<int>(node_collisions.size()) > kAllowedNodeCollisions)
break;
continue;
}

// If we hit a node that is already being processed (by our batch or by
// another thread), stop gathering and process a smaller batch.
if (!node) break;
@@ -194,6 +218,7 @@ void Search::Worker() {
}
}

// TODO(mooskagh) Remove prefetch into cache if node collisions work well.
// If there are requests to NN, but the batch is not full, try to prefetch
// nodes which are likely useful in future.
if (computation.GetCacheMisses() > 0 &&
@@ -219,8 +244,8 @@ void Search::Worker() {
for (Node* n : node->Children()) {
float p = computation.GetPVal(idx_in_computation,
n->GetMove().as_nn_index());
if(KPolicySoftmaxTemp != 1.0f){
p = pow(p, 1/KPolicySoftmaxTemp);
if (kPolicySoftmaxTemp != 1.0f) {
p = pow(p, 1 / kPolicySoftmaxTemp);
}
total += p;
n->SetP(p);
@@ -270,6 +295,14 @@ void Search::Worker() {
}
}
total_playouts_ += nodes_to_process.size();

// Remove virtual loss from node collisions.
for (Node* node : node_collisions) {
for (node = node->GetParent(); node != root_node_->GetParent();
node = node->GetParent()) {
node->CancelScoreUpdate();
}
}
}
UpdateRemainingMoves(); // Update remaining moves using smart pruning.
MaybeOutputInfo();
@@ -375,8 +408,8 @@ Node* GetBestChild(Node* parent) {
// * If that number is larger than 0, the one with the larger eval wins.
std::tuple<int, float, float> best(-1, 0.0, 0.0);
for (Node* node : parent->Children()) {
std::tuple<int, float, float> val(node->GetNStarted(),
node->GetQ(-10.0, 0.0), node->GetP());
std::tuple<int, float, float> val(node->GetN(), node->GetQ(-10.0, 0.0),
node->GetP());
if (val > best) {
best = val;
best_node = node;
@@ -391,7 +424,7 @@ Node* GetBestChildWithTemperature(Node* parent, float temperature) {
const float n_parent = parent->GetN();

for (Node* node : parent->Children()) {
sum += std::pow(node->GetNStarted() / n_parent, 1 / temperature);
sum += std::pow(node->GetN() / n_parent, 1 / temperature);
cumulative_sums.push_back(sum);
}

@@ -459,9 +492,8 @@ void Search::SendMovesStats() const {
for (Node* iter : root_node_->Children()) {
nodes.emplace_back(iter);
}
std::sort(nodes.begin(), nodes.end(), [](const Node* a, const Node* b) {
return a->GetNStarted() < b->GetNStarted();
});
std::sort(nodes.begin(), nodes.end(),
[](const Node* a, const Node* b) { return a->GetN() < b->GetN(); });

const bool is_black_to_move = played_history_.IsBlackToMove();
ThinkingInfo info;
@@ -610,12 +642,15 @@ void Search::ExtendNode(Node* node, const PositionHistory& history) {
for (const auto& move : legal_moves) node->CreateChild(move);
}

Node* Search::PickNodeToExtend(Node* node, PositionHistory* history) {
// Returns node and whether it should be processed.
// (false if it is a collision).
std::pair<Node*, bool> Search::PickNodeToExtend(Node* node,
PositionHistory* history) {
// Fetch the current best root node visits for possible smart pruning.
int best_node_n = 0;
{
SharedMutex::Lock lock(nodes_mutex_);
if (best_move_node_) best_node_n = best_move_node_->GetNStarted();
if (best_move_node_) best_node_n = best_move_node_->GetN();
}

// True on first iteration, false as we dive deeper.
@@ -624,17 +659,9 @@ Node* Search::PickNodeToExtend(Node* node, PositionHistory* history) {
{
SharedMutex::Lock lock(nodes_mutex_);
// Check whether we are at a leaf.
if (!node->TryStartScoreUpdate()) {
// The node is currently being processed by another thread.
// Undo the increments of ancestor nodes, and return null.
for (node = node->GetParent(); node != root_node_->GetParent();
node = node->GetParent()) {
node->CancelScoreUpdate();
}
return nullptr;
}
if (!node->TryStartScoreUpdate()) return {node, false};
// Found a leaf, and we are the first to visit it.
if (!node->HasChildren()) return node;
if (!node->HasChildren()) return {node, true};
}

// Now that we are not at a leaf, we need to go deeper.
Expand All @@ -655,7 +682,8 @@ Node* Search::PickNodeToExtend(Node* node, PositionHistory* history) {
// To ensure we have at least one node to expand, always include
// current best node.
if (iter != best_move_node_ &&
remaining_playouts_ < best_node_n - iter->GetNStarted()) {
remaining_playouts_ <
best_node_n - static_cast<int>(iter->GetN())) {
continue;
}
++possible_moves;
@@ -709,10 +737,12 @@ std::pair<Move, Move> Search::GetBestMoveInternal() const
: GetBestChild(root_node_);

Move ponder_move;
/* // Doesn't seem to work for now, so disabling.
if (best_node->HasChildren()) {
ponder_move =
GetBestChild(best_node)->GetMove(!played_history_.IsBlackToMove());
}
*/
return {best_node->GetMove(played_history_.IsBlackToMove()), ponder_move};
}

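Note on the batching change above: the new Worker() loop keeps picking leaves until the mini-batch is full, records picks that hit an already-claimed leaf as collisions, stops early once more than kAllowedNodeCollisions of them pile up, and afterwards rolls the virtual loss back off the collided paths. The following is a minimal, self-contained sketch of that idea; the types and helpers here (Node, PickNodeToExtend, GatherBatch) are simplified stand-ins, not the actual lc0 classes.

#include <utility>
#include <vector>

// Simplified stand-in for the search tree node used above.
struct Node {
  Node* parent = nullptr;
  std::vector<Node*> children;
  int n_in_flight = 0;     // Virtual-loss style "visits in flight" counter.
  bool evaluated = false;  // True once a network result has been applied.

  // Succeeds unless this is an unevaluated leaf that another pick already
  // claimed in this batch; that situation is the "collision".
  bool TryStartScoreUpdate() {
    if (children.empty() && !evaluated && n_in_flight > 0) return false;
    ++n_in_flight;
    return true;
  }
  void CancelScoreUpdate() { --n_in_flight; }
};

// Walks from the root towards a leaf, incrementing n_in_flight along the way.
// Returns {node, computable}; computable == false signals a collision.
// (Real PUCT selection is out of scope; this just descends to the least-busy
// child.)
std::pair<Node*, bool> PickNodeToExtend(Node* node) {
  while (true) {
    if (!node->TryStartScoreUpdate()) return {node, false};
    if (node->children.empty()) return {node, true};
    Node* next = node->children.front();
    for (Node* child : node->children) {
      if (child->n_in_flight < next->n_in_flight) next = child;
    }
    node = next;
  }
}

void GatherBatch(Node* root, int mini_batch_size, int allowed_collisions,
                 std::vector<Node*>* nodes_to_process,
                 std::vector<Node*>* node_collisions) {
  while (static_cast<int>(nodes_to_process->size()) < mini_batch_size) {
    auto picked = PickNodeToExtend(root);
    if (!picked.second) {
      node_collisions->push_back(picked.first);
      if (static_cast<int>(node_collisions->size()) > allowed_collisions) break;
      continue;
    }
    nodes_to_process->push_back(picked.first);
  }
  // After the batch is evaluated, collided picks give back the virtual loss
  // they left on their ancestors (the collided leaf itself never incremented
  // its own counter, so the walk starts at the parent).
  for (Node* node : *node_collisions) {
    for (Node* n = node->parent; n != nullptr; n = n->parent) {
      n->CancelScoreUpdate();
    }
  }
}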
8 changes: 5 additions & 3 deletions src/mcts/search.h
@@ -84,7 +84,8 @@ class Search {
static const char* kFpuReductionStr;
static const char* kCacheHistoryLengthStr;
static const char* kExtraVirtualLossStr;
static const char* KPolicySoftmaxTempStr;
static const char* kPolicySoftmaxTempStr;
static const char* kAllowedNodeCollisionsStr;

private:
// Can run several copies of it in separate threads.
@@ -104,7 +105,7 @@

void SendUciInfo(); // Requires nodes_mutex_ to be held.

Node* PickNodeToExtend(Node* node, PositionHistory* history);
std::pair<Node*, bool> PickNodeToExtend(Node* node, PositionHistory* history);
void ExtendNode(Node* node, const PositionHistory& history);

mutable Mutex counters_mutex_ ACQUIRED_AFTER(nodes_mutex_);
@@ -156,7 +157,8 @@ class Search {
const float kFpuReduction;
const bool kCacheHistoryLength;
const float kExtraVirtualLoss;
const float KPolicySoftmaxTemp;
const float kPolicySoftmaxTemp;
const int kAllowedNodeCollisions;
};

} // namespace lczero
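The kPolicySoftmaxTemp option declared here, and applied in the search.cc hunk above as p = pow(p, 1 / kPolicySoftmaxTemp), reshapes the network's move priors before normalization: temperatures above 1 flatten the policy, temperatures below 1 sharpen it. A small stand-alone sketch of that transformation follows; ApplyPolicySoftmaxTemp is a hypothetical helper for illustration, not an lc0 function.

#include <cmath>
#include <vector>

// Raise each raw prior to the power 1/T, then renormalize so the priors sum
// to 1, mirroring the loop over children in Search::Worker() above.
std::vector<float> ApplyPolicySoftmaxTemp(std::vector<float> priors,
                                          float temperature) {
  float total = 0.0f;
  for (float& p : priors) {
    if (temperature != 1.0f) p = std::pow(p, 1.0f / temperature);
    total += p;
  }
  if (total > 0.0f) {
    for (float& p : priors) p /= total;  // Renormalize to a distribution.
  }
  return priors;
}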
29 changes: 14 additions & 15 deletions src/neural/transforms.cc → src/neural/CL/transforms.cc
@@ -27,9 +27,9 @@
namespace lczero {

std::vector<float> Transforms::ZeropadU(const std::vector<float>& U,
const int outputs, const int channels,
const int outputs_pad,
const int channels_pad) {
const int outputs, const int channels,
const int outputs_pad,
const int channels_pad) {
// Fill with zeroes
auto Upad = std::vector<float>(kWinogradTile * outputs_pad * channels_pad);

@@ -50,8 +50,8 @@ std::vector<float> Transforms::ZeropadU(const std::vector<float>& U,
}

std::vector<float> Transforms::WinogradTransformF(const std::vector<float>& f,
const int outputs,
const int channels) {
const int outputs,
const int channels) {
// F(2x2, 3x3) Winograd filter transformation
// transpose(G.dot(f).dot(G.transpose()))
// U matrix is transposed for better memory layout in SGEMM
@@ -89,7 +89,7 @@ std::vector<float> Transforms::WinogradTransformF(const std::vector<float>& f,
}

void Transforms::WinogradTransformIn(const std::vector<float>& in,
std::vector<float>& V, const int C) {
std::vector<float>& V, const int C) {
constexpr auto W = 8;
constexpr auto H = 8;
constexpr auto wtiles = (W + 1) / 2;
@@ -173,8 +173,8 @@ void Transforms::WinogradTransformIn(const std::vector<float>& in,
}

void Transforms::WinogradSgemm(const std::vector<float>& U,
std::vector<float>& V, std::vector<float>& M,
const int C, const int K) {
std::vector<float>& V, std::vector<float>& M,
const int C, const int K) {
constexpr auto P = 8 * 8 / kWinogradAlpha;

for (auto b = 0; b < kWinogradTile; b++) {
@@ -188,7 +188,7 @@ void Transforms::WinogradSgemm(const std::vector<float>& U,
}

void Transforms::WinogradTransformOut(const std::vector<float>& M,
std::vector<float>& Y, const int K) {
std::vector<float>& Y, const int K) {
constexpr auto W = 8;
constexpr auto H = 8;
constexpr auto wtiles = (W + 1) / 2;
@@ -247,11 +247,10 @@ void Transforms::WinogradTransformOut(const std::vector<float>& M,
}

void Transforms::WinogradConvolve3(const int outputs,
const std::vector<float>& input,
const std::vector<float>& U,
std::vector<float>& V,
std::vector<float>& M,
std::vector<float>& output) {
const std::vector<float>& input,
const std::vector<float>& U,
std::vector<float>& V, std::vector<float>& M,
std::vector<float>& output) {
constexpr unsigned int filter_len = kWinogradAlpha * kWinogradAlpha;
const auto input_channels = U.size() / (outputs * filter_len);

@@ -419,7 +418,7 @@ void Transforms::OffsetBatchNormMeans(std::vector<float>& bn_means,
// still have non-zero biases.
// Move biases to batchnorm means to make the output match without having
// to separately add the biases.
for (auto i = 0; i < bn_means.size(); i++) bn_means[i] -= biases[i];
for (size_t i = 0; i < bn_means.size(); i++) bn_means[i] -= biases[i];
}

void Transforms::InvertBatchNormStddev(std::vector<float>& weights) {
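The OffsetBatchNormMeans hunk above only changes the loop index type, but the trick the function implements is worth spelling out: with a batchnorm of the form (x + bias - mean) * inv_stddev, subtracting the convolution bias from the stored mean (mean' = mean - bias) produces the same output with no separate bias add. A tiny self-contained check of that identity with made-up numbers (scale and shift terms omitted):

#include <cassert>
#include <cmath>

int main() {
  // One channel, one activation value.
  const float x = 0.7f, bias = 0.25f, mean = 0.1f, inv_stddev = 2.0f;
  // Naive path: add the conv bias, then batchnorm with the original mean.
  const float with_bias_add = (x + bias - mean) * inv_stddev;
  // Folded path: batchnorm with the adjusted mean, no bias add needed.
  const float folded_mean = mean - bias;
  const float with_folded_mean = (x - folded_mean) * inv_stddev;
  assert(std::fabs(with_bias_add - with_folded_mean) < 1e-6f);
  return 0;
}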
File renamed without changes.
23 changes: 7 additions & 16 deletions src/neural/network_blas.cc
@@ -16,9 +16,9 @@
along with Leela Chess. If not, see <http://www.gnu.org/licenses/>.
*/

#include "neural/network.h"
#include "neural/CL/transforms.h"
#include "neural/factory.h"
#include "neural/transforms.h"
#include "neural/network.h"

#include <algorithm>
#include <cassert>
@@ -84,17 +84,8 @@ class BlasComputation : public NetworkComputation {
constexpr int height = 8;
constexpr int tiles = width * height / 4;

/*
static constexpr int NUM_VALUE_INPUT_PLANES = 32;
static constexpr int NUM_POLICY_INPUT_PLANES = 32;
static constexpr int NUM_OUTPUT_POLICY = 1858;
static constexpr int NUM_VALUE_CHANNELS = 128;
*/

int NUM_VALUE_INPUT_PLANES = weights_.value.bn_means.size();
int NUM_POLICY_INPUT_PLANES = weights_.policy.bn_means.size();
int NUM_OUTPUT_POLICY = weights_.ip_pol_b.size();
int NUM_VALUE_CHANNELS = weights_.ip1_val_b.size();

static constexpr auto kWinogradAlpha = 4;
static constexpr auto kWinogradTile = kWinogradAlpha * kWinogradAlpha;
@@ -116,7 +107,7 @@ class BlasComputation : public NetworkComputation {
std::vector<float> value_data(NUM_VALUE_INPUT_PLANES * width * height);

Transforms::WinogradConvolve3(output_channels, input,
weights_.input.weights, V, M, conv_out);
weights_.input.weights, V, M, conv_out);
Transforms::Batchnorm<64>(output_channels, conv_out,
weights_.input.bn_means.data(),
weights_.input.bn_stddivs.data());
@@ -132,15 +123,15 @@ class BlasComputation : public NetworkComputation {
std::copy(begin(conv_in), end(conv_in), begin(res));

Transforms::WinogradConvolve3(output_channels, conv_in, conv1.weights, V,
M, conv_out);
M, conv_out);
Transforms::Batchnorm<64>(output_channels, conv_out,
conv1.bn_means.data(), conv1.bn_stddivs.data());

auto& conv2 = residual.conv2;
output_channels = conv2.biases.size();
std::swap(conv_out, conv_in);
Transforms::WinogradConvolve3(output_channels, conv_in, conv2.weights, V,
M, conv_out);
M, conv_out);
Transforms::Batchnorm<64>(output_channels, conv_out,
conv2.bn_means.data(), conv2.bn_stddivs.data(),
res.data());
@@ -214,7 +205,7 @@ class BlasNetwork : public Network {
Transforms::InvertBatchNormStddev(input_batchnorm_stddivs);

// residual blocks
for (auto i = 0; i < residual_blocks; i++) {
for (size_t i = 0; i < residual_blocks; i++) {
auto& residual = weights_.residual[i];
auto& conv1 = residual.conv1;
auto& conv2 = residual.conv2;
@@ -275,4 +266,4 @@

REGISTER_NETWORK("blas", BlasNetwork, 50)

} // namespace lc0
} // namespace lczero
