Merge pull request LeelaChessZero#3 from mooskagh/master
Merge commits from last week
mooskagh authored Jun 1, 2018
2 parents 5054269 + baf7a86 commit 42a9a61
Showing 9 changed files with 767 additions and 71 deletions.
674 changes: 674 additions & 0 deletions COPYING

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion meson.build
@@ -158,7 +158,7 @@ if get_option('build_backends')
if has_blas

blas_files = [
'src/neural/transforms.cc',
'src/neural/CL/transforms.cc',
'src/neural/network_blas.cc'
]

84 changes: 57 additions & 27 deletions src/mcts/search.cc
@@ -46,7 +46,9 @@ const char* Search::kFpuReductionStr = "First Play Urgency Reduction";
const char* Search::kCacheHistoryLengthStr =
"Length of history to include in cache";
const char* Search::kExtraVirtualLossStr = "Extra virtual loss";
const char* Search::KPolicySoftmaxTempStr = "Policy softmax temperature";
const char* Search::kPolicySoftmaxTempStr = "Policy softmax temperature";
const char* Search::kAllowedNodeCollisionsStr =
"Allowed node collisions, per batch";

namespace {
const int kSmartPruningToleranceNodes = 100;
@@ -70,7 +72,10 @@ void Search::PopulateUciParams(OptionsParser* options) {
"cache-history-length") = 7;
options->Add<FloatOption>(kExtraVirtualLossStr, 0.0, 100.0,
"extra-virtual-loss") = 0.0f;
options->Add<FloatOption>(KPolicySoftmaxTempStr, 0.1, 10.0, "policy-softmax-temp") = 1.0f;
options->Add<FloatOption>(kPolicySoftmaxTempStr, 0.1, 10.0,
"policy-softmax-temp") = 1.0f;
options->Add<IntOption>(kAllowedNodeCollisionsStr, 0, 1024,
"allowed-node-collisions") = 32;
}

Search::Search(const NodeTree& tree, Network* network,
@@ -98,7 +103,8 @@ Search::Search(const NodeTree& tree, Network* network,
kFpuReduction(options.Get<float>(kFpuReductionStr)),
kCacheHistoryLength(options.Get<int>(kCacheHistoryLengthStr)),
kExtraVirtualLoss(options.Get<float>(kExtraVirtualLossStr)),
KPolicySoftmaxTemp(options.Get<float>(KPolicySoftmaxTempStr)) {}
kPolicySoftmaxTemp(options.Get<float>(kPolicySoftmaxTempStr)),
kAllowedNodeCollisions(options.Get<int>(kAllowedNodeCollisionsStr)) {}

// Returns whether node was already in cache.
bool Search::AddNodeToCompute(Node* node, CachingComputation* computation,
@@ -160,22 +166,40 @@ void ApplyDirichletNoise(Node* node, float eps, double alpha) {
} // namespace

void Search::Worker() {
// Nodes to be extended / to have their counters updated.
std::vector<Node*> nodes_to_process;
// Nodes for which a collision happened. For those, only n_in_flight has to
// be rolled back.
std::vector<Node*> node_collisions;
PositionHistory history(played_history_);

// Exit check is at the end of the loop as at least one iteration is
// necessary.
while (true) {
nodes_to_process.clear();
node_collisions.clear();
auto computation = CachingComputation(network_->NewComputation(), cache_);

// Gather nodes to process in the current batch.
for (int i = 0; i < kMiniBatchSize; ++i) {
while (static_cast<int>(nodes_to_process.size()) < kMiniBatchSize) {
// Initialize position sequence with pre-move position.
history.Trim(played_history_.GetLength());
// If there's something to do without touching the slow neural net, do it.
if (i > 0 && computation.GetCacheMisses() == 0) break;
Node* node = PickNodeToExtend(root_node_, &history);
if (!nodes_to_process.empty() && computation.GetCacheMisses() == 0) break;
// Returns a <Node, whether it's computable> pair. The node is not
// computable if there is a collision.
auto node_and_computable = PickNodeToExtend(root_node_, &history);
Node* node = node_and_computable.first;
const bool computable = node_and_computable.second;

// If there is a collision, add the node to a vector to undo its virtual loss later.
if (!computable) {
node_collisions.emplace_back(node);
if (static_cast<int>(node_collisions.size()) > kAllowedNodeCollisions)
break;
continue;
}

// If we hit a node that is already being processed (by our batch or by
// another thread), stop gathering and process a smaller batch.
if (!node) break;
@@ -194,6 +218,7 @@ void Search::Worker() {
}
}

// TODO(mooskagh) Remove prefetch into cache if node collisions work well.
// If there are requests to NN, but the batch is not full, try to prefetch
// nodes which are likely useful in future.
if (computation.GetCacheMisses() > 0 &&
@@ -219,8 +244,8 @@ void Search::Worker() {
for (Node* n : node->Children()) {
float p = computation.GetPVal(idx_in_computation,
n->GetMove().as_nn_index());
if(KPolicySoftmaxTemp != 1.0f){
p = pow(p, 1/KPolicySoftmaxTemp);
if (kPolicySoftmaxTemp != 1.0f) {
p = pow(p, 1 / kPolicySoftmaxTemp);
}
total += p;
n->SetP(p);
@@ -270,6 +295,14 @@ void Search::Worker() {
}
}
total_playouts_ += nodes_to_process.size();

// Remove virtual loss from node collisions.
for (Node* node : node_collisions) {
for (node = node->GetParent(); node != root_node_->GetParent();
node = node->GetParent()) {
node->CancelScoreUpdate();
}
}
}
UpdateRemainingMoves(); // Update remaining moves using smart pruning.
MaybeOutputInfo();
@@ -375,8 +408,8 @@ Node* GetBestChild(Node* parent) {
// * If that number is larger than 0, the one with the larger eval wins.
std::tuple<int, float, float> best(-1, 0.0, 0.0);
for (Node* node : parent->Children()) {
std::tuple<int, float, float> val(node->GetNStarted(),
node->GetQ(-10.0, 0.0), node->GetP());
std::tuple<int, float, float> val(node->GetN(), node->GetQ(-10.0, 0.0),
node->GetP());
if (val > best) {
best = val;
best_node = node;
@@ -391,7 +424,7 @@ Node* GetBestChildWithTemperature(Node* parent, float temperature) {
const float n_parent = parent->GetN();

for (Node* node : parent->Children()) {
sum += std::pow(node->GetNStarted() / n_parent, 1 / temperature);
sum += std::pow(node->GetN() / n_parent, 1 / temperature);
cumulative_sums.push_back(sum);
}

@@ -459,9 +492,8 @@ void Search::SendMovesStats() const {
for (Node* iter : root_node_->Children()) {
nodes.emplace_back(iter);
}
std::sort(nodes.begin(), nodes.end(), [](const Node* a, const Node* b) {
return a->GetNStarted() < b->GetNStarted();
});
std::sort(nodes.begin(), nodes.end(),
[](const Node* a, const Node* b) { return a->GetN() < b->GetN(); });

const bool is_black_to_move = played_history_.IsBlackToMove();
ThinkingInfo info;
@@ -610,12 +642,15 @@ void Search::ExtendNode(Node* node, const PositionHistory& history) {
for (const auto& move : legal_moves) node->CreateChild(move);
}

Node* Search::PickNodeToExtend(Node* node, PositionHistory* history) {
// Returns node and whether it should be processed.
// (false if it is a collision).
std::pair<Node*, bool> Search::PickNodeToExtend(Node* node,
PositionHistory* history) {
// Fetch the current best root node visits for possible smart pruning.
int best_node_n = 0;
{
SharedMutex::Lock lock(nodes_mutex_);
if (best_move_node_) best_node_n = best_move_node_->GetNStarted();
if (best_move_node_) best_node_n = best_move_node_->GetN();
}

// True on first iteration, false as we dive deeper.
@@ -624,17 +659,9 @@ Node* Search::PickNodeToExtend(Node* node, PositionHistory* history) {
{
SharedMutex::Lock lock(nodes_mutex_);
// Check whether we are at a leaf.
if (!node->TryStartScoreUpdate()) {
// The node is currently being processed by another thread.
// Undo the increments of ancestor nodes, and return null.
for (node = node->GetParent(); node != root_node_->GetParent();
node = node->GetParent()) {
node->CancelScoreUpdate();
}
return nullptr;
}
if (!node->TryStartScoreUpdate()) return {node, false};
// Found a leaf, and we are the first to visit it.
if (!node->HasChildren()) return node;
if (!node->HasChildren()) return {node, true};
}

// Now that we are not at a leaf, we need to go deeper.
Expand All @@ -655,7 +682,8 @@ Node* Search::PickNodeToExtend(Node* node, PositionHistory* history) {
// To ensure we have at least one node to expand, always include
// current best node.
if (iter != best_move_node_ &&
remaining_playouts_ < best_node_n - iter->GetNStarted()) {
remaining_playouts_ <
best_node_n - static_cast<int>(iter->GetN())) {
continue;
}
++possible_moves;
@@ -709,10 +737,12 @@ std::pair<Move, Move> Search::GetBestMoveInternal() const
: GetBestChild(root_node_);

Move ponder_move;
/* // Doesn't seem to work for now, so disabling.
if (best_node->HasChildren()) {
ponder_move =
GetBestChild(best_node)->GetMove(!played_history_.IsBlackToMove());
}
*/
return {best_node->GetMove(played_history_.IsBlackToMove()), ponder_move};
}

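Note on the batching change above: the new Worker() loop keeps picking leaves until the mini-batch is full, records picks that hit an already-claimed leaf as collisions, stops early once more than kAllowedNodeCollisions of them pile up, and afterwards rolls the virtual loss back off the collided paths. The following is a minimal, self-contained sketch of that idea; the types and helpers here (Node, PickNodeToExtend, GatherBatch) are simplified stand-ins, not the actual lc0 classes.

#include <utility>
#include <vector>

// Simplified stand-in for the search tree node used above.
struct Node {
  Node* parent = nullptr;
  std::vector<Node*> children;
  int n_in_flight = 0;     // Virtual-loss style "visits in flight" counter.
  bool evaluated = false;  // True once a network result has been applied.

  // Succeeds unless this is an unevaluated leaf that another pick already
  // claimed in this batch; that situation is the "collision".
  bool TryStartScoreUpdate() {
    if (children.empty() && !evaluated && n_in_flight > 0) return false;
    ++n_in_flight;
    return true;
  }
  void CancelScoreUpdate() { --n_in_flight; }
};

// Walks from the root towards a leaf, incrementing n_in_flight along the way.
// Returns {node, computable}; computable == false signals a collision.
// (Real PUCT selection is out of scope; this just descends to the least-busy
// child.)
std::pair<Node*, bool> PickNodeToExtend(Node* node) {
  while (true) {
    if (!node->TryStartScoreUpdate()) return {node, false};
    if (node->children.empty()) return {node, true};
    Node* next = node->children.front();
    for (Node* child : node->children) {
      if (child->n_in_flight < next->n_in_flight) next = child;
    }
    node = next;
  }
}

void GatherBatch(Node* root, int mini_batch_size, int allowed_collisions,
                 std::vector<Node*>* nodes_to_process,
                 std::vector<Node*>* node_collisions) {
  while (static_cast<int>(nodes_to_process->size()) < mini_batch_size) {
    auto picked = PickNodeToExtend(root);
    if (!picked.second) {
      node_collisions->push_back(picked.first);
      if (static_cast<int>(node_collisions->size()) > allowed_collisions) break;
      continue;
    }
    nodes_to_process->push_back(picked.first);
  }
  // After the batch is evaluated, collided picks give back the virtual loss
  // they left on their ancestors (the collided leaf itself never incremented
  // its own counter, so the walk starts at the parent).
  for (Node* node : *node_collisions) {
    for (Node* n = node->parent; n != nullptr; n = n->parent) {
      n->CancelScoreUpdate();
    }
  }
}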
8 changes: 5 additions & 3 deletions src/mcts/search.h
@@ -84,7 +84,8 @@ class Search {
static const char* kFpuReductionStr;
static const char* kCacheHistoryLengthStr;
static const char* kExtraVirtualLossStr;
static const char* KPolicySoftmaxTempStr;
static const char* kPolicySoftmaxTempStr;
static const char* kAllowedNodeCollisionsStr;

private:
// Can run several copies of it in separate threads.
@@ -104,7 +105,7 @@

void SendUciInfo(); // Requires nodes_mutex_ to be held.

Node* PickNodeToExtend(Node* node, PositionHistory* history);
std::pair<Node*, bool> PickNodeToExtend(Node* node, PositionHistory* history);
void ExtendNode(Node* node, const PositionHistory& history);

mutable Mutex counters_mutex_ ACQUIRED_AFTER(nodes_mutex_);
@@ -156,7 +157,8 @@ class Search {
const float kFpuReduction;
const bool kCacheHistoryLength;
const float kExtraVirtualLoss;
const float KPolicySoftmaxTemp;
const float kPolicySoftmaxTemp;
const int kAllowedNodeCollisions;
};

} // namespace lczero
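The kPolicySoftmaxTemp option declared here, and applied in the search.cc hunk above as p = pow(p, 1 / kPolicySoftmaxTemp), reshapes the network's move priors before normalization: temperatures above 1 flatten the policy, temperatures below 1 sharpen it. A small stand-alone sketch of that transformation follows; ApplyPolicySoftmaxTemp is a hypothetical helper for illustration, not an lc0 function.

#include <cmath>
#include <vector>

// Raise each raw prior to the power 1/T, then renormalize so the priors sum
// to 1, mirroring the loop over children in Search::Worker() above.
std::vector<float> ApplyPolicySoftmaxTemp(std::vector<float> priors,
                                          float temperature) {
  float total = 0.0f;
  for (float& p : priors) {
    if (temperature != 1.0f) p = std::pow(p, 1.0f / temperature);
    total += p;
  }
  if (total > 0.0f) {
    for (float& p : priors) p /= total;  // Renormalize to a distribution.
  }
  return priors;
}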
29 changes: 14 additions & 15 deletions src/neural/transforms.cc → src/neural/CL/transforms.cc
@@ -27,9 +27,9 @@
namespace lczero {

std::vector<float> Transforms::ZeropadU(const std::vector<float>& U,
const int outputs, const int channels,
const int outputs_pad,
const int channels_pad) {
const int outputs, const int channels,
const int outputs_pad,
const int channels_pad) {
// Fill with zeroes
auto Upad = std::vector<float>(kWinogradTile * outputs_pad * channels_pad);

@@ -50,8 +50,8 @@ std::vector<float> Transforms::ZeropadU(const std::vector<float>& U,
}

std::vector<float> Transforms::WinogradTransformF(const std::vector<float>& f,
const int outputs,
const int channels) {
const int outputs,
const int channels) {
// F(2x2, 3x3) Winograd filter transformation
// transpose(G.dot(f).dot(G.transpose()))
// U matrix is transposed for better memory layout in SGEMM
@@ -89,7 +89,7 @@ std::vector<float> Transforms::WinogradTransformF(const std::vector<float>& f,
}

void Transforms::WinogradTransformIn(const std::vector<float>& in,
std::vector<float>& V, const int C) {
std::vector<float>& V, const int C) {
constexpr auto W = 8;
constexpr auto H = 8;
constexpr auto wtiles = (W + 1) / 2;
@@ -173,8 +173,8 @@ void Transforms::WinogradTransformIn(const std::vector<float>& in,
}

void Transforms::WinogradSgemm(const std::vector<float>& U,
std::vector<float>& V, std::vector<float>& M,
const int C, const int K) {
std::vector<float>& V, std::vector<float>& M,
const int C, const int K) {
constexpr auto P = 8 * 8 / kWinogradAlpha;

for (auto b = 0; b < kWinogradTile; b++) {
@@ -188,7 +188,7 @@ void Transforms::WinogradSgemm(const std::vector<float>& U,
}

void Transforms::WinogradTransformOut(const std::vector<float>& M,
std::vector<float>& Y, const int K) {
std::vector<float>& Y, const int K) {
constexpr auto W = 8;
constexpr auto H = 8;
constexpr auto wtiles = (W + 1) / 2;
@@ -247,11 +247,10 @@ void Transforms::WinogradTransformOut(const std::vector<float>& M,
}

void Transforms::WinogradConvolve3(const int outputs,
const std::vector<float>& input,
const std::vector<float>& U,
std::vector<float>& V,
std::vector<float>& M,
std::vector<float>& output) {
const std::vector<float>& input,
const std::vector<float>& U,
std::vector<float>& V, std::vector<float>& M,
std::vector<float>& output) {
constexpr unsigned int filter_len = kWinogradAlpha * kWinogradAlpha;
const auto input_channels = U.size() / (outputs * filter_len);

@@ -419,7 +418,7 @@ void Transforms::OffsetBatchNormMeans(std::vector<float>& bn_means,
// still have non-zero biases.
// Move biases to batchnorm means to make the output match without having
// to separately add the biases.
for (auto i = 0; i < bn_means.size(); i++) bn_means[i] -= biases[i];
for (size_t i = 0; i < bn_means.size(); i++) bn_means[i] -= biases[i];
}

void Transforms::InvertBatchNormStddev(std::vector<float>& weights) {
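The OffsetBatchNormMeans hunk above only changes the loop index type, but the trick the function implements is worth spelling out: with a batchnorm of the form (x + bias - mean) * inv_stddev, subtracting the convolution bias from the stored mean (mean' = mean - bias) produces the same output with no separate bias add. A tiny self-contained check of that identity with made-up numbers (scale and shift terms omitted):

#include <cassert>
#include <cmath>

int main() {
  // One channel, one activation value.
  const float x = 0.7f, bias = 0.25f, mean = 0.1f, inv_stddev = 2.0f;
  // Naive path: add the conv bias, then batchnorm with the original mean.
  const float with_bias_add = (x + bias - mean) * inv_stddev;
  // Folded path: batchnorm with the adjusted mean, no bias add needed.
  const float folded_mean = mean - bias;
  const float with_folded_mean = (x - folded_mean) * inv_stddev;
  assert(std::fabs(with_bias_add - with_folded_mean) < 1e-6f);
  return 0;
}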
File renamed without changes.
23 changes: 7 additions & 16 deletions src/neural/network_blas.cc
@@ -16,9 +16,9 @@
along with Leela Chess. If not, see <http://www.gnu.org/licenses/>.
*/

#include "neural/network.h"
#include "neural/CL/transforms.h"
#include "neural/factory.h"
#include "neural/transforms.h"
#include "neural/network.h"

#include <algorithm>
#include <cassert>
@@ -84,17 +84,8 @@ class BlasComputation : public NetworkComputation {
constexpr int height = 8;
constexpr int tiles = width * height / 4;

/*
static constexpr int NUM_VALUE_INPUT_PLANES = 32;
static constexpr int NUM_POLICY_INPUT_PLANES = 32;
static constexpr int NUM_OUTPUT_POLICY = 1858;
static constexpr int NUM_VALUE_CHANNELS = 128;
*/

int NUM_VALUE_INPUT_PLANES = weights_.value.bn_means.size();
int NUM_POLICY_INPUT_PLANES = weights_.policy.bn_means.size();
int NUM_OUTPUT_POLICY = weights_.ip_pol_b.size();
int NUM_VALUE_CHANNELS = weights_.ip1_val_b.size();

static constexpr auto kWinogradAlpha = 4;
static constexpr auto kWinogradTile = kWinogradAlpha * kWinogradAlpha;
@@ -116,7 +107,7 @@ class BlasComputation : public NetworkComputation {
std::vector<float> value_data(NUM_VALUE_INPUT_PLANES * width * height);

Transforms::WinogradConvolve3(output_channels, input,
weights_.input.weights, V, M, conv_out);
weights_.input.weights, V, M, conv_out);
Transforms::Batchnorm<64>(output_channels, conv_out,
weights_.input.bn_means.data(),
weights_.input.bn_stddivs.data());
@@ -132,15 +123,15 @@ class BlasComputation : public NetworkComputation {
std::copy(begin(conv_in), end(conv_in), begin(res));

Transforms::WinogradConvolve3(output_channels, conv_in, conv1.weights, V,
M, conv_out);
M, conv_out);
Transforms::Batchnorm<64>(output_channels, conv_out,
conv1.bn_means.data(), conv1.bn_stddivs.data());

auto& conv2 = residual.conv2;
output_channels = conv2.biases.size();
std::swap(conv_out, conv_in);
Transforms::WinogradConvolve3(output_channels, conv_in, conv2.weights, V,
M, conv_out);
M, conv_out);
Transforms::Batchnorm<64>(output_channels, conv_out,
conv2.bn_means.data(), conv2.bn_stddivs.data(),
res.data());
@@ -214,7 +205,7 @@ class BlasNetwork : public Network {
Transforms::InvertBatchNormStddev(input_batchnorm_stddivs);

// residual blocks
for (auto i = 0; i < residual_blocks; i++) {
for (size_t i = 0; i < residual_blocks; i++) {
auto& residual = weights_.residual[i];
auto& conv1 = residual.conv1;
auto& conv2 = residual.conv2;
@@ -275,4 +266,4 @@

REGISTER_NETWORK("blas", BlasNetwork, 50)

} // namespace lc0
} // namespace lczero
