From 266d173669bdf297b7035dac75fcd12ef03dc4d7 Mon Sep 17 00:00:00 2001
From: HanatoK <summersnow9403@gmail.com>
Date: Tue, 8 Oct 2024 14:25:26 -0500
Subject: [PATCH] Determine dl, dq or ds by compile-time template options

---
 src/colvar_rotation_derivative.h | 41 +++++++++++++++++---------------
 src/colvaratoms.cpp              |  2 +-
 src/colvarcomp_distances.cpp     |  4 ++--
 src/colvarcomp_rotations.cpp     | 16 ++++++-------
 4 files changed, 33 insertions(+), 30 deletions(-)
diff --git a/src/colvar_rotation_derivative.h b/src/colvar_rotation_derivative.h
index ff409d06c..17bbd0b85 100644
--- a/src/colvar_rotation_derivative.h
+++ b/src/colvar_rotation_derivative.h
@@ -9,7 +9,7 @@
 template <typename T, typename std::enable_if<std::is_same<T, cvm::atom_pos>::value, bool>::type = true>
 inline void read_atom_coord(
   size_t ia, const std::vector<T>& pos,
-  cvm::real* x, cvm::real* y, cvm::real* z) {
+  cvm::real* __restrict__ x, cvm::real* __restrict__ y, cvm::real* __restrict__ z) {
   *x = pos[ia].x;
   *y = pos[ia].y;
   *z = pos[ia].z;
@@ -18,7 +18,7 @@ inline void read_atom_coord(
 template <typename T, typename std::enable_if<std::is_same<T, cvm::atom>::value, bool>::type = true>
 inline void read_atom_coord(
   size_t ia, const std::vector<T>& pos,
-  cvm::real* x, cvm::real* y, cvm::real* z) {
+  cvm::real* __restrict__ x, cvm::real* __restrict__ y, cvm::real* __restrict__ z) {
   *x = pos[ia].pos.x;
   *y = pos[ia].pos.y;
   *z = pos[ia].pos.z;
@@ -327,12 +327,13 @@ struct rotation_derivative {
     *  @param[out] dq0_out The output of derivative of Q
     *  @param[out] ds_out  The output of derivative of overlap matrix S
     */
+  template <bool use_dl, bool use_dq, bool use_ds>
   void calc_derivative_impl(
     const cvm::rvector (&ds)[4][4],
-    cvm::rvector* const dl0_out,
-    cvm::vector1d<cvm::rvector>* const dq0_out,
-    cvm::matrix2d<cvm::rvector>* const ds_out) const {
-    if (ds_out != nullptr) {
+    cvm::rvector* __restrict__ const dl0_out,
+    cvm::vector1d<cvm::rvector>* __restrict__ const dq0_out,
+    cvm::matrix2d<cvm::rvector>* __restrict__ const ds_out) const {
+    if (use_ds) {
       // this code path is for debug_gradients, so not necessary to unroll the loop
       *ds_out = cvm::matrix2d<cvm::rvector>(4, 4);
       for (int i = 0; i < 4; ++i) {
@@ -341,7 +342,7 @@ struct rotation_derivative {
         }
       }
     }
-    if (dl0_out != nullptr) {
+    if (use_dl) {
       /* manually loop unrolling of the following loop:
         dl0_1.reset();
         for (size_t i = 0; i < 4; i++) {
@@ -367,7 +368,7 @@ struct rotation_derivative {
                  tmp_Q0Q0[3][2] * ds[3][2] +
                  tmp_Q0Q0[3][3] * ds[3][3];
     }
-    if (dq0_out != nullptr) {
+    if (use_dq) {
       // we can skip this check if a fixed-size array is used
       if (dq0_out->size() != 4) dq0_out->resize(4);
       /* manually loop unrolling of the following loop:
@@ -462,11 +463,12 @@ struct rotation_derivative {
     *  @param[out] ds_1_out  The output of derivative of overlap matrix S with
     *                        respect to ia-th atom of group 1
     */
+  template <bool use_dl, bool use_dq, bool use_ds>
   void calc_derivative_wrt_group1(
-    size_t ia, cvm::rvector* const dl0_1_out = nullptr,
-    cvm::vector1d<cvm::rvector>* const dq0_1_out = nullptr,
-    cvm::matrix2d<cvm::rvector>* const ds_1_out = nullptr) const {
-      if (dl0_1_out == nullptr && dq0_1_out == nullptr) return;
+    size_t ia, cvm::rvector* __restrict__ const dl0_1_out = nullptr,
+    cvm::vector1d<cvm::rvector>* __restrict__ const dq0_1_out = nullptr,
+    cvm::matrix2d<cvm::rvector>* __restrict__ const ds_1_out = nullptr) const {
+      // if (dl0_1_out == nullptr && dq0_1_out == nullptr) return;
       cvm::real a2x, a2y, a2z;
       // we can get rid of the helper function read_atom_coord if C++17 (constexpr) is available
       read_atom_coord(ia, m_pos2, &a2x, &a2y, &a2z);
@@ -475,7 +477,7 @@ struct rotation_derivative {
         {{ 0.0,  a2z, -a2y}, { a2x, -a2y, -a2z}, { a2y,  a2x,  0.0}, { a2z,  0.0,  a2x}},
         {{-a2z,  0.0,  a2x}, { a2y,  a2x,  0.0}, {-a2x,  a2y, -a2z}, { 0.0,  a2z,  a2y}},
         {{ a2y, -a2x,  0.0}, { a2z,  0.0,  a2x}, { 0.0,  a2z,  a2y}, {-a2x, -a2y,  a2z}}};
-      calc_derivative_impl(ds_1, dl0_1_out, dq0_1_out, ds_1_out);
+      calc_derivative_impl<use_dl, use_dq, use_ds>(ds_1, dl0_1_out, dq0_1_out, ds_1_out);
     }
   /*! @brief Calculate the derivatives of S, the leading eigenvalue L and
     *         the leading eigenvector Q with respect to `m_pos2`
@@ -487,11 +489,12 @@ struct rotation_derivative {
     *  @param[out] ds_2_out  The output of derivative of overlap matrix S with
     *                        respect to ia-th atom of group 2
     */
+  template <bool use_dl, bool use_dq, bool use_ds>
   void calc_derivative_wrt_group2(
-    size_t ia, cvm::rvector* const dl0_2_out = nullptr,
-    cvm::vector1d<cvm::rvector>* const dq0_2_out = nullptr,
-    cvm::matrix2d<cvm::rvector>* const ds_2_out = nullptr) const {
-    if (dl0_2_out == nullptr && dq0_2_out == nullptr) return;
+    size_t ia, cvm::rvector* __restrict__ const dl0_2_out = nullptr,
+    cvm::vector1d<cvm::rvector>* __restrict__ const dq0_2_out = nullptr,
+    cvm::matrix2d<cvm::rvector>* __restrict__ const ds_2_out = nullptr) const {
+    // if (dl0_2_out == nullptr && dq0_2_out == nullptr) return;
     cvm::real a1x, a1y, a1z;
     // we can get rid of the helper function read_atom_coord if C++17 (constexpr) is available
     read_atom_coord(ia, m_pos1, &a1x, &a1y, &a1z);
@@ -500,7 +503,7 @@ struct rotation_derivative {
       {{ 0.0, -a1z,  a1y}, { a1x, -a1y, -a1z}, { a1y,  a1x,  0.0}, { a1z,  0.0,  a1x}},
       {{ a1z,  0.0, -a1x}, { a1y,  a1x,  0.0}, {-a1x,  a1y, -a1z}, { 0.0,  a1z,  a1y}},
       {{-a1y,  a1x,  0.0}, { a1z,  0.0,  a1x}, { 0.0,  a1z,  a1y}, {-a1x, -a1y,  a1z}}};
-    calc_derivative_impl(ds_2, dl0_2_out, dq0_2_out, ds_2_out);
+    calc_derivative_impl<use_dl, use_dq, use_ds>(ds_2, dl0_2_out, dq0_2_out, ds_2_out);
   }
 };
 
@@ -564,7 +567,7 @@ void debug_gradients(
     // cvm::real const &a1x = pos1[ia].x;
     // cvm::real const &a1y = pos1[ia].y;
     // cvm::real const &a1z = pos1[ia].z;
-    deriv.calc_derivative_wrt_group2(ia, &dl0_2, &dq0_2, &ds_2);
+    deriv.template calc_derivative_wrt_group2<true, true, true>(ia, &dl0_2, &dq0_2, &ds_2);
     // make an infitesimal move along each cartesian coordinate of
     // this atom, and solve again the eigenvector problem
     for (size_t comp = 0; comp < 3; comp++) {
diff --git a/src/colvaratoms.cpp b/src/colvaratoms.cpp
index 092bc607f..5e1ca2f44 100644
--- a/src/colvaratoms.cpp
+++ b/src/colvaratoms.cpp
@@ -1283,7 +1283,7 @@ void cvm::atom_group::calc_fit_forces_impl(
       fitting_force_grad += atom_grad;
     }
     if (B_ag_rotate) {
-      rot_deriv->calc_derivative_wrt_group1(j, nullptr, &dq0_1);
+      rot_deriv->calc_derivative_wrt_group1<false, true, false>(j, nullptr, &dq0_1);
       // multiply by {\partial q}/\partial\vec{x}_j and add it to the fit gradients
       fitting_force_grad += sum_dxdq[0] * dq0_1[0] +
                             sum_dxdq[1] * dq0_1[1] +
diff --git a/src/colvarcomp_distances.cpp b/src/colvarcomp_distances.cpp
index 1c7267f1e..6de68264c 100644
--- a/src/colvarcomp_distances.cpp
+++ b/src/colvarcomp_distances.cpp
@@ -1003,7 +1003,7 @@ void colvar::rmsd::calc_Jacobian_derivative()
     for (size_t ia = 0; ia < atoms->size(); ia++) {
 
       // Gradient of optimal quaternion wrt current Cartesian position
-      atoms->rot_deriv->calc_derivative_wrt_group1(ia, nullptr, &dq);
+      atoms->rot_deriv->calc_derivative_wrt_group1<false, true, false>(ia, nullptr, &dq);
 
       g11 = 2.0 * (atoms->rot.q)[1]*dq[1];
       g22 = 2.0 * (atoms->rot.q)[2]*dq[2];
@@ -1302,7 +1302,7 @@ void colvar::eigenvector::calc_Jacobian_derivative()
     // Gradient of optimal quaternion wrt current Cartesian position
     // trick: d(R^-1)/dx = d(R^t)/dx = (dR/dx)^t
     // we can just transpose the derivatives of the direct matrix
-    atoms->rot_deriv->calc_derivative_wrt_group1(ia, nullptr, &dq_1);
+    atoms->rot_deriv->calc_derivative_wrt_group1<false, true, false>(ia, nullptr, &dq_1);
 
     g11 = 2.0 * quat0[1]*dq_1[1];
     g22 = 2.0 * quat0[2]*dq_1[2];
diff --git a/src/colvarcomp_rotations.cpp b/src/colvarcomp_rotations.cpp
index 12dd90e33..766a0870d 100644
--- a/src/colvarcomp_rotations.cpp
+++ b/src/colvarcomp_rotations.cpp
@@ -139,7 +139,7 @@ void colvar::orientation::apply_force(colvarvalue const &force)
     cvm::vector1d<cvm::rvector> dq0_2;
     auto ag_force = atoms->get_group_force_object();
     for (size_t ia = 0; ia < atoms->size(); ia++) {
-      rot_deriv_impl->calc_derivative_wrt_group2(ia, nullptr, &dq0_2);
+      rot_deriv_impl->calc_derivative_wrt_group2<false, true, false>(ia, nullptr, &dq0_2);
       const auto f_ia = FQ[0] * dq0_2[0] +
                         FQ[1] * dq0_2[1] +
                         FQ[2] * dq0_2[2] +
@@ -208,7 +208,7 @@ void colvar::orientation_angle::calc_gradients()
   rot_deriv_impl->prepare_derivative(rotation_derivative_dldq::use_dq);
   cvm::vector1d<cvm::rvector> dq0_2;
   for (size_t ia = 0; ia < atoms->size(); ia++) {
-    rot_deriv_impl->calc_derivative_wrt_group2(ia, nullptr, &dq0_2);
+    rot_deriv_impl->calc_derivative_wrt_group2<false, true, false>(ia, nullptr, &dq0_2);
     (*atoms)[ia].grad = (dxdq0 * dq0_2[0]);
   }
 }
@@ -268,7 +268,7 @@ void colvar::orientation_proj::calc_gradients()
   rot_deriv_impl->prepare_derivative(rotation_derivative_dldq::use_dq);
   cvm::vector1d<cvm::rvector> dq0_2;
   for (size_t ia = 0; ia < atoms->size(); ia++) {
-    rot_deriv_impl->calc_derivative_wrt_group2(ia, nullptr, &dq0_2);
+    rot_deriv_impl->calc_derivative_wrt_group2<false, true, false>(ia, nullptr, &dq0_2);
     (*atoms)[ia].grad = (dxdq0 * dq0_2[0]);
   }
 }
@@ -317,7 +317,7 @@ void colvar::tilt::calc_gradients()
   cvm::vector1d<cvm::rvector> dq0_2;
   for (size_t ia = 0; ia < atoms->size(); ia++) {
     (*atoms)[ia].grad = cvm::rvector(0.0, 0.0, 0.0);
-    rot_deriv_impl->calc_derivative_wrt_group2(ia, nullptr, &dq0_2);
+    rot_deriv_impl->calc_derivative_wrt_group2<false, true, false>(ia, nullptr, &dq0_2);
     for (size_t iq = 0; iq < 4; iq++) {
       (*atoms)[ia].grad += (dxdq[iq] * dq0_2[iq]);
     }
@@ -354,7 +354,7 @@ void colvar::spin_angle::calc_gradients()
   cvm::vector1d<cvm::rvector> dq0_2;
   for (size_t ia = 0; ia < atoms->size(); ia++) {
     (*atoms)[ia].grad = cvm::rvector(0.0, 0.0, 0.0);
-    rot_deriv_impl->calc_derivative_wrt_group2(ia, nullptr, &dq0_2);
+    rot_deriv_impl->calc_derivative_wrt_group2<false, true, false>(ia, nullptr, &dq0_2);
     for (size_t iq = 0; iq < 4; iq++) {
       (*atoms)[ia].grad += (dxdq[iq] * dq0_2[iq]);
     }
@@ -402,7 +402,7 @@ void colvar::euler_phi::calc_gradients()
   rot_deriv_impl->prepare_derivative(rotation_derivative_dldq::use_dq);
   cvm::vector1d<cvm::rvector> dq0_2;
   for (size_t ia = 0; ia < atoms->size(); ia++) {
-    rot_deriv_impl->calc_derivative_wrt_group2(ia, nullptr, &dq0_2);
+    rot_deriv_impl->calc_derivative_wrt_group2<false, true, false>(ia, nullptr, &dq0_2);
     (*atoms)[ia].grad = (dxdq0 * dq0_2[0]) +
                         (dxdq1 * dq0_2[1]) +
                         (dxdq2 * dq0_2[2]) +
@@ -451,7 +451,7 @@ void colvar::euler_psi::calc_gradients()
   rot_deriv_impl->prepare_derivative(rotation_derivative_dldq::use_dq);
   cvm::vector1d<cvm::rvector> dq0_2;
   for (size_t ia = 0; ia < atoms->size(); ia++) {
-    rot_deriv_impl->calc_derivative_wrt_group2(ia, nullptr, &dq0_2);
+    rot_deriv_impl->calc_derivative_wrt_group2<false, true, false>(ia, nullptr, &dq0_2);
     (*atoms)[ia].grad = (dxdq0 * dq0_2[0]) +
                         (dxdq1 * dq0_2[1]) +
                         (dxdq2 * dq0_2[2]) +
@@ -498,7 +498,7 @@ void colvar::euler_theta::calc_gradients()
   rot_deriv_impl->prepare_derivative(rotation_derivative_dldq::use_dq);
   cvm::vector1d<cvm::rvector> dq0_2;
   for (size_t ia = 0; ia < atoms->size(); ia++) {
-    rot_deriv_impl->calc_derivative_wrt_group2(ia, nullptr, &dq0_2);
+    rot_deriv_impl->calc_derivative_wrt_group2<false, true, false>(ia, nullptr, &dq0_2);
     (*atoms)[ia].grad = (dxdq0 * dq0_2[0]) +
                         (dxdq1 * dq0_2[1]) +
                         (dxdq2 * dq0_2[2]) +