diff --git a/.travis.yml b/.travis.yml index ad4e5b73c..55f7c097a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -19,6 +19,8 @@ before_install: - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc; fi install: + - export CWD=`pwd` + - echo $CWD - export CC=$CC$VERSION - export CXX=$CXX$VERSION - echo $PATH @@ -36,11 +38,22 @@ script: - ./bootstrap.sh - mkdir build - cd build - - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=none + - mkdir lime + - cd lime + - mkdir build + - cd build + - wget http://usqcd-software.github.io/downloads/c-lime/lime-1.3.2.tar.gz + - tar xf lime-1.3.2.tar.gz + - cd lime-1.3.2 + - ./configure --prefix=$CWD/build/lime/install + - make -j4 + - make install + - cd $CWD/build + - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install - make -j4 - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals - echo make clean - - ../configure --enable-precision=double --enable-simd=SSE4 --enable-comms=none + - ../configure --enable-precision=double --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install - make -j4 - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals - make check diff --git a/lib/algorithms/LinearOperator.h b/lib/algorithms/LinearOperator.h index 38444c049..86d3f030b 100644 --- a/lib/algorithms/LinearOperator.h +++ b/lib/algorithms/LinearOperator.h @@ -51,7 +51,7 @@ namespace Grid { virtual void Op (const Field &in, Field &out) = 0; // Abstract base virtual void AdjOp (const Field &in, Field &out) = 0; // Abstract base - virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)=0; + virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2) = 0; virtual void HermOp(const Field &in, Field &out)=0; }; @@ -309,36 +309,59 @@ namespace Grid { class SchurStaggeredOperator : public SchurOperatorBase { protected: Matrix &_Mat; + Field tmp; + RealD mass; + double tMpc; + double tIP; + double tMeo; + double taxpby_norm; + uint64_t ncall; public: - SchurStaggeredOperator (Matrix &Mat): _Mat(Mat){}; + void Report(void) + { + std::cout << GridLogMessage << " HermOpAndNorm.Mpc "<< tMpc/ncall<<" usec "< { void operator()(LinearOperatorBase &Linop, const Field &src, Field &psi) { + psi.checkerboard = src.checkerboard; conformable(psi, src); @@ -69,7 +70,6 @@ class ConjugateGradient : public OperatorFunction { Linop.HermOpAndNorm(psi, mmp, d, b); - r = src - mmp; p = r; @@ -96,38 +96,44 @@ class ConjugateGradient : public OperatorFunction { << "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl; GridStopWatch LinalgTimer; + GridStopWatch InnerTimer; + GridStopWatch AxpyNormTimer; + GridStopWatch LinearCombTimer; GridStopWatch MatrixTimer; GridStopWatch SolverTimer; SolverTimer.Start(); int k; - for (k = 1; k <= MaxIterations; k++) { + for (k = 1; k <= MaxIterations*1000; k++) { c = cp; MatrixTimer.Start(); - Linop.HermOpAndNorm(p, mmp, d, qq); + Linop.HermOp(p, mmp); MatrixTimer.Stop(); LinalgTimer.Start(); - // RealD qqck = norm2(mmp); - // ComplexD dck = innerProduct(p,mmp); + InnerTimer.Start(); + ComplexD dc = innerProduct(p,mmp); + InnerTimer.Stop(); + d = dc.real(); a = c / d; - b_pred = a * (a * qq - d) / c; + AxpyNormTimer.Start(); cp = axpy_norm(r, -a, mmp, r); + AxpyNormTimer.Stop(); b = cp / c; - // Fuse these loops ; should be really easy - psi = a * p + psi; - p = p * b + r; - + LinearCombTimer.Start(); + parallel_for(int ss=0;ssoSites();ss++){ + vstream(psi[ss], a * p[ss] + psi[ss]); + vstream(p [ss], b * p[ss] + r[ss]); + } + LinearCombTimer.Stop(); LinalgTimer.Stop(); std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k << " residual " << cp << " target " << rsq << std::endl; - std::cout << GridLogDebug << "a = "<< a << " b_pred = "<< b_pred << " b = "<< b << std::endl; - std::cout << GridLogDebug << "qq = "<< qq << " d = "<< d << " c = "<< c << std::endl; // Stopping condition if (cp <= rsq) { @@ -148,6 +154,9 @@ class ConjugateGradient : public OperatorFunction { std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() < &Linop, const Field &src, std::vector for(int s=0;s &Linop, const Field &src, std::vector for (k=1;k<=MaxIterations;k++){ a = c /cp; + AXPYTimer.Start(); axpy(p,a,p,r); + AXPYTimer.Stop(); // Note to self - direction ps is iterated seperately // for each shift. Does not appear to have any scope @@ -180,6 +192,7 @@ void operator() (LinearOperatorBase &Linop, const Field &src, std::vector // However SAME r is used. Could load "r" and update // ALL ps[s]. 2/3 Bandwidth saving // New Kernel: Load r, vector of coeffs, vector of pointers ps + AXPYTimer.Start(); for(int s=0;s &Linop, const Field &src, std::vector } } } + AXPYTimer.Stop(); cp=c; + MatrixTimer.Start(); + //Linop.HermOpAndNorm(p,mmp,d,qq); // d is used + // The below is faster on KNL + Linop.HermOp(p,mmp); + d=real(innerProduct(p,mmp)); - Linop.HermOpAndNorm(p,mmp,d,qq); + MatrixTimer.Stop(); + + AXPYTimer.Start(); axpy(mmp,mass[0],p,mmp); + AXPYTimer.Stop(); RealD rn = norm2(p); d += rn*mass[0]; bp=b; b=-cp/d; + AXPYTimer.Start(); c=axpy_norm(r,b,mmp,r); + AXPYTimer.Stop(); // Toggle the recurrence history bs[0] = b; iz = 1-iz; + ShiftTimer.Start(); for(int s=1;s &Linop, const Field &src, std::vector bs[s] = b*z[s][iz]/z0; // NB sign rel to Mike } } + ShiftTimer.Stop(); for(int s=0;s &Linop, const Field &src, std::vector if ( all_converged ){ + SolverTimer.Stop(); + + std::cout< &Linop, const Field &src, std::vector RealD cn = norm2(src); std::cout< &basis,Eigen::MatrixXd& Qt,int j0, int j1, i parallel_region { - std::vector < vobj > B(Nm); // Thread private - + Vector < vobj > B; // Thread private + + PARALLEL_CRITICAL { B.resize(Nm); } + parallel_for_internal(int ss=0;ss < grid->oSites();ss++){ for(int j=j0; j strong_inline RealD axpy_norm(Lattice &ret,sobj a,const Lattice &x,const Lattice &y){ - ret.checkerboard = x.checkerboard; - conformable(ret,x); - conformable(x,y); - axpy(ret,a,x,y); - return norm2(ret); + return axpy_norm_fast(ret,a,x,y); } template strong_inline RealD axpby_norm(Lattice &ret,sobj a,sobj b,const Lattice &x,const Lattice &y){ - ret.checkerboard = x.checkerboard; - conformable(ret,x); - conformable(x,y); - axpby(ret,a,b,x,y); - return norm2(ret); // FIXME implement parallel norm in ss loop + return axpby_norm_fast(ret,a,b,x,y); } } diff --git a/lib/lattice/Lattice_reduction.h b/lib/lattice/Lattice_reduction.h index 8a3fbece9..3be4b6cb8 100644 --- a/lib/lattice/Lattice_reduction.h +++ b/lib/lattice/Lattice_reduction.h @@ -33,7 +33,7 @@ namespace Grid { // Deterministic Reduction operations //////////////////////////////////////////////////////////////////////////////////////////////////// template inline RealD norm2(const Lattice &arg){ - ComplexD nrm = innerProduct(arg,arg); + auto nrm = innerProduct(arg,arg); return std::real(nrm); } @@ -43,31 +43,84 @@ inline ComplexD innerProduct(const Lattice &left,const Lattice &righ { typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_typeD vector_type; - scalar_type nrm; - GridBase *grid = left._grid; + const int pad = 8; + + ComplexD inner; + Vector sumarray(grid->SumArraySize()*pad); + + parallel_for(int thr=0;thrSumArraySize();thr++){ + int nwork, mywork, myoff; + GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff); + + decltype(innerProductD(left._odata[0],right._odata[0])) vinner=zero; // private to thread; sub summation + for(int ss=myoff;ss > sumarray(grid->SumArraySize()); + inner=0.0; + for(int i=0;iSumArraySize();i++){ + inner = inner+sumarray[i*pad]; + } + right._grid->GlobalSum(inner); + return inner; +} + +///////////////////////// +// Fast axpby_norm +// z = a x + b y +// return norm z +///////////////////////// +template strong_inline RealD +axpy_norm_fast(Lattice &z,sobj a,const Lattice &x,const Lattice &y) +{ + sobj one(1.0); + return axpby_norm_fast(z,a,one,x,y); +} + +template strong_inline RealD +axpby_norm_fast(Lattice &z,sobj a,sobj b,const Lattice &x,const Lattice &y) +{ + const int pad = 8; + z.checkerboard = x.checkerboard; + conformable(z,x); + conformable(x,y); + + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_typeD vector_type; + RealD nrm; + + GridBase *grid = x._grid; + + Vector sumarray(grid->SumArraySize()*pad); parallel_for(int thr=0;thrSumArraySize();thr++){ int nwork, mywork, myoff; - GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff); + GridThread::GetWork(x._grid->oSites(),thr,mywork,myoff); - decltype(innerProductD(left._odata[0],right._odata[0])) vnrm=zero; // private to thread; sub summation + // private to thread; sub summation + decltype(innerProductD(z._odata[0],z._odata[0])) vnrm=zero; for(int ss=myoff;ssSumArraySize();i++){ - vvnrm = vvnrm+sumarray[i]; + nrm = nrm+sumarray[i*pad]; } - nrm = Reduce(vvnrm);// sum across simd - right._grid->GlobalSum(nrm); - return nrm; + z._grid->GlobalSum(nrm); + return nrm; } + template inline auto sum(const LatticeUnaryExpression & expr) diff --git a/lib/parallelIO/BinaryIO.h b/lib/parallelIO/BinaryIO.h index 08b7b9b46..a60fe962d 100644 --- a/lib/parallelIO/BinaryIO.h +++ b/lib/parallelIO/BinaryIO.h @@ -431,14 +431,20 @@ PARALLEL_CRITICAL MPI_Abort(MPI_COMM_WORLD, 1); //assert(ierr == 0); } - std::cout << GridLogDebug << "MPI read I/O set view " << file << std::endl; + std::cout << GridLogDebug << "MPI write I/O set view " << file << std::endl; ierr = MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL); assert(ierr == 0); - std::cout << GridLogDebug << "MPI read I/O write all " << file << std::endl; + std::cout << GridLogDebug << "MPI write I/O write all " << file << std::endl; ierr = MPI_File_write_all(fh, &iodata[0], 1, localArray, &status); assert(ierr == 0); + MPI_Offset os; + MPI_File_get_position(fh, &os); + MPI_File_get_byte_offset(fh, os, &disp); + offset = disp; + + MPI_File_close(&fh); MPI_Type_free(&fileArray); MPI_Type_free(&localArray); @@ -448,7 +454,7 @@ PARALLEL_CRITICAL } else { std::cout << GridLogMessage << "IOobject: C++ write I/O " << file << " : " - << iodata.size() * sizeof(fobj) << " bytes" << std::endl; + << iodata.size() * sizeof(fobj) << " bytes and offset " << offset << std::endl; std::ofstream fout; fout.exceptions ( std::fstream::failbit | std::fstream::badbit ); diff --git a/lib/parallelIO/IldgIO.h b/lib/parallelIO/IldgIO.h index 90c055460..33af6c3d2 100644 --- a/lib/parallelIO/IldgIO.h +++ b/lib/parallelIO/IldgIO.h @@ -182,6 +182,11 @@ class GridLimeReader : public BinaryIO { { filename= _filename; File = fopen(filename.c_str(), "r"); + if (File == nullptr) + { + std::cerr << "cannot open file '" << filename << "'" << std::endl; + abort(); + } LimeR = limeCreateReader(File); } ///////////////////////////////////////////// diff --git a/lib/perfmon/Timer.h b/lib/perfmon/Timer.h index 392ccc1d3..4d32ee526 100644 --- a/lib/perfmon/Timer.h +++ b/lib/perfmon/Timer.h @@ -49,7 +49,8 @@ inline double usecond(void) { typedef std::chrono::system_clock GridClock; typedef std::chrono::time_point GridTimePoint; -typedef std::chrono::milliseconds GridTime; +typedef std::chrono::milliseconds GridMillisecs; +typedef std::chrono::microseconds GridTime; typedef std::chrono::microseconds GridUsecs; inline std::ostream& operator<< (std::ostream & stream, const std::chrono::milliseconds & time) @@ -57,6 +58,11 @@ inline std::ostream& operator<< (std::ostream & stream, const std::chrono::milli stream << time.count()<<" ms"; return stream; } +inline std::ostream& operator<< (std::ostream & stream, const std::chrono::microseconds & time) +{ + stream << time.count()<<" usec"; + return stream; +} class GridStopWatch { private: diff --git a/lib/qcd/action/fermion/FermionOperator.h b/lib/qcd/action/fermion/FermionOperator.h index 5be36f137..cb12794f6 100644 --- a/lib/qcd/action/fermion/FermionOperator.h +++ b/lib/qcd/action/fermion/FermionOperator.h @@ -63,9 +63,12 @@ namespace Grid { virtual RealD M (const FermionField &in, FermionField &out)=0; virtual RealD Mdag (const FermionField &in, FermionField &out)=0; - // half checkerboard operaions + // Query the even even properties to make algorithmic decisions virtual int ConstEE(void) { return 1; }; // clover returns zero as EE depends on gauge field + virtual int isTrivialEE(void) { return 0; }; + virtual RealD Mass(void) {return 0.0;}; + // half checkerboard operaions virtual void Meooe (const FermionField &in, FermionField &out)=0; virtual void MeooeDag (const FermionField &in, FermionField &out)=0; virtual void Mooee (const FermionField &in, FermionField &out)=0; diff --git a/lib/qcd/action/fermion/FermionOperatorImpl.h b/lib/qcd/action/fermion/FermionOperatorImpl.h index c21a07eef..e6f6e1365 100644 --- a/lib/qcd/action/fermion/FermionOperatorImpl.h +++ b/lib/qcd/action/fermion/FermionOperatorImpl.h @@ -764,7 +764,12 @@ class StaggeredImpl : public PeriodicGaugeImpl(U_ds, U, mu); + } inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &UUUds, // for Naik term DoubledGaugeField &Uds, @@ -803,8 +808,10 @@ class StaggeredImpl : public PeriodicGaugeImpl(Uds, U, mu); - PokeIndex(Uds, Udag, mu + 4); + InsertGaugeField(Uds,U,mu); + InsertGaugeField(Uds,Udag,mu+4); + // PokeIndex(Uds, U, mu); + // PokeIndex(Uds, Udag, mu + 4); // 3 hop based on thin links. Crazy huh ? U = PeekIndex(Uthin, mu); @@ -816,8 +823,8 @@ class StaggeredImpl : public PeriodicGaugeImpl(UUUds, UUU, mu); - PokeIndex(UUUds, UUUdag, mu+4); + InsertGaugeField(UUUds,UUU,mu); + InsertGaugeField(UUUds,UUUdag,mu+4); } } @@ -910,6 +917,23 @@ class StaggeredImpl : public PeriodicGaugeImpllSites(); lidx++) { + + SiteScalarGaugeLink ScalarU; + SiteDoubledGaugeField ScalarUds; + + std::vector lcoor; + GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor); + peekLocalSite(ScalarUds, U_ds, lcoor); + + peekLocalSite(ScalarU, U, lcoor); + ScalarUds(mu) = ScalarU(); + + } + } inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &UUUds, // for Naik term DoubledGaugeField &Uds, @@ -951,23 +975,8 @@ class StaggeredImpl : public PeriodicGaugeImpllSites(); lidx++) { - SiteScalarGaugeLink ScalarU; - SiteDoubledGaugeField ScalarUds; - - std::vector lcoor; - GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor); - peekLocalSite(ScalarUds, Uds, lcoor); - - peekLocalSite(ScalarU, U, lcoor); - ScalarUds(mu) = ScalarU(); - - peekLocalSite(ScalarU, Udag, lcoor); - ScalarUds(mu + 4) = ScalarU(); - - pokeLocalSite(ScalarUds, Uds, lcoor); - } + InsertGaugeField(Uds,U,mu); + InsertGaugeField(Uds,Udag,mu+4); // 3 hop based on thin links. Crazy huh ? U = PeekIndex(Uthin, mu); @@ -979,24 +988,8 @@ class StaggeredImpl : public PeriodicGaugeImpllSites(); lidx++) { - - SiteScalarGaugeLink ScalarU; - SiteDoubledGaugeField ScalarUds; - - std::vector lcoor; - GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor); - - peekLocalSite(ScalarUds, UUUds, lcoor); - - peekLocalSite(ScalarU, UUU, lcoor); - ScalarUds(mu) = ScalarU(); - - peekLocalSite(ScalarU, UUUdag, lcoor); - ScalarUds(mu + 4) = ScalarU(); - - pokeLocalSite(ScalarUds, UUUds, lcoor); - } + InsertGaugeField(UUUds,UUU,mu); + InsertGaugeField(UUUds,UUUdag,mu+4); } } diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion.cc b/lib/qcd/action/fermion/ImprovedStaggeredFermion.cc index 5005b1a27..4443b4d07 100644 --- a/lib/qcd/action/fermion/ImprovedStaggeredFermion.cc +++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion.cc @@ -44,6 +44,7 @@ ImprovedStaggeredFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1, 3, 3, template ImprovedStaggeredFermion::ImprovedStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, RealD _mass, + RealD _c1, RealD _c2,RealD _u0, const ImplParams &p) : Kernels(p), _grid(&Fgrid), @@ -62,6 +63,16 @@ ImprovedStaggeredFermion::ImprovedStaggeredFermion(GridCartesian &Fgrid, G UUUmuOdd(&Hgrid) , _tmp(&Hgrid) { + int vol4; + int LLs=1; + c1=_c1; + c2=_c2; + u0=_u0; + vol4= _grid->oSites(); + Stencil.BuildSurfaceList(LLs,vol4); + vol4= _cbgrid->oSites(); + StencilEven.BuildSurfaceList(LLs,vol4); + StencilOdd.BuildSurfaceList(LLs,vol4); } template @@ -69,22 +80,10 @@ ImprovedStaggeredFermion::ImprovedStaggeredFermion(GaugeField &_Uthin, Gau GridRedBlackCartesian &Hgrid, RealD _mass, RealD _c1, RealD _c2,RealD _u0, const ImplParams &p) - : ImprovedStaggeredFermion(Fgrid,Hgrid,_mass,p) + : ImprovedStaggeredFermion(Fgrid,Hgrid,_mass,_c1,_c2,_u0,p) { - c1=_c1; - c2=_c2; - u0=_u0; ImportGauge(_Uthin,_Ufat); } -template -ImprovedStaggeredFermion::ImprovedStaggeredFermion(GaugeField &_Uthin,GaugeField &_Utriple, GaugeField &_Ufat, GridCartesian &Fgrid, - GridRedBlackCartesian &Hgrid, RealD _mass, - const ImplParams &p) - : ImprovedStaggeredFermion(Fgrid,Hgrid,_mass,p) -{ - ImportGaugeSimple(_Utriple,_Ufat); -} - //////////////////////////////////////////////////////////// // Momentum space propagator should be @@ -98,11 +97,6 @@ ImprovedStaggeredFermion::ImprovedStaggeredFermion(GaugeField &_Uthin,Gaug // of above link to implmement fourier based solver. //////////////////////////////////////////////////////////// template -void ImprovedStaggeredFermion::ImportGauge(const GaugeField &_Uthin) -{ - ImportGauge(_Uthin,_Uthin); -}; -template void ImprovedStaggeredFermion::ImportGaugeSimple(const GaugeField &_Utriple,const GaugeField &_Ufat) { ///////////////////////////////////////////////////////////////// @@ -125,6 +119,20 @@ void ImprovedStaggeredFermion::ImportGaugeSimple(const GaugeField &_Utripl PokeIndex(Umu, -U, mu+4); } + CopyGaugeCheckerboards(); +} +template +void ImprovedStaggeredFermion::ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U) +{ + + Umu = _U; + UUUmu = _UUU; + CopyGaugeCheckerboards(); +} + +template +void ImprovedStaggeredFermion::CopyGaugeCheckerboards(void) +{ pickCheckerboard(Even, UmuEven, Umu); pickCheckerboard(Odd, UmuOdd , Umu); pickCheckerboard(Even, UUUmuEven,UUUmu); @@ -160,10 +168,7 @@ void ImprovedStaggeredFermion::ImportGauge(const GaugeField &_Uthin,const PokeIndex(UUUmu, U*(-0.5*c2/u0/u0/u0), mu+4); } - pickCheckerboard(Even, UmuEven, Umu); - pickCheckerboard(Odd, UmuOdd , Umu); - pickCheckerboard(Even, UUUmuEven, UUUmu); - pickCheckerboard(Odd, UUUmuOdd, UUUmu); + CopyGaugeCheckerboards(); } ///////////////////////////// @@ -322,6 +327,7 @@ void ImprovedStaggeredFermion::DhopDerivEO(GaugeField &mat, const FermionF template void ImprovedStaggeredFermion::Dhop(const FermionField &in, FermionField &out, int dag) { + DhopCalls+=2; conformable(in._grid, _grid); // verifies full grid conformable(in._grid, out._grid); @@ -332,6 +338,7 @@ void ImprovedStaggeredFermion::Dhop(const FermionField &in, FermionField & template void ImprovedStaggeredFermion::DhopOE(const FermionField &in, FermionField &out, int dag) { + DhopCalls+=1; conformable(in._grid, _cbgrid); // verifies half grid conformable(in._grid, out._grid); // drops the cb check @@ -343,6 +350,7 @@ void ImprovedStaggeredFermion::DhopOE(const FermionField &in, FermionField template void ImprovedStaggeredFermion::DhopEO(const FermionField &in, FermionField &out, int dag) { + DhopCalls+=1; conformable(in._grid, _cbgrid); // verifies half grid conformable(in._grid, out._grid); // drops the cb check @@ -374,25 +382,193 @@ void ImprovedStaggeredFermion::DhopInternal(StencilImpl &st, LebesgueOrder DoubledGaugeField &U, DoubledGaugeField &UUU, const FermionField &in, - FermionField &out, int dag) { + FermionField &out, int dag) +{ +#ifdef GRID_OMP + if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) + DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag); + else +#endif + DhopInternalSerialComms(st,lo,U,UUU,in,out,dag); +} +template +void ImprovedStaggeredFermion::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, + DoubledGaugeField &UUU, + const FermionField &in, + FermionField &out, int dag) +{ +#ifdef GRID_OMP + Compressor compressor; + int len = U._grid->oSites(); + const int LLs = 1; + + DhopTotalTime -= usecond(); + + DhopFaceTime -= usecond(); + st.Prepare(); + st.HaloGather(in,compressor); + st.CommsMergeSHM(compressor); + DhopFaceTime += usecond(); + + ////////////////////////////////////////////////////////////////////////////////////////////////////// + // Ugly explicit thread mapping introduced for OPA reasons. + ////////////////////////////////////////////////////////////////////////////////////////////////////// + DhopComputeTime -= usecond(); +#pragma omp parallel + { + int tid = omp_get_thread_num(); + int nthreads = omp_get_num_threads(); + int ncomms = CartesianCommunicator::nCommThreads; + if (ncomms == -1) ncomms = 1; + assert(nthreads > ncomms); + + if (tid >= ncomms) { + nthreads -= ncomms; + int ttid = tid - ncomms; + int n = len; + int chunk = n / nthreads; + int rem = n % nthreads; + int myblock, myn; + if (ttid < rem) { + myblock = ttid * chunk + ttid; + myn = chunk+1; + } else { + myblock = ttid*chunk + rem; + myn = chunk; + } + + // do the compute + if (dag == DaggerYes) { + for (int ss = myblock; ss < myblock+myn; ++ss) { + int sU = ss; + // Interior = 1; Exterior = 0; must implement for staggered + Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,1,0); + } + } else { + for (int ss = myblock; ss < myblock+myn; ++ss) { + // Interior = 1; Exterior = 0; + int sU = ss; + Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,1,0); + } + } + } else { + st.CommunicateThreaded(); + } + } + DhopComputeTime += usecond(); + + // First to enter, last to leave timing + DhopFaceTime -= usecond(); + st.CommsMerge(compressor); + DhopFaceTime -= usecond(); + + DhopComputeTime2 -= usecond(); + if (dag == DaggerYes) { + int sz=st.surface_list.size(); + parallel_for (int ss = 0; ss < sz; ss++) { + int sU = st.surface_list[ss]; + Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,0,1); + } + } else { + int sz=st.surface_list.size(); + parallel_for (int ss = 0; ss < sz; ss++) { + int sU = st.surface_list[ss]; + Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,0,1); + } + } + DhopComputeTime2 += usecond(); +#else + assert(0); +#endif +} + + +template +void ImprovedStaggeredFermion::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, + DoubledGaugeField &UUU, + const FermionField &in, + FermionField &out, int dag) +{ assert((dag == DaggerNo) || (dag == DaggerYes)); + DhopTotalTime -= usecond(); + + DhopCommTime -= usecond(); Compressor compressor; st.HaloExchange(in, compressor); + DhopCommTime += usecond(); + DhopComputeTime -= usecond(); if (dag == DaggerYes) { - PARALLEL_FOR_LOOP - for (int sss = 0; sss < in._grid->oSites(); sss++) { + parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) { Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), 1, sss, in, out); } } else { - PARALLEL_FOR_LOOP - for (int sss = 0; sss < in._grid->oSites(); sss++) { + parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) { Kernels::DhopSite(st, lo, U, UUU, st.CommBuf(), 1, sss, in, out); } } + DhopComputeTime += usecond(); + DhopTotalTime += usecond(); }; + //////////////////////////////////////////////////////////////// + // Reporting + //////////////////////////////////////////////////////////////// +template +void ImprovedStaggeredFermion::Report(void) +{ + std::vector latt = GridDefaultLatt(); + RealD volume = 1; for(int mu=0;mu_Nprocessors; + RealD NN = _grid->NodeCount(); + + std::cout << GridLogMessage << "#### Dhop calls report " << std::endl; + + std::cout << GridLogMessage << "ImprovedStaggeredFermion Number of DhopEO Calls : " + << DhopCalls << std::endl; + std::cout << GridLogMessage << "ImprovedStaggeredFermion TotalTime /Calls : " + << DhopTotalTime / DhopCalls << " us" << std::endl; + std::cout << GridLogMessage << "ImprovedStaggeredFermion CommTime /Calls : " + << DhopCommTime / DhopCalls << " us" << std::endl; + std::cout << GridLogMessage << "ImprovedStaggeredFermion ComputeTime/Calls : " + << DhopComputeTime / DhopCalls << " us" << std::endl; + + // Average the compute time + _grid->GlobalSum(DhopComputeTime); + DhopComputeTime/=NP; + + RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting + std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl; + + RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting + std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl; + + std::cout << GridLogMessage << "ImprovedStaggeredFermion Stencil" < +void ImprovedStaggeredFermion::ZeroCounters(void) +{ + DhopCalls = 0; + DhopTotalTime = 0; + DhopCommTime = 0; + DhopComputeTime = 0; + DhopFaceTime = 0; + + Stencil.ZeroCounters(); + StencilEven.ZeroCounters(); + StencilOdd.ZeroCounters(); +} + + //////////////////////////////////////////////////////// // Conserved current - not yet implemented. //////////////////////////////////////////////////////// diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion.h b/lib/qcd/action/fermion/ImprovedStaggeredFermion.h index 9d5270c66..90e67199a 100644 --- a/lib/qcd/action/fermion/ImprovedStaggeredFermion.h +++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion.h @@ -49,6 +49,18 @@ class ImprovedStaggeredFermion : public StaggeredKernels, public ImprovedS FermionField _tmp; FermionField &tmp(void) { return _tmp; } + //////////////////////////////////////// + // Performance monitoring + //////////////////////////////////////// + void Report(void); + void ZeroCounters(void); + double DhopTotalTime; + double DhopCalls; + double DhopCommTime; + double DhopComputeTime; + double DhopComputeTime2; + double DhopFaceTime; + /////////////////////////////////////////////////////////////// // Implement the abstract base /////////////////////////////////////////////////////////////// @@ -105,25 +117,34 @@ class ImprovedStaggeredFermion : public StaggeredKernels, public ImprovedS void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, const FermionField &in, FermionField &out, int dag); + void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, + const FermionField &in, FermionField &out, int dag); + void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, + const FermionField &in, FermionField &out, int dag); - // Constructor + ////////////////////////////////////////////////////////////////////////// + // Grid own interface Constructor + ////////////////////////////////////////////////////////////////////////// ImprovedStaggeredFermion(GaugeField &_Uthin, GaugeField &_Ufat, GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, RealD _mass, - RealD _c1=9.0/8.0, RealD _c2=-1.0/24.0,RealD _u0=1.0, - const ImplParams &p = ImplParams()); - - ImprovedStaggeredFermion(GaugeField &_Uthin, GaugeField &_Utriple, GaugeField &_Ufat, GridCartesian &Fgrid, - GridRedBlackCartesian &Hgrid, RealD _mass, + RealD _c1, RealD _c2,RealD _u0, const ImplParams &p = ImplParams()); + ////////////////////////////////////////////////////////////////////////// + // MILC constructor no gauge fields + ////////////////////////////////////////////////////////////////////////// ImprovedStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, RealD _mass, + RealD _c1=1.0, RealD _c2=1.0,RealD _u0=1.0, const ImplParams &p = ImplParams()); - // DoubleStore impl dependent - void ImportGaugeSimple(const GaugeField &_Utriple, const GaugeField &_Ufat); - void ImportGauge(const GaugeField &_Uthin, const GaugeField &_Ufat); - void ImportGauge(const GaugeField &_Uthin); + void ImportGauge (const GaugeField &_Uthin ) { assert(0); } + void ImportGauge (const GaugeField &_Uthin ,const GaugeField &_Ufat); + void ImportGaugeSimple(const GaugeField &_UUU ,const GaugeField &_U); + void ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U); + DoubledGaugeField &GetU(void) { return Umu ; } ; + DoubledGaugeField &GetUUU(void) { return UUUmu; }; + void CopyGaugeCheckerboards(void); /////////////////////////////////////////////////////////////// // Data members require to support the functionality @@ -132,7 +153,8 @@ class ImprovedStaggeredFermion : public StaggeredKernels, public ImprovedS // protected: public: // any other parameters of action ??? - + virtual int isTrivialEE(void) { return 1; }; + virtual RealD Mass(void) { return mass; } RealD mass; RealD u0; RealD c1; diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc index 7308ff30a..b29d803ed 100644 --- a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc +++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc @@ -41,8 +41,7 @@ ImprovedStaggeredFermion5DStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1, 3, // 5d lattice for DWF. template -ImprovedStaggeredFermion5D::ImprovedStaggeredFermion5D(GaugeField &_Uthin,GaugeField &_Ufat, - GridCartesian &FiveDimGrid, +ImprovedStaggeredFermion5D::ImprovedStaggeredFermion5D(GridCartesian &FiveDimGrid, GridRedBlackCartesian &FiveDimRedBlackGrid, GridCartesian &FourDimGrid, GridRedBlackCartesian &FourDimRedBlackGrid, @@ -121,16 +120,74 @@ ImprovedStaggeredFermion5D::ImprovedStaggeredFermion5D(GaugeField &_Uthin, assert(FiveDimGrid._simd_layout[0] ==1); } + int LLs = FiveDimGrid._rdimensions[0]; + int vol4= FourDimGrid.oSites(); + Stencil.BuildSurfaceList(LLs,vol4); - // Allocate the required comms buffer + vol4=FourDimRedBlackGrid.oSites(); + StencilEven.BuildSurfaceList(LLs,vol4); + StencilOdd.BuildSurfaceList(LLs,vol4); +} +template +void ImprovedStaggeredFermion5D::CopyGaugeCheckerboards(void) +{ + pickCheckerboard(Even, UmuEven, Umu); + pickCheckerboard(Odd, UmuOdd , Umu); + pickCheckerboard(Even, UUUmuEven,UUUmu); + pickCheckerboard(Odd, UUUmuOdd, UUUmu); +} +template +ImprovedStaggeredFermion5D::ImprovedStaggeredFermion5D(GaugeField &_Uthin,GaugeField &_Ufat, + GridCartesian &FiveDimGrid, + GridRedBlackCartesian &FiveDimRedBlackGrid, + GridCartesian &FourDimGrid, + GridRedBlackCartesian &FourDimRedBlackGrid, + RealD _mass, + RealD _c1,RealD _c2, RealD _u0, + const ImplParams &p) : + ImprovedStaggeredFermion5D(FiveDimGrid,FiveDimRedBlackGrid, + FourDimGrid,FourDimRedBlackGrid, + _mass,_c1,_c2,_u0,p) +{ ImportGauge(_Uthin,_Ufat); } +/////////////////////////////////////////////////// +// For MILC use; pass three link U's and 1 link U +/////////////////////////////////////////////////// template -void ImprovedStaggeredFermion5D::ImportGauge(const GaugeField &_Uthin) +void ImprovedStaggeredFermion5D::ImportGaugeSimple(const GaugeField &_Utriple,const GaugeField &_Ufat) { - ImportGauge(_Uthin,_Uthin); -}; + ///////////////////////////////////////////////////////////////// + // Trivial import; phases and fattening and such like preapplied + ///////////////////////////////////////////////////////////////// + for (int mu = 0; mu < Nd; mu++) { + + auto U = PeekIndex(_Utriple, mu); + Impl::InsertGaugeField(UUUmu,U,mu); + + U = adj( Cshift(U, mu, -3)); + Impl::InsertGaugeField(UUUmu,-U,mu+4); + + U = PeekIndex(_Ufat, mu); + Impl::InsertGaugeField(Umu,U,mu); + + U = adj( Cshift(U, mu, -1)); + Impl::InsertGaugeField(Umu,-U,mu+4); + + } + CopyGaugeCheckerboards(); +} +template +void ImprovedStaggeredFermion5D::ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U) +{ + ///////////////////////////////////////////////////////////////// + // Trivial import; phases and fattening and such like preapplied + ///////////////////////////////////////////////////////////////// + Umu = _U; + UUUmu = _UUU; + CopyGaugeCheckerboards(); +} template void ImprovedStaggeredFermion5D::ImportGauge(const GaugeField &_Uthin,const GaugeField &_Ufat) { @@ -159,10 +216,7 @@ void ImprovedStaggeredFermion5D::ImportGauge(const GaugeField &_Uthin,cons PokeIndex(UUUmu, U*(-0.5*c2/u0/u0/u0), mu+4); } - pickCheckerboard(Even, UmuEven, Umu); - pickCheckerboard(Odd, UmuOdd , Umu); - pickCheckerboard(Even, UUUmuEven, UUUmu); - pickCheckerboard(Odd, UUUmuOdd, UUUmu); + CopyGaugeCheckerboards(); } template void ImprovedStaggeredFermion5D::DhopDir(const FermionField &in, FermionField &out,int dir5,int disp) @@ -223,6 +277,162 @@ void ImprovedStaggeredFermion5D::DhopDerivOE(GaugeField &mat, assert(0); } +/*CHANGE */ +template +void ImprovedStaggeredFermion5D::DhopInternal(StencilImpl & st, LebesgueOrder &lo, + DoubledGaugeField & U,DoubledGaugeField & UUU, + const FermionField &in, FermionField &out,int dag) +{ +#ifdef GRID_OMP + if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) + DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag); + else +#endif + DhopInternalSerialComms(st,lo,U,UUU,in,out,dag); +} + +template +void ImprovedStaggeredFermion5D::DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo, + DoubledGaugeField & U,DoubledGaugeField & UUU, + const FermionField &in, FermionField &out,int dag) +{ +#ifdef GRID_OMP + // assert((dag==DaggerNo) ||(dag==DaggerYes)); + + Compressor compressor; + + int LLs = in._grid->_rdimensions[0]; + int len = U._grid->oSites(); + + DhopFaceTime-=usecond(); + st.Prepare(); + st.HaloGather(in,compressor); + // st.HaloExchangeOptGather(in,compressor); // Wilson compressor + st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms + DhopFaceTime+=usecond(); + + double ctime=0; + double ptime=0; + + ////////////////////////////////////////////////////////////////////////////////////////////////////// + // Ugly explicit thread mapping introduced for OPA reasons. + ////////////////////////////////////////////////////////////////////////////////////////////////////// +#pragma omp parallel reduction(max:ctime) reduction(max:ptime) + { + int tid = omp_get_thread_num(); + int nthreads = omp_get_num_threads(); + int ncomms = CartesianCommunicator::nCommThreads; + if (ncomms == -1) ncomms = 1; + assert(nthreads > ncomms); + if (tid >= ncomms) { + double start = usecond(); + nthreads -= ncomms; + int ttid = tid - ncomms; + int n = U._grid->oSites(); // 4d vol + int chunk = n / nthreads; + int rem = n % nthreads; + int myblock, myn; + if (ttid < rem) { + myblock = ttid * chunk + ttid; + myn = chunk+1; + } else { + myblock = ttid*chunk + rem; + myn = chunk; + } + + // do the compute + if (dag == DaggerYes) { + for (int ss = myblock; ss < myblock+myn; ++ss) { + int sU = ss; + // Interior = 1; Exterior = 0; must implement for staggered + Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,1,0); //<--------- + } + } else { + for (int ss = myblock; ss < myblock+myn; ++ss) { + // Interior = 1; Exterior = 0; + int sU = ss; + Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,1,0); //<------------ + } + } + ptime = usecond() - start; + } else { + double start = usecond(); + st.CommunicateThreaded(); + ctime = usecond() - start; + } + } + DhopCommTime += ctime; + DhopComputeTime+=ptime; + + // First to enter, last to leave timing + st.CollateThreads(); + + DhopFaceTime-=usecond(); + st.CommsMerge(compressor); + DhopFaceTime+=usecond(); + + DhopComputeTime2-=usecond(); + if (dag == DaggerYes) { + int sz=st.surface_list.size(); + parallel_for (int ss = 0; ss < sz; ss++) { + int sU = st.surface_list[ss]; + Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,0,1); //<---------- + } + } else { + int sz=st.surface_list.size(); + parallel_for (int ss = 0; ss < sz; ss++) { + int sU = st.surface_list[ss]; + Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,0,1);//<---------- + } + } + DhopComputeTime2+=usecond(); +#else + assert(0); +#endif + +} + +template +void ImprovedStaggeredFermion5D::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo, + DoubledGaugeField & U,DoubledGaugeField & UUU, + const FermionField &in, FermionField &out,int dag) +{ + Compressor compressor; + int LLs = in._grid->_rdimensions[0]; + + + + //double t1=usecond(); + DhopTotalTime -= usecond(); + DhopCommTime -= usecond(); + st.HaloExchange(in,compressor); + DhopCommTime += usecond(); + + DhopComputeTime -= usecond(); + // Dhop takes the 4d grid from U, and makes a 5d index for fermion + if (dag == DaggerYes) { + parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) { + int sU=ss; + Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), LLs, sU,in, out); + } + } else { + parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) { + int sU=ss; + Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out); + } + } + DhopComputeTime += usecond(); + DhopTotalTime += usecond(); + //double t2=usecond(); + //std::cout << __FILE__ << " " << __func__ << " Total Time " << DhopTotalTime << std::endl; + //std::cout << __FILE__ << " " << __func__ << " Total Time Org " << t2-t1 << std::endl; + //std::cout << __FILE__ << " " << __func__ << " Comml Time " << DhopCommTime << std::endl; + //std::cout << __FILE__ << " " << __func__ << " Compute Time " << DhopComputeTime << std::endl; + +} +/*CHANGE END*/ + +/* ORG template void ImprovedStaggeredFermion5D::DhopInternal(StencilImpl & st, LebesgueOrder &lo, DoubledGaugeField & U,DoubledGaugeField & UUU, @@ -254,6 +464,7 @@ void ImprovedStaggeredFermion5D::DhopInternal(StencilImpl & st, LebesgueOr DhopComputeTime += usecond(); DhopTotalTime += usecond(); } +*/ template @@ -336,6 +547,9 @@ void ImprovedStaggeredFermion5D::ZeroCounters(void) DhopTotalTime = 0; DhopCommTime = 0; DhopComputeTime = 0; + DhopFaceTime = 0; + + Stencil.ZeroCounters(); StencilEven.ZeroCounters(); StencilOdd.ZeroCounters(); diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h index 22fb9e7dc..936d0cb7c 100644 --- a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h +++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h @@ -64,6 +64,8 @@ namespace QCD { double DhopCalls; double DhopCommTime; double DhopComputeTime; + double DhopComputeTime2; + double DhopFaceTime; /////////////////////////////////////////////////////////////// // Implement the abstract base @@ -119,7 +121,27 @@ namespace QCD { FermionField &out, int dag); + void DhopInternalOverlappedComms(StencilImpl & st, + LebesgueOrder &lo, + DoubledGaugeField &U, + DoubledGaugeField &UUU, + const FermionField &in, + FermionField &out, + int dag); + + void DhopInternalSerialComms(StencilImpl & st, + LebesgueOrder &lo, + DoubledGaugeField &U, + DoubledGaugeField &UUU, + const FermionField &in, + FermionField &out, + int dag); + + // Constructors + //////////////////////////////////////////////////////////////////////////////////////////////// + // Grid internal interface -- Thin link and fat link, with coefficients + //////////////////////////////////////////////////////////////////////////////////////////////// ImprovedStaggeredFermion5D(GaugeField &_Uthin, GaugeField &_Ufat, GridCartesian &FiveDimGrid, @@ -127,17 +149,37 @@ namespace QCD { GridCartesian &FourDimGrid, GridRedBlackCartesian &FourDimRedBlackGrid, double _mass, - RealD _c1=9.0/8.0, RealD _c2=-1.0/24.0,RealD _u0=1.0, + RealD _c1, RealD _c2,RealD _u0, const ImplParams &p= ImplParams()); - - // DoubleStore - void ImportGauge(const GaugeField &_U); - void ImportGauge(const GaugeField &_Uthin,const GaugeField &_Ufat); + //////////////////////////////////////////////////////////////////////////////////////////////// + // MILC constructor ; triple links, no rescale factors; must be externally pre multiplied + //////////////////////////////////////////////////////////////////////////////////////////////// + ImprovedStaggeredFermion5D(GridCartesian &FiveDimGrid, + GridRedBlackCartesian &FiveDimRedBlackGrid, + GridCartesian &FourDimGrid, + GridRedBlackCartesian &FourDimRedBlackGrid, + double _mass, + RealD _c1=1.0, RealD _c2=1.0,RealD _u0=1.0, + const ImplParams &p= ImplParams()); + + // DoubleStore gauge field in operator + void ImportGauge (const GaugeField &_Uthin ) { assert(0); } + void ImportGauge (const GaugeField &_Uthin ,const GaugeField &_Ufat); + void ImportGaugeSimple(const GaugeField &_UUU,const GaugeField &_U); + void ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U); + // Give a reference; can be used to do an assignment or copy back out after import + // if Carleton wants to cache them and not use the ImportSimple + DoubledGaugeField &GetU(void) { return Umu ; } ; + DoubledGaugeField &GetUUU(void) { return UUUmu; }; + void CopyGaugeCheckerboards(void); /////////////////////////////////////////////////////////////// // Data members require to support the functionality /////////////////////////////////////////////////////////////// public: + + virtual int isTrivialEE(void) { return 1; }; + virtual RealD Mass(void) { return mass; } GridBase *_FourDimGrid; GridBase *_FourDimRedBlackGrid; diff --git a/lib/qcd/action/fermion/StaggeredKernels.cc b/lib/qcd/action/fermion/StaggeredKernels.cc index b6ec14c77..b7e568c21 100644 --- a/lib/qcd/action/fermion/StaggeredKernels.cc +++ b/lib/qcd/action/fermion/StaggeredKernels.cc @@ -32,223 +32,241 @@ namespace Grid { namespace QCD { int StaggeredKernelsStatic::Opt= StaggeredKernelsStatic::OptGeneric; +int StaggeredKernelsStatic::Comms = StaggeredKernelsStatic::CommsAndCompute; + +#define GENERIC_STENCIL_LEG(U,Dir,skew,multLink) \ + SE = st.GetEntry(ptype, Dir+skew, sF); \ + if (SE->_is_local ) { \ + if (SE->_permute) { \ + chi_p = χ \ + permute(chi, in._odata[SE->_offset], ptype); \ + } else { \ + chi_p = &in._odata[SE->_offset]; \ + } \ + } else { \ + chi_p = &buf[SE->_offset]; \ + } \ + multLink(Uchi, U._odata[sU], *chi_p, Dir); + +#define GENERIC_STENCIL_LEG_INT(U,Dir,skew,multLink) \ + SE = st.GetEntry(ptype, Dir+skew, sF); \ + if (SE->_is_local ) { \ + if (SE->_permute) { \ + chi_p = χ \ + permute(chi, in._odata[SE->_offset], ptype); \ + } else { \ + chi_p = &in._odata[SE->_offset]; \ + } \ + } else if ( st.same_node[Dir] ) { \ + chi_p = &buf[SE->_offset]; \ + } \ + if (SE->_is_local || st.same_node[Dir] ) { \ + multLink(Uchi, U._odata[sU], *chi_p, Dir); \ + } + +#define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink) \ + SE = st.GetEntry(ptype, Dir+skew, sF); \ + if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \ + nmu++; \ + chi_p = &buf[SE->_offset]; \ + multLink(Uchi, U._odata[sU], *chi_p, Dir); \ + } template StaggeredKernels::StaggeredKernels(const ImplParams &p) : Base(p){}; -//////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////// // Generic implementation; move to different file? -//////////////////////////////////////////// - +// Int, Ext, Int+Ext cases for comms overlap +//////////////////////////////////////////////////////////////////////////////////// template -void StaggeredKernels::DhopSiteDepth(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - SiteSpinor *buf, int sF, - int sU, const FermionField &in, SiteSpinor &out,int threeLink) { +void StaggeredKernels::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, DoubledGaugeField &UUU, + SiteSpinor *buf, int LLs, int sU, + const FermionField &in, FermionField &out, int dag) { const SiteSpinor *chi_p; SiteSpinor chi; SiteSpinor Uchi; StencilEntry *SE; int ptype; - int skew = 0; - if (threeLink) skew=8; - /////////////////////////// - // Xp - /////////////////////////// + int skew; - SE = st.GetEntry(ptype, Xp+skew, sF); - if (SE->_is_local) { - if (SE->_permute) { - chi_p = χ - permute(chi, in._odata[SE->_offset], ptype); - } else { - chi_p = &in._odata[SE->_offset]; - } - } else { - chi_p = &buf[SE->_offset]; + for(int s=0;s_is_local) { - if (SE->_permute) { - chi_p = χ - permute(chi, in._odata[SE->_offset], ptype); - } else { - chi_p = &in._odata[SE->_offset]; - } - } else { - chi_p = &buf[SE->_offset]; - } - Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Yp); + /////////////////////////////////////////////////// + // Only contributions from interior of our node + /////////////////////////////////////////////////// +template +void StaggeredKernels::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, DoubledGaugeField &UUU, + SiteSpinor *buf, int LLs, int sU, + const FermionField &in, FermionField &out,int dag) { + const SiteSpinor *chi_p; + SiteSpinor chi; + SiteSpinor Uchi; + StencilEntry *SE; + int ptype; + int skew ; - /////////////////////////// - // Zp - /////////////////////////// - SE = st.GetEntry(ptype, Zp+skew, sF); - if (SE->_is_local) { - if (SE->_permute) { - chi_p = χ - permute(chi, in._odata[SE->_offset], ptype); - } else { - chi_p = &in._odata[SE->_offset]; + for(int s=0;s_offset]; + vstream(out._odata[sF], Uchi); } - Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Zp); +}; - /////////////////////////// - // Tp - /////////////////////////// - SE = st.GetEntry(ptype, Tp+skew, sF); - if (SE->_is_local) { - if (SE->_permute) { - chi_p = χ - permute(chi, in._odata[SE->_offset], ptype); - } else { - chi_p = &in._odata[SE->_offset]; - } - } else { - chi_p = &buf[SE->_offset]; - } - Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Tp); - /////////////////////////// - // Xm - /////////////////////////// - SE = st.GetEntry(ptype, Xm+skew, sF); - if (SE->_is_local) { - if (SE->_permute) { - chi_p = χ - permute(chi, in._odata[SE->_offset], ptype); - } else { - chi_p = &in._odata[SE->_offset]; - } - } else { - chi_p = &buf[SE->_offset]; - } - Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Xm); + /////////////////////////////////////////////////// + // Only contributions from exterior of our node + /////////////////////////////////////////////////// +template +void StaggeredKernels::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, DoubledGaugeField &UUU, + SiteSpinor *buf, int LLs, int sU, + const FermionField &in, FermionField &out,int dag) { + const SiteSpinor *chi_p; + SiteSpinor chi; + SiteSpinor Uchi; + StencilEntry *SE; + int ptype; + int nmu=0; + int skew ; - /////////////////////////// - // Ym - /////////////////////////// - SE = st.GetEntry(ptype, Ym+skew, sF); - if (SE->_is_local) { - if (SE->_permute) { - chi_p = χ - permute(chi, in._odata[SE->_offset], ptype); - } else { - chi_p = &in._odata[SE->_offset]; - } - } else { - chi_p = &buf[SE->_offset]; - } - Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Ym); + for(int s=0;s_is_local) { - if (SE->_permute) { - chi_p = χ - permute(chi, in._odata[SE->_offset], ptype); - } else { - chi_p = &in._odata[SE->_offset]; + if ( nmu ) { + if ( dag ) { + out._odata[sF] = out._odata[sF] - Uchi; + } else { + out._odata[sF] = out._odata[sF] + Uchi; + } } - } else { - chi_p = &buf[SE->_offset]; } - Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Zm); - - /////////////////////////// - // Tm - /////////////////////////// - SE = st.GetEntry(ptype, Tm+skew, sF); - if (SE->_is_local) { - if (SE->_permute) { - chi_p = χ - permute(chi, in._odata[SE->_offset], ptype); - } else { - chi_p = &in._odata[SE->_offset]; - } - } else { - chi_p = &buf[SE->_offset]; - } - Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Tm); - - vstream(out, Uchi); }; +//////////////////////////////////////////////////////////////////////////////////// +// Driving / wrapping routine to select right kernel +//////////////////////////////////////////////////////////////////////////////////// + template void StaggeredKernels::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, - SiteSpinor *buf, int LLs, int sU, - const FermionField &in, FermionField &out) { - SiteSpinor naik; - SiteSpinor naive; - int oneLink =0; - int threeLink=1; + SiteSpinor *buf, int LLs, int sU, + const FermionField &in, FermionField &out, + int interior,int exterior) +{ int dag=1; - switch(Opt) { -#ifdef AVX512 - //FIXME; move the sign into the Asm routine - case OptInlineAsm: - DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out); - for(int s=0;s=0); assert(sU>=0); - DhopSiteDepth(st,lo,U,buf,sF,sU,in,naive,oneLink); - DhopSiteDepth(st,lo,UUU,buf,sF,sU,in,naik,threeLink); - out._odata[sF] =naive+naik; + if ( interior && exterior ) { + DhopSiteGeneric (st,lo,U,UUU,buf,LLs,sU,in,out,dag); + } else if ( interior ) { + DhopSiteGenericInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag); + } else if ( exterior ) { + DhopSiteGenericExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag); } break; default: diff --git a/lib/qcd/action/fermion/StaggeredKernels.h b/lib/qcd/action/fermion/StaggeredKernels.h index a45214d3a..79de1a684 100644 --- a/lib/qcd/action/fermion/StaggeredKernels.h +++ b/lib/qcd/action/fermion/StaggeredKernels.h @@ -38,8 +38,9 @@ namespace QCD { class StaggeredKernelsStatic { public: enum { OptGeneric, OptHandUnroll, OptInlineAsm }; - // S-direction is INNERMOST and takes no part in the parity. - static int Opt; // these are a temporary hack + enum { CommsAndCompute, CommsThenCompute }; + static int Opt; + static int Comms; }; template class StaggeredKernels : public FermionOperator , public StaggeredKernelsStatic { @@ -53,24 +54,62 @@ template class StaggeredKernels : public FermionOperator , pub void DhopDir(StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU, SiteSpinor * buf, int sF, int sU, const FermionField &in, FermionField &out, int dir,int disp); - void DhopSiteDepth(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteSpinor * buf, - int sF, int sU, const FermionField &in, SiteSpinor &out,int threeLink); - - - void DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteSpinor * buf, - int sF, int sU, const FermionField &in, SiteSpinor&out,int threeLink); - - void DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,SiteSpinor * buf, - int LLs, int sU, const FermionField &in, FermionField &out, int dag); - - void DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, SiteSpinor * buf, - int LLs, int sU, const FermionField &in, FermionField &out); - - void DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, SiteSpinor * buf, - int sF, int sU, const FermionField &in, FermionField &out); - - void DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, SiteSpinor *buf, - int LLs, int sU, const FermionField &in, FermionField &out); + /////////////////////////////////////////////////////////////////////////////////////// + // Generic Nc kernels + /////////////////////////////////////////////////////////////////////////////////////// + void DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, DoubledGaugeField &UUU, + SiteSpinor * buf, int LLs, int sU, + const FermionField &in, FermionField &out,int dag); + void DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, DoubledGaugeField &UUU, + SiteSpinor * buf, int LLs, int sU, + const FermionField &in, FermionField &out,int dag); + void DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, DoubledGaugeField &UUU, + SiteSpinor * buf, int LLs, int sU, + const FermionField &in, FermionField &out,int dag); + + /////////////////////////////////////////////////////////////////////////////////////// + // Nc=3 specific kernels + /////////////////////////////////////////////////////////////////////////////////////// + void DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U,DoubledGaugeField &UUU, + SiteSpinor * buf, int LLs, int sU, + const FermionField &in, FermionField &out,int dag); + void DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U,DoubledGaugeField &UUU, + SiteSpinor * buf, int LLs, int sU, + const FermionField &in, FermionField &out,int dag); + void DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U,DoubledGaugeField &UUU, + SiteSpinor * buf, int LLs, int sU, + const FermionField &in, FermionField &out,int dag); + + /////////////////////////////////////////////////////////////////////////////////////// + // Asm Nc=3 specific kernels + /////////////////////////////////////////////////////////////////////////////////////// + void DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U,DoubledGaugeField &UUU, + SiteSpinor * buf, int LLs, int sU, + const FermionField &in, FermionField &out,int dag); + /////////////////////////////////////////////////////////////////////////////////////////////////// + // Generic interface; fan out to right routine + /////////////////////////////////////////////////////////////////////////////////////////////////// + void DhopSite(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, DoubledGaugeField &UUU, + SiteSpinor * buf, int LLs, int sU, + const FermionField &in, FermionField &out, int interior=1,int exterior=1); + + void DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, DoubledGaugeField &UUU, + SiteSpinor * buf, int LLs, int sU, + const FermionField &in, FermionField &out, int interior=1,int exterior=1); + + void DhopSite(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, DoubledGaugeField &UUU, + SiteSpinor * buf, int LLs, int sU, + const FermionField &in, FermionField &out, int dag, int interior,int exterior); public: diff --git a/lib/qcd/action/fermion/StaggeredKernelsAsm.cc b/lib/qcd/action/fermion/StaggeredKernelsAsm.cc index fd881716c..990ac1261 100644 --- a/lib/qcd/action/fermion/StaggeredKernelsAsm.cc +++ b/lib/qcd/action/fermion/StaggeredKernelsAsm.cc @@ -560,16 +560,53 @@ Author: paboyle VSTORE(2,%0,pUChi_02) \ : : "r" (out) : "memory" ); +#define nREDUCE(out) \ + asm ( \ + VADD(UChi_00,UChi_10,UChi_00) \ + VADD(UChi_01,UChi_11,UChi_01) \ + VADD(UChi_02,UChi_12,UChi_02) \ + VADD(UChi_30,UChi_20,UChi_30) \ + VADD(UChi_31,UChi_21,UChi_31) \ + VADD(UChi_32,UChi_22,UChi_32) \ + VADD(UChi_00,UChi_30,UChi_00) \ + VADD(UChi_01,UChi_31,UChi_01) \ + VADD(UChi_02,UChi_32,UChi_02) ); \ + asm (VZERO(Chi_00) \ + VSUB(UChi_00,Chi_00,UChi_00) \ + VSUB(UChi_01,Chi_00,UChi_01) \ + VSUB(UChi_02,Chi_00,UChi_02) ); \ + asm ( \ + VSTORE(0,%0,pUChi_00) \ + VSTORE(1,%0,pUChi_01) \ + VSTORE(2,%0,pUChi_02) \ + : : "r" (out) : "memory" ); + #define REDUCEa(out) \ asm ( \ VADD(UChi_00,UChi_10,UChi_00) \ VADD(UChi_01,UChi_11,UChi_01) \ VADD(UChi_02,UChi_12,UChi_02) ); \ + asm ( \ + VSTORE(0,%0,pUChi_00) \ + VSTORE(1,%0,pUChi_01) \ + VSTORE(2,%0,pUChi_02) \ + : : "r" (out) : "memory" ); + +// FIXME is sign right in the VSUB ? +#define nREDUCEa(out) \ asm ( \ - VSTORE(0,%0,pUChi_00) \ - VSTORE(1,%0,pUChi_01) \ - VSTORE(2,%0,pUChi_02) \ - : : "r" (out) : "memory" ); + VADD(UChi_00,UChi_10,UChi_00) \ + VADD(UChi_01,UChi_11,UChi_01) \ + VADD(UChi_02,UChi_12,UChi_02) ); \ + asm (VZERO(Chi_00) \ + VSUB(UChi_00,Chi_00,UChi_00) \ + VSUB(UChi_01,Chi_00,UChi_01) \ + VSUB(UChi_02,Chi_00,UChi_02) ); \ + asm ( \ + VSTORE(0,%0,pUChi_00) \ + VSTORE(1,%0,pUChi_01) \ + VSTORE(2,%0,pUChi_02) \ + : : "r" (out) : "memory" ); #define PERMUTE_DIR(dir) \ permute##dir(Chi_0,Chi_0);\ @@ -581,10 +618,9 @@ namespace QCD { template void StaggeredKernels::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, - DoubledGaugeField &U, - DoubledGaugeField &UUU, - SiteSpinor *buf, int LLs, - int sU, const FermionField &in, FermionField &out) + DoubledGaugeField &U, DoubledGaugeField &UUU, + SiteSpinor *buf, int LLs, int sU, + const FermionField &in, FermionField &out,int dag) { assert(0); }; @@ -645,10 +681,9 @@ void StaggeredKernels::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, // This is the single precision 5th direction vectorised kernel #include template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, - DoubledGaugeField &U, - DoubledGaugeField &UUU, - SiteSpinor *buf, int LLs, - int sU, const FermionField &in, FermionField &out) + DoubledGaugeField &U, DoubledGaugeField &UUU, + SiteSpinor *buf, int LLs, int sU, + const FermionField &in, FermionField &out,int dag) { #ifdef AVX512 uint64_t gauge0,gauge1,gauge2,gauge3; @@ -685,7 +720,11 @@ template <> void StaggeredKernels::DhopSiteAsm(StencilImpl MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3); addr0 = (uint64_t) &out._odata[sF]; - REDUCE(addr0); + if ( dag ) { + nREDUCE(addr0); + } else { + REDUCE(addr0); + } } #else assert(0); @@ -695,10 +734,9 @@ template <> void StaggeredKernels::DhopSiteAsm(StencilImpl #include template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, - DoubledGaugeField &U, - DoubledGaugeField &UUU, - SiteSpinor *buf, int LLs, - int sU, const FermionField &in, FermionField &out) + DoubledGaugeField &U, DoubledGaugeField &UUU, + SiteSpinor *buf, int LLs, int sU, + const FermionField &in, FermionField &out,int dag) { #ifdef AVX512 uint64_t gauge0,gauge1,gauge2,gauge3; @@ -734,7 +772,11 @@ template <> void StaggeredKernels::DhopSiteAsm(StencilImpl MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3); addr0 = (uint64_t) &out._odata[sF]; - REDUCE(addr0); + if ( dag ) { + nREDUCE(addr0); + } else { + REDUCE(addr0); + } } #else assert(0); @@ -776,10 +818,9 @@ template <> void StaggeredKernels::DhopSiteAsm(StencilImpl #include template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, - DoubledGaugeField &U, - DoubledGaugeField &UUU, - SiteSpinor *buf, int LLs, - int sU, const FermionField &in, FermionField &out) + DoubledGaugeField &U, DoubledGaugeField &UUU, + SiteSpinor *buf, int LLs, int sU, + const FermionField &in, FermionField &out,int dag) { #ifdef AVX512 uint64_t gauge0,gauge1,gauge2,gauge3; @@ -832,7 +873,11 @@ template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, MULT_ADD_XYZT(gauge2,gauge3); addr0 = (uint64_t) &out._odata[sF]; - REDUCEa(addr0); + if ( dag ) { + nREDUCEa(addr0); + } else { + REDUCEa(addr0); + } } #else assert(0); @@ -841,10 +886,9 @@ template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, #include template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, - DoubledGaugeField &U, - DoubledGaugeField &UUU, - SiteSpinor *buf, int LLs, - int sU, const FermionField &in, FermionField &out) + DoubledGaugeField &U, DoubledGaugeField &UUU, + SiteSpinor *buf, int LLs, int sU, + const FermionField &in, FermionField &out,int dag) { #ifdef AVX512 uint64_t gauge0,gauge1,gauge2,gauge3; @@ -897,7 +941,11 @@ template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, MULT_ADD_XYZT(gauge2,gauge3); addr0 = (uint64_t) &out._odata[sF]; - REDUCEa(addr0); + if ( dag ) { + nREDUCEa(addr0); + } else { + REDUCEa(addr0); + } } #else assert(0); @@ -909,7 +957,7 @@ template <> void StaggeredKernels::DhopSiteAsm(StencilImpl &st, DoubledGaugeField &U, \ DoubledGaugeField &UUU, \ SiteSpinor *buf, int LLs, \ - int sU, const FermionField &in, FermionField &out); + int sU, const FermionField &in, FermionField &out,int dag); KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplD); KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplF); diff --git a/lib/qcd/action/fermion/StaggeredKernelsHand.cc b/lib/qcd/action/fermion/StaggeredKernelsHand.cc index 7de8480cc..47ebdd86b 100644 --- a/lib/qcd/action/fermion/StaggeredKernelsHand.cc +++ b/lib/qcd/action/fermion/StaggeredKernelsHand.cc @@ -28,7 +28,6 @@ Author: paboyle /* END LEGAL */ #include -#define REGISTER #define LOAD_CHI(b) \ const SiteSpinor & ref (b[offset]); \ @@ -59,7 +58,7 @@ Author: paboyle UChi ## _1 += U_12*Chi_2;\ UChi ## _2 += U_22*Chi_2; -#define MULT_ADD(A,UChi) \ +#define MULT_ADD(U,A,UChi) \ auto & ref(U._odata[sU](A)); \ Impl::loadLinkElement(U_00,ref()(0,0)); \ Impl::loadLinkElement(U_10,ref()(1,0)); \ @@ -82,241 +81,319 @@ Author: paboyle #define PERMUTE_DIR(dir) \ - permute##dir(Chi_0,Chi_0);\ - permute##dir(Chi_1,Chi_1);\ - permute##dir(Chi_2,Chi_2); + permute##dir(Chi_0,Chi_0); \ + permute##dir(Chi_1,Chi_1); \ + permute##dir(Chi_2,Chi_2); + + +#define HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \ + SE=st.GetEntry(ptype,Dir+skew,sF); \ + offset = SE->_offset; \ + local = SE->_is_local; \ + perm = SE->_permute; \ + if ( local ) { \ + LOAD_CHI(in._odata); \ + if ( perm) { \ + PERMUTE_DIR(Perm); \ + } \ + } else { \ + LOAD_CHI(buf); \ + } + +#define HAND_STENCIL_LEG_BEGIN(Dir,Perm,skew,even) \ + HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \ + { \ + MULT(Dir,even); \ + } + +#define HAND_STENCIL_LEG(U,Dir,Perm,skew,even) \ + HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \ + { \ + MULT_ADD(U,Dir,even); \ + } + + + +#define HAND_STENCIL_LEG_INT(U,Dir,Perm,skew,even) \ + SE=st.GetEntry(ptype,Dir+skew,sF); \ + offset = SE->_offset; \ + local = SE->_is_local; \ + perm = SE->_permute; \ + if ( local ) { \ + LOAD_CHI(in._odata); \ + if ( perm) { \ + PERMUTE_DIR(Perm); \ + } \ + } else if ( st.same_node[Dir] ) { \ + LOAD_CHI(buf); \ + } \ + if (SE->_is_local || st.same_node[Dir] ) { \ + MULT_ADD(U,Dir,even); \ + } + +#define HAND_STENCIL_LEG_EXT(U,Dir,Perm,skew,even) \ + SE=st.GetEntry(ptype,Dir+skew,sF); \ + offset = SE->_offset; \ + local = SE->_is_local; \ + perm = SE->_permute; \ + if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \ + nmu++; \ + { LOAD_CHI(buf); } \ + { MULT_ADD(U,Dir,even); } \ + } namespace Grid { namespace QCD { template -void StaggeredKernels::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, - SiteSpinor *buf, int LLs, - int sU, const FermionField &in, FermionField &out, int dag) +void StaggeredKernels::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U,DoubledGaugeField &UUU, + SiteSpinor *buf, int LLs, int sU, + const FermionField &in, FermionField &out,int dag) { - SiteSpinor naik; - SiteSpinor naive; - int oneLink =0; - int threeLink=1; - int skew(0); - Real scale(1.0); - - if(dag) scale = -1.0; + typedef typename Simd::scalar_type S; + typedef typename Simd::vector_type V; + + Simd even_0; // 12 regs on knc + Simd even_1; + Simd even_2; + Simd odd_0; // 12 regs on knc + Simd odd_1; + Simd odd_2; + + Simd Chi_0; // two spinor; 6 regs + Simd Chi_1; + Simd Chi_2; + Simd U_00; // two rows of U matrix + Simd U_10; + Simd U_20; + Simd U_01; + Simd U_11; + Simd U_21; // 2 reg left. + Simd U_02; + Simd U_12; + Simd U_22; + + SiteSpinor result; + int offset,local,perm, ptype; + + StencilEntry *SE; + int skew; + for(int s=0;s -void StaggeredKernels::DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - SiteSpinor *buf, int sF, - int sU, const FermionField &in, SiteSpinor &out,int threeLink) +void StaggeredKernels::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, DoubledGaugeField &UUU, + SiteSpinor *buf, int LLs, int sU, + const FermionField &in, FermionField &out,int dag) { typedef typename Simd::scalar_type S; typedef typename Simd::vector_type V; - REGISTER Simd even_0; // 12 regs on knc - REGISTER Simd even_1; - REGISTER Simd even_2; - REGISTER Simd odd_0; // 12 regs on knc - REGISTER Simd odd_1; - REGISTER Simd odd_2; - - REGISTER Simd Chi_0; // two spinor; 6 regs - REGISTER Simd Chi_1; - REGISTER Simd Chi_2; - - REGISTER Simd U_00; // two rows of U matrix - REGISTER Simd U_10; - REGISTER Simd U_20; - REGISTER Simd U_01; - REGISTER Simd U_11; - REGISTER Simd U_21; // 2 reg left. - REGISTER Simd U_02; - REGISTER Simd U_12; - REGISTER Simd U_22; - - int skew = 0; - if (threeLink) skew=8; + Simd even_0; // 12 regs on knc + Simd even_1; + Simd even_2; + Simd odd_0; // 12 regs on knc + Simd odd_1; + Simd odd_2; + Simd Chi_0; // two spinor; 6 regs + Simd Chi_1; + Simd Chi_2; + + Simd U_00; // two rows of U matrix + Simd U_10; + Simd U_20; + Simd U_01; + Simd U_11; + Simd U_21; // 2 reg left. + Simd U_02; + Simd U_12; + Simd U_22; + + SiteSpinor result; int offset,local,perm, ptype; + StencilEntry *SE; + int skew; - // Xp - SE=st.GetEntry(ptype,Xp+skew,sF); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHI(in._odata); - if ( perm) { - PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI(buf); - } - { - MULT(Xp,even); - } - - // Yp - SE=st.GetEntry(ptype,Yp+skew,sF); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHI(in._odata); - if ( perm) { - PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... + for(int s=0;s_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHI(in._odata); - if ( perm) { - PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI(buf); - } - { - MULT_ADD(Zp,even); - } +template +void StaggeredKernels::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, DoubledGaugeField &UUU, + SiteSpinor *buf, int LLs, int sU, + const FermionField &in, FermionField &out,int dag) +{ + typedef typename Simd::scalar_type S; + typedef typename Simd::vector_type V; - // Tp - SE=st.GetEntry(ptype,Tp+skew,sF); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHI(in._odata); - if ( perm) { - PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI(buf); - } - { - MULT_ADD(Tp,odd); - } - - // Xm - SE=st.GetEntry(ptype,Xm+skew,sF); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHI(in._odata); - if ( perm) { - PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI(buf); - } - { - MULT_ADD(Xm,even); - } - - - // Ym - SE=st.GetEntry(ptype,Ym+skew,sF); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHI(in._odata); - if ( perm) { - PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI(buf); - } - { - MULT_ADD(Ym,odd); - } + Simd even_0; // 12 regs on knc + Simd even_1; + Simd even_2; + Simd odd_0; // 12 regs on knc + Simd odd_1; + Simd odd_2; - // Zm - SE=st.GetEntry(ptype,Zm+skew,sF); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; + Simd Chi_0; // two spinor; 6 regs + Simd Chi_1; + Simd Chi_2; - if ( local ) { - LOAD_CHI(in._odata); - if ( perm) { - PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI(buf); - } - { - MULT_ADD(Zm,even); - } + Simd U_00; // two rows of U matrix + Simd U_10; + Simd U_20; + Simd U_01; + Simd U_11; + Simd U_21; // 2 reg left. + Simd U_02; + Simd U_12; + Simd U_22; + + SiteSpinor result; + int offset,local,perm, ptype; - // Tm - SE=st.GetEntry(ptype,Tm+skew,sF); - offset = SE->_offset; - local = SE->_is_local; - perm = SE->_permute; - - if ( local ) { - LOAD_CHI(in._odata); - if ( perm) { - PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... - } - } else { - LOAD_CHI(buf); - } - { - MULT_ADD(Tm,odd); - } + StencilEntry *SE; + int skew; - vstream(out()()(0),even_0+odd_0); - vstream(out()()(1),even_1+odd_1); - vstream(out()()(2),even_2+odd_2); + for(int s=0;s::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \ DoubledGaugeField &U,DoubledGaugeField &UUU, \ - SiteSpinor *buf, int LLs, \ - int sU, const FermionField &in, FermionField &out, int dag); + SiteSpinor *buf, int LLs, int sU, \ + const FermionField &in, FermionField &out, int dag); \ + \ + template void StaggeredKernels::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, \ + DoubledGaugeField &U,DoubledGaugeField &UUU, \ + SiteSpinor *buf, int LLs, int sU, \ + const FermionField &in, FermionField &out, int dag); \ + \ + template void StaggeredKernels::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, \ + DoubledGaugeField &U,DoubledGaugeField &UUU, \ + SiteSpinor *buf, int LLs, int sU, \ + const FermionField &in, FermionField &out, int dag); \ -#define DHOP_SITE_DEPTH_HAND_INSTANTIATE(IMPL) \ - template void StaggeredKernels::DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, \ - SiteSpinor *buf, int sF, \ - int sU, const FermionField &in, SiteSpinor &out,int threeLink) ; DHOP_SITE_HAND_INSTANTIATE(StaggeredImplD); DHOP_SITE_HAND_INSTANTIATE(StaggeredImplF); DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplD); DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplF); -DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredImplD); -DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredImplF); -DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredVec5dImplD); -DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredVec5dImplF); -}} +} +} + diff --git a/lib/qcd/action/fermion/WilsonCompressor.h b/lib/qcd/action/fermion/WilsonCompressor.h index 6ec2ede90..7553743e6 100644 --- a/lib/qcd/action/fermion/WilsonCompressor.h +++ b/lib/qcd/action/fermion/WilsonCompressor.h @@ -274,41 +274,16 @@ class WilsonStencil : public CartesianStencil { if ( timer4 ) std::cout << GridLogMessage << " timer4 " < same_node; - std::vector surface_list; - WilsonStencil(GridBase *grid, int npoints, int checkerboard, const std::vector &directions, const std::vector &distances) - : CartesianStencil (grid,npoints,checkerboard,directions,distances) , - same_node(npoints) + : CartesianStencil (grid,npoints,checkerboard,directions,distances) { ZeroCountersi(); - surface_list.resize(0); }; - void BuildSurfaceList(int Ls,int vol4){ - - // find same node for SHM - // Here we know the distance is 1 for WilsonStencil - for(int point=0;point_npoints;point++){ - same_node[point] = this->SameNode(point); - } - - for(int site = 0 ;site< vol4;site++){ - int local = 1; - for(int point=0;point_npoints;point++){ - if( (!this->GetNodeLocal(site*Ls,point)) && (!same_node[point]) ){ - local = 0; - } - } - if(local == 0) { - surface_list.push_back(site); - } - } - } template < class compressor> void HaloExchangeOpt(const Lattice &source,compressor &compress) @@ -369,23 +344,23 @@ class WilsonStencil : public CartesianStencil { int dag = compress.dag; int face_idx=0; if ( dag ) { - assert(same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx)); - assert(same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx)); - assert(same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx)); - assert(same_node[Tp]==this->HaloGatherDir(source,TpCompress,Tp,face_idx)); - assert(same_node[Xm]==this->HaloGatherDir(source,XmCompress,Xm,face_idx)); - assert(same_node[Ym]==this->HaloGatherDir(source,YmCompress,Ym,face_idx)); - assert(same_node[Zm]==this->HaloGatherDir(source,ZmCompress,Zm,face_idx)); - assert(same_node[Tm]==this->HaloGatherDir(source,TmCompress,Tm,face_idx)); + assert(this->same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx)); + assert(this->same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx)); + assert(this->same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx)); + assert(this->same_node[Tp]==this->HaloGatherDir(source,TpCompress,Tp,face_idx)); + assert(this->same_node[Xm]==this->HaloGatherDir(source,XmCompress,Xm,face_idx)); + assert(this->same_node[Ym]==this->HaloGatherDir(source,YmCompress,Ym,face_idx)); + assert(this->same_node[Zm]==this->HaloGatherDir(source,ZmCompress,Zm,face_idx)); + assert(this->same_node[Tm]==this->HaloGatherDir(source,TmCompress,Tm,face_idx)); } else { - assert(same_node[Xp]==this->HaloGatherDir(source,XmCompress,Xp,face_idx)); - assert(same_node[Yp]==this->HaloGatherDir(source,YmCompress,Yp,face_idx)); - assert(same_node[Zp]==this->HaloGatherDir(source,ZmCompress,Zp,face_idx)); - assert(same_node[Tp]==this->HaloGatherDir(source,TmCompress,Tp,face_idx)); - assert(same_node[Xm]==this->HaloGatherDir(source,XpCompress,Xm,face_idx)); - assert(same_node[Ym]==this->HaloGatherDir(source,YpCompress,Ym,face_idx)); - assert(same_node[Zm]==this->HaloGatherDir(source,ZpCompress,Zm,face_idx)); - assert(same_node[Tm]==this->HaloGatherDir(source,TpCompress,Tm,face_idx)); + assert(this->same_node[Xp]==this->HaloGatherDir(source,XmCompress,Xp,face_idx)); + assert(this->same_node[Yp]==this->HaloGatherDir(source,YmCompress,Yp,face_idx)); + assert(this->same_node[Zp]==this->HaloGatherDir(source,ZmCompress,Zp,face_idx)); + assert(this->same_node[Tp]==this->HaloGatherDir(source,TmCompress,Tp,face_idx)); + assert(this->same_node[Xm]==this->HaloGatherDir(source,XpCompress,Xm,face_idx)); + assert(this->same_node[Ym]==this->HaloGatherDir(source,YpCompress,Ym,face_idx)); + assert(this->same_node[Zm]==this->HaloGatherDir(source,ZpCompress,Zm,face_idx)); + assert(this->same_node[Tm]==this->HaloGatherDir(source,TpCompress,Tm,face_idx)); } this->face_table_computed=1; assert(this->u_comm_offset==this->_unified_buffer_size); diff --git a/lib/qcd/action/fermion/WilsonFermion.cc b/lib/qcd/action/fermion/WilsonFermion.cc index dfaa67586..bd9b1ee9f 100644 --- a/lib/qcd/action/fermion/WilsonFermion.cc +++ b/lib/qcd/action/fermion/WilsonFermion.cc @@ -348,15 +348,98 @@ void WilsonFermion::DhopDirDisp(const FermionField &in, FermionField &out, parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) { Kernels::DhopDir(Stencil, Umu, Stencil.CommBuf(), sss, sss, in, out, dirdisp, gamma); } -}; - +} +/*Change starts*/ template void WilsonFermion::DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, const FermionField &in, FermionField &out, int dag) { +#ifdef GRID_OMP + if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) + DhopInternalOverlappedComms(st,lo,U,in,out,dag); + else +#endif + DhopInternalSerial(st,lo,U,in,out,dag); + +} + +template +void WilsonFermion::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, + const FermionField &in, + FermionField &out, int dag) { assert((dag == DaggerNo) || (dag == DaggerYes)); +#ifdef GRID_OMP + Compressor compressor; + int len = U._grid->oSites(); + const int LLs = 1; + + st.Prepare(); + st.HaloGather(in,compressor); + st.CommsMergeSHM(compressor); +#pragma omp parallel + { + int tid = omp_get_thread_num(); + int nthreads = omp_get_num_threads(); + int ncomms = CartesianCommunicator::nCommThreads; + if (ncomms == -1) ncomms = 1; + assert(nthreads > ncomms); + if (tid >= ncomms) { + nthreads -= ncomms; + int ttid = tid - ncomms; + int n = len; + int chunk = n / nthreads; + int rem = n % nthreads; + int myblock, myn; + if (ttid < rem) { + myblock = ttid * chunk + ttid; + myn = chunk+1; + } else { + myblock = ttid*chunk + rem; + myn = chunk; + } + // do the compute + if (dag == DaggerYes) { + + for (int sss = myblock; sss < myblock+myn; ++sss) { + Kernels::DhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out); + } + } else { + for (int sss = myblock; sss < myblock+myn; ++sss) { + Kernels::DhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out); + } + } //else + + } else { + st.CommunicateThreaded(); + } + + Compressor compressor(dag); + + if (dag == DaggerYes) { + parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) { + Kernels::DhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out); + } + } else { + parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) { + Kernels::DhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out); + } + } + } //pragma +#else + assert(0); +#endif +}; + + +template +void WilsonFermion::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, + const FermionField &in, + FermionField &out, int dag) { + assert((dag == DaggerNo) || (dag == DaggerYes)); Compressor compressor(dag); st.HaloExchange(in, compressor); @@ -370,6 +453,7 @@ void WilsonFermion::DhopInternal(StencilImpl &st, LebesgueOrder &lo, } } }; +/*Change ends */ /******************************************************************************* * Conserved current utilities for Wilson fermions, for contracting propagators diff --git a/lib/qcd/action/fermion/WilsonFermion.h b/lib/qcd/action/fermion/WilsonFermion.h index c0db827e2..28e009a77 100644 --- a/lib/qcd/action/fermion/WilsonFermion.h +++ b/lib/qcd/action/fermion/WilsonFermion.h @@ -130,6 +130,12 @@ class WilsonFermion : public WilsonKernels, public WilsonFermionStatic { void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, const FermionField &in, FermionField &out, int dag); + void DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, + const FermionField &in, FermionField &out, int dag); + + void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, + const FermionField &in, FermionField &out, int dag); + // Constructor WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, RealD _mass, @@ -145,6 +151,8 @@ class WilsonFermion : public WilsonKernels, public WilsonFermionStatic { // protected: public: + virtual RealD Mass(void) { return mass; } + virtual int isTrivialEE(void) { return 1; }; RealD mass; RealD diag_mass; diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index 3e58fed6c..aa046d053 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -445,8 +445,7 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg } } ptime = usecond() - start; - } - { + } else { double start = usecond(); st.CommunicateThreaded(); ctime = usecond() - start; diff --git a/lib/qcd/action/fermion/WilsonKernels.h b/lib/qcd/action/fermion/WilsonKernels.h index 2369c98dd..2b1508964 100644 --- a/lib/qcd/action/fermion/WilsonKernels.h +++ b/lib/qcd/action/fermion/WilsonKernels.h @@ -53,7 +53,7 @@ template class WilsonKernels : public FermionOperator , public typedef FermionOperator Base; public: - + template typename std::enable_if::type DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, @@ -70,27 +70,27 @@ template class WilsonKernels : public FermionOperator , public break; #endif case OptHandUnroll: - for (int site = 0; site < Ns; site++) { - for (int s = 0; s < Ls; s++) { - if(interior&&exterior) WilsonKernels::HandDhopSite(st,lo,U,buf,sF,sU,in,out); - else if (interior) WilsonKernels::HandDhopSiteInt(st,lo,U,buf,sF,sU,in,out); - else if (exterior) WilsonKernels::HandDhopSiteExt(st,lo,U,buf,sF,sU,in,out); - sF++; - } - sU++; - } + for (int site = 0; site < Ns; site++) { + for (int s = 0; s < Ls; s++) { + if(interior&&exterior) WilsonKernels::HandDhopSite(st,lo,U,buf,sF,sU,in,out); + else if (interior) WilsonKernels::HandDhopSiteInt(st,lo,U,buf,sF,sU,in,out); + else if (exterior) WilsonKernels::HandDhopSiteExt(st,lo,U,buf,sF,sU,in,out); + sF++; + } + sU++; + } break; case OptGeneric: - for (int site = 0; site < Ns; site++) { - for (int s = 0; s < Ls; s++) { - if(interior&&exterior) WilsonKernels::GenericDhopSite(st,lo,U,buf,sF,sU,in,out); - else if (interior) WilsonKernels::GenericDhopSiteInt(st,lo,U,buf,sF,sU,in,out); - else if (exterior) WilsonKernels::GenericDhopSiteExt(st,lo,U,buf,sF,sU,in,out); - else assert(0); - sF++; - } - sU++; - } + for (int site = 0; site < Ns; site++) { + for (int s = 0; s < Ls; s++) { + if(interior&&exterior) WilsonKernels::GenericDhopSite(st,lo,U,buf,sF,sU,in,out); + else if (interior) WilsonKernels::GenericDhopSiteInt(st,lo,U,buf,sF,sU,in,out); + else if (exterior) WilsonKernels::GenericDhopSiteExt(st,lo,U,buf,sF,sU,in,out); + else assert(0); + sF++; + } + sU++; + } break; default: assert(0); @@ -232,6 +232,7 @@ template class WilsonKernels : public FermionOperator , public void GenericDhopSiteDagExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, int sF, int sU, const FermionField &in, FermionField &out); + void AsmDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out); diff --git a/lib/stencil/Stencil.h b/lib/stencil/Stencil.h index a79064eba..57952a261 100644 --- a/lib/stencil/Stencil.h +++ b/lib/stencil/Stencil.h @@ -150,7 +150,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal std::vector _distances; std::vector _comm_buf_size; std::vector _permute_type; - + std::vector same_node; + std::vector surface_list; + Vector _entries; std::vector Packets; std::vector Mergers; @@ -201,7 +203,7 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal int dimension = _directions[point]; int displacement = _distances[point]; - assert( (displacement==1) || (displacement==-1)); + int pd = _grid->_processors[dimension]; int fd = _grid->_fdimensions[dimension]; @@ -216,9 +218,12 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal if ( ! comm_dim ) return 1; int nbr_proc; - if (displacement==1) nbr_proc = 1; - else nbr_proc = pd-1; + if (displacement>0) nbr_proc = 1; + else nbr_proc = pd-1; + // FIXME this logic needs to be sorted for three link term + // assert( (displacement==1) || (displacement==-1)); + // Present hack only works for >= 4^4 subvol per node _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); void *shm = (void *) _grid->ShmBufferTranslate(recv_from_rank,u_recv_buf_p); @@ -539,6 +544,29 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal } }; + // Move interior/exterior split into the generic stencil + // FIXME Explicit Ls in interface is a pain. Should just use a vol + void BuildSurfaceList(int Ls,int vol4){ + + // find same node for SHM + // Here we know the distance is 1 for WilsonStencil + for(int point=0;point_npoints;point++){ + same_node[point] = this->SameNode(point); + } + + for(int site = 0 ;site< vol4;site++){ + int local = 1; + for(int point=0;point_npoints;point++){ + if( (!this->GetNodeLocal(site*Ls,point)) && (!same_node[point]) ){ + local = 0; + } + } + if(local == 0) { + surface_list.push_back(site); + } + } + } + CartesianStencil(GridBase *grid, int npoints, int checkerboard, @@ -549,7 +577,8 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal comm_bytes_thr(npoints), comm_enter_thr(npoints), comm_leave_thr(npoints), - comm_time_thr(npoints) + comm_time_thr(npoints), + same_node(npoints) { face_table_computed=0; _npoints = npoints; @@ -557,6 +586,7 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal _directions = directions; _distances = distances; _unified_buffer_size=0; + surface_list.resize(0); int osites = _grid->oSites(); diff --git a/lib/util/Init.cc b/lib/util/Init.cc index 45a37a022..5d93cb64f 100644 --- a/lib/util/Init.cc +++ b/lib/util/Init.cc @@ -368,8 +368,10 @@ void Grid_init(int *argc,char ***argv) } if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-overlap") ){ QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsAndCompute; + QCD::StaggeredKernelsStatic::Comms = QCD::StaggeredKernelsStatic::CommsAndCompute; } else { QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute; + QCD::StaggeredKernelsStatic::Comms = QCD::StaggeredKernelsStatic::CommsThenCompute; } if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-concurrent") ){ CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicyConcurrent); @@ -385,6 +387,7 @@ void Grid_init(int *argc,char ***argv) if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-threads") ){ arg= GridCmdOptionPayload(*argv,*argv+*argc,"--comms-threads"); GridCmdOptionInt(arg,CartesianCommunicator::nCommThreads); + assert(CartesianCommunicator::nCommThreads > 0); } if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){ arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking"); diff --git a/tests/core/Test_staggered5Dvec.cc b/tests/core/Test_staggered5Dvec.cc index db76f7925..2720c24c3 100644 --- a/tests/core/Test_staggered5Dvec.cc +++ b/tests/core/Test_staggered5Dvec.cc @@ -141,6 +141,7 @@ int main (int argc, char ** argv) t1=usecond(); std::cout< + +using namespace std; +using namespace Grid; +using namespace Grid::QCD; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + std::vector latt_size = GridDefaultLatt(); + std::vector simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); + std::vector mpi_layout = GridDefaultMpi(); + + const int Ls=16; + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); + GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + + std::cout << GridLogMessage << "Making s innermost grids"< seeds({1,2,3,4}); + + GridParallelRNG pRNG4(UGrid); + GridParallelRNG pRNG5(FGrid); + pRNG4.SeedFixedIntegers(seeds); + pRNG5.SeedFixedIntegers(seeds); + + typedef typename ImprovedStaggeredFermion5DF::FermionField FermionField; + typedef typename ImprovedStaggeredFermion5DF::ComplexField ComplexField; + typename ImprovedStaggeredFermion5DF::ImplParams params; + + FermionField src (FGrid); + random(pRNG5,src); + /* + std::vector site({0,1,2,0,0}); + ColourVector cv = zero; + cv()()(0)=1.0; + src = zero; + pokeSite(cv,src,site); + */ + FermionField result(FGrid); result=zero; + FermionField tmp(FGrid); tmp=zero; + FermionField err(FGrid); tmp=zero; + FermionField phi (FGrid); random(pRNG5,phi); + FermionField chi (FGrid); random(pRNG5,chi); + + LatticeGaugeFieldF Umu(UGrid); + SU3::HotConfiguration(pRNG4,Umu); + + /* + for(int mu=1;mu<4;mu++){ + auto tmp = PeekIndex(Umu,mu); + tmp = zero; + PokeIndex(Umu,tmp,mu); + } + */ + double volume=Ls; + for(int mu=0;mu HermOp(Ds); ConjugateGradient CG(1.0e-8,10000); @@ -87,14 +95,26 @@ int main (int argc, char ** argv) std::cout << GridLogMessage << "****************************************************************** "< HermOp4d(Ds4d); FermionField src4d(UGrid); random(pRNG,src4d); FermionField src4d_o(UrbGrid); pickCheckerboard(Odd,src4d_o,src4d); FermionField result4d_o(UrbGrid); + double deodoe_flops=(16*(3*(6+8+8)) + 15*3*2)*volume; // == 66*16 + == 1146 result4d_o=zero; - CG(HermOp4d,src4d_o,result4d_o); + { + double t1=usecond(); + CG(HermOp4d,src4d_o,result4d_o); + double t2=usecond(); + double ncall=CG.IterationsToComplete; + double flops = deodoe_flops * ncall; + std::cout< HermOp(Ds); ConjugateGradient CG(1.0e-8,10000); @@ -86,11 +95,23 @@ int main (int argc, char ** argv) std::cout << GridLogMessage << "****************************************************************** "< HermOp4d(Ds4d); FermionField src4d(UGrid); random(pRNG,src4d); FermionField result4d(UGrid); result4d=zero; - CG(HermOp4d,src4d,result4d); + + double deodoe_flops=(16*(3*(6+8+8)) + 15*3*2)*volume; // == 66*16 + == 1146 + { + double t1=usecond(); + CG(HermOp4d,src4d,result4d); + double t2=usecond(); + double ncall=CG.IterationsToComplete; + double flops = deodoe_flops * ncall; + std::cout< HermOpEO(Ds); ConjugateGradient CG(1.0e-8,10000); + double t1=usecond(); CG(HermOpEO,src_o,res_o); + double t2=usecond(); + + // Schur solver: uses DeoDoe => volume * 1146 + double ncall=CG.IterationsToComplete; + double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 + == 1146 + + std::cout< +#include + +using namespace std; +using namespace Grid; +using namespace Grid::QCD; + +template +struct scal { + d internal; +}; + + Gamma::Algebra Gmu [] = { + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaZ, + Gamma::Algebra::GammaT + }; + +int main (int argc, char ** argv) +{ + typedef typename ImprovedStaggeredFermionR::FermionField FermionField; + typename ImprovedStaggeredFermionR::ImplParams params; + + Grid_init(&argc,&argv); + + std::vector latt_size = GridDefaultLatt(); + std::vector simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); + std::vector mpi_layout = GridDefaultMpi(); + + GridCartesian Grid(latt_size,simd_layout,mpi_layout); + GridRedBlackCartesian RBGrid(&Grid); + + std::vector seeds({1,2,3,4}); + GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(seeds); + + + LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + + double volume=1; + for(int mu=0;mu HermOpEO(Ds); + + FermionField src(&Grid); random(pRNG,src); + FermionField src_o(&RBGrid); + pickCheckerboard(Odd,src_o,src); + + + ///////////////////////////////// + //Multishift CG + ///////////////////////////////// + std::vector result(degree,&RBGrid); + ConjugateGradientMultiShift MSCG(10000,Sqrt); + + double deodoe_flops=(1205+15*degree)*volume; // == 66*16 + == 1146 + + double t1=usecond(); + MSCG(HermOpEO,src_o,result); + double t2=usecond(); + double ncall=MSCG.IterationsToComplete; + double flops = deodoe_flops * ncall; + std::cout<