diff --git a/Project.toml b/Project.toml
index 452978c4..f65531ec 100644
--- a/Project.toml
+++ b/Project.toml
@@ -16,6 +16,7 @@ Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
 SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
+KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 
 [compat]
 CUDA = "1, 2.4.2, 3.0.0 - 3.6.4, 3.7.1, 4, 5"
@@ -30,6 +31,7 @@ SpecialFunctions = "0.10, 1, 2"
 StaticArrays = "0.12, 1"
 Statistics = "1.6"
 julia = "1.6"
+KernelAbstractions = "0.9"
 
 [extras]
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
diff --git a/src/multilayerqg.jl b/src/multilayerqg.jl
index 99668a8b..b1e9b53a 100644
--- a/src/multilayerqg.jl
+++ b/src/multilayerqg.jl
@@ -18,11 +18,13 @@ using
   LinearAlgebra,
   StaticArrays,
   Reexport,
-  DocStringExtensions
+  DocStringExtensions,
+  KernelAbstractions
 
 @reexport using FourierFlows
 
-using FourierFlows: parsevalsum, parsevalsum2, superzeros, plan_flows_rfft
+using FourierFlows: parsevalsum, parsevalsum2, superzeros, plan_flows_rfft, CPU, GPU
+using KernelAbstractions.Extras.LoopInfo: @unroll
 
 nothingfunction(args...) = nothing
 
@@ -111,17 +113,6 @@ function Problem(nlayers::Int,                       # number of fluid layers
              aliased_fraction = 1/3,
                             T = Float64)
 
-  if dev == GPU() && nlayers > 2
-    @warn """MultiLayerQG module is not optimized on the GPU yet for configurations with
-    3 fluid layers or more!
-
-    See issues on Github at https://github.com/FourierFlows/GeophysicalFlows.jl/issues/112
-    and https://github.com/FourierFlows/GeophysicalFlows.jl/issues/267.
-
-    To use MultiLayerQG with 3 fluid layers or more we suggest, for now, to restrict running
-    on CPU."""
-  end
-
   if nlayers == 1
     @warn """MultiLayerQG module does work for single-layer configuration but may not be as
     optimized. We suggest using SingleLayerQG module for single-layer QG simulation unless
@@ -533,16 +524,50 @@ Compute the inverse Fourier transform of `varh` and store it in `var`.
 """
 invtransform!(var, varh, params::AbstractParams) = ldiv!(var, params.rfftplan, varh)
 
+"""
+    @kernel function PVinversion_kernel!(a, b, M, ::Val{nlayers})
+
+Kernel for the PV/streamfunction inversion step,
+i.e., `qh = params.S * ψh` or `ψh = params.S⁻¹ qh`.
+"""
+@kernel function PVinversion_kernel!(a, b, M, ::Val{nlayers}) where nlayers
+  i, j = @index(Global, NTuple)
+
+  @unroll for k = 1:nlayers
+
+    @inbounds a[i, j, k] = 0
+
+    @unroll for m = 1:nlayers
+      @inbounds a[i, j, k] += M[i, j][k, m] * b[i, j, m]
+    end
+
+  end
+end
+
 """
     pvfromstreamfunction!(qh, ψh, params, grid)
 
 Obtain the Fourier transform of the PV from the streamfunction `ψh` in each layer using
-`qh = params.S * ψh`.
+`qh = params.S * ψh`. The PV inversion kernel is launched over a 2D work layout spanning the wavenumber grid.
 """
 function pvfromstreamfunction!(qh, ψh, params, grid)
-  for j=1:grid.nl, i=1:grid.nkr
-    CUDA.@allowscalar @views qh[i, j, :] .= params.S[i, j] * ψh[i, j, :]
-  end
+  # Larger workgroups are generally more efficient. For more generality, we could add an
+  # if statement that falls back to smaller workgroups when nkr or nl is less than 8.
+  workgroup = 8, 8
+
+  # The worksize sets the global index range the kernel iterates over
+  worksize = grid.nkr, grid.nl
+
+  # Instantiate the kernel for the relevant backend device
+  backend = KernelAbstractions.get_backend(qh)
+  kernel! = PVinversion_kernel!(backend, workgroup, worksize)
+
+  # Launch the kernel
+  S, nlayers = params.S, params.nlayers
+  kernel!(qh, ψh, S, Val(nlayers))
+
+  # Ensure that no other operations occur until the kernel has finished
+  KernelAbstractions.synchronize(backend)
 
   return nothing
 end
@@ -591,12 +616,26 @@ end
     streamfunctionfrompv!(ψh, qh, params, grid)
 
 Invert the PV to obtain the Fourier transform of the streamfunction `ψh` in each layer from
-`qh` using `ψh = params.S⁻¹ qh`.
+`qh` using `ψh = params.S⁻¹ qh`. The PV inversion kernel is launched over a 2D work layout spanning the wavenumber grid.
 """
 function streamfunctionfrompv!(ψh, qh, params, grid)
-  for j=1:grid.nl, i=1:grid.nkr
-    CUDA.@allowscalar @views ψh[i, j, :] .= params.S⁻¹[i, j] * qh[i, j, :]
-  end
+  # Larger workgroups are generally more efficient. For more generality, we could add an
+  # if statement that falls back to smaller workgroups when nkr or nl is less than 8.
+  workgroup = 8, 8
+
+  # The worksize sets the global index range the kernel iterates over
+  worksize = grid.nkr, grid.nl
+
+  # Instantiate the kernel for the relevant backend device
+  backend = KernelAbstractions.get_backend(ψh)
+  kernel! = PVinversion_kernel!(backend, workgroup, worksize)
+
+  # Launch the kernel
+  S⁻¹, nlayers = params.S⁻¹, params.nlayers
+  kernel!(ψh, qh, S⁻¹, Val(nlayers))
+
+  # Ensure that no other operations occur until the kernel has finished
+  KernelAbstractions.synchronize(backend)
 
   return nothing
 end
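For reviewers who want to sanity-check the new kernel outside the package, here is a minimal standalone sketch (not part of the diff) that runs the kernel on the CPU backend and compares it against the per-wavenumber scalar loop it replaces. The kernel body is copied verbatim from the diff; the grid sizes, the random data, and the `SMatrix` stand-ins for `params.S` are illustrative assumptions.

    using KernelAbstractions, StaticArrays
    using KernelAbstractions.Extras.LoopInfo: @unroll

    # Kernel copied from the diff: computes a[i, j, :] = M[i, j] * b[i, j, :]
    # for every wavenumber (i, j), with the small layer loops unrolled.
    @kernel function PVinversion_kernel!(a, b, M, ::Val{nlayers}) where nlayers
      i, j = @index(Global, NTuple)

      @unroll for k = 1:nlayers
        @inbounds a[i, j, k] = 0
        @unroll for m = 1:nlayers
          @inbounds a[i, j, k] += M[i, j][k, m] * b[i, j, m]
        end
      end
    end

    # Illustrative sizes; in the package these are grid.nkr, grid.nl, params.nlayers
    nkr, nl, nlayers = 16, 16, 3

    M = [@SMatrix(rand(3, 3)) for i in 1:nkr, j in 1:nl]  # stand-in for params.S
    b = rand(nkr, nl, nlayers)                            # stand-in for ψh
    a = similar(b)                                        # stand-in for qh

    # Same launch pattern as in pvfromstreamfunction!
    backend = KernelAbstractions.get_backend(a)
    kernel! = PVinversion_kernel!(backend, (8, 8), (nkr, nl))
    kernel!(a, b, M, Val(nlayers))
    KernelAbstractions.synchronize(backend)

    # Reference: the scalar loop this diff removes
    a_ref = similar(a)
    for j in 1:nl, i in 1:nkr
      a_ref[i, j, :] .= M[i, j] * b[i, j, :]
    end

    @assert a ≈ a_ref

Because `KernelAbstractions.get_backend` dispatches on the array type, the same launch code should run unchanged on the GPU when `a`, `b`, and `M` are device arrays, which is presumably why the old `nlayers > 2` GPU warning can be removed.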