diff --git a/Project.toml b/Project.toml
index 452978c4..f65531ec 100644
--- a/Project.toml
+++ b/Project.toml
@@ -16,6 +16,7 @@ Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
 SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
+KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 
 [compat]
 CUDA = "1, 2.4.2, 3.0.0 - 3.6.4, 3.7.1, 4, 5"
@@ -30,6 +31,7 @@ SpecialFunctions = "0.10, 1, 2"
 StaticArrays = "0.12, 1"
 Statistics = "1.6"
 julia = "1.6"
+KernelAbstractions = "0.9"
 
 [extras]
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
diff --git a/src/multilayerqg.jl b/src/multilayerqg.jl
index 99668a8b..b1e9b53a 100644
--- a/src/multilayerqg.jl
+++ b/src/multilayerqg.jl
@@ -18,11 +18,13 @@ using
   LinearAlgebra,
   StaticArrays,
   Reexport,
-  DocStringExtensions
+  DocStringExtensions,
+  KernelAbstractions
 
 @reexport using FourierFlows
 
-using FourierFlows: parsevalsum, parsevalsum2, superzeros, plan_flows_rfft
+using FourierFlows: parsevalsum, parsevalsum2, superzeros, plan_flows_rfft, CPU, GPU
+using KernelAbstractions.Extras.LoopInfo: @unroll
 
 nothingfunction(args...) = nothing
 
@@ -111,17 +113,6 @@ function Problem(nlayers::Int,                       # number of fluid layers
              aliased_fraction = 1/3,
                             T = Float64)
 
-  if dev == GPU() && nlayers > 2
-    @warn """MultiLayerQG module is not optimized on the GPU yet for configurations with
-    3 fluid layers or more!
-
-    See issues on Github at https://github.com/FourierFlows/GeophysicalFlows.jl/issues/112
-    and https://github.com/FourierFlows/GeophysicalFlows.jl/issues/267.
-
-    To use MultiLayerQG with 3 fluid layers or more we suggest, for now, to restrict running
-    on CPU."""
-  end
-
   if nlayers == 1
     @warn """MultiLayerQG module does work for single-layer configuration but may not be as
     optimized. We suggest using SingleLayerQG module for single-layer QG simulation unless
@@ -533,16 +524,50 @@ Compute the inverse Fourier transform of `varh` and store it in `var`.
 """
 invtransform!(var, varh, params::AbstractParams) = ldiv!(var, params.rfftplan, varh)
 
+"""
+    @kernel function PVinversion_kernel!(a, b, M, ::Val{nlayers})
+
+Kernel for the PV/streamfunction inversion step,
+i.e., `qh = params.S * ψh` or `ψh = params.S⁻¹ qh`.
+"""
+@kernel function PVinversion_kernel!(a, b, M, ::Val{nlayers}) where nlayers
+  i, j = @index(Global, NTuple)
+
+  @unroll for k = 1:nlayers
+
+    @inbounds a[i, j, k] = 0
+
+    @unroll for m = 1:nlayers
+      @inbounds a[i, j, k] += M[i, j][k, m] * b[i, j, m]
+    end
+
+  end
+end
+
 """
     pvfromstreamfunction!(qh, ψh, params, grid)
 
 Obtain the Fourier transform of the PV from the streamfunction `ψh` in each layer using
-`qh = params.S * ψh`.
+`qh = params.S * ψh`. The PV inversion kernel is launched over a 2D work layout spanning the wavenumber grid.
 """
 function pvfromstreamfunction!(qh, ψh, params, grid)
-  for j=1:grid.nl, i=1:grid.nkr
-    CUDA.@allowscalar @views qh[i, j, :] .= params.S[i, j] * ψh[i, j, :]
-  end
+  # Larger workgroups are generally more efficient. For more generality, we could add an
+  # if statement that falls back to smaller workgroups when nkr or nl is less than 8.
+  workgroup = 8, 8
+
+  # The worksize sets the global index range the kernel iterates over
+  worksize = grid.nkr, grid.nl
+
+  # Instantiate the kernel for the relevant backend device
+  backend = KernelAbstractions.get_backend(qh)
+  kernel! = PVinversion_kernel!(backend, workgroup, worksize)
+
+  # Launch the kernel
+  S, nlayers = params.S, params.nlayers
+  kernel!(qh, ψh, S, Val(nlayers))
+
+  # Ensure that no other operations occur until the kernel has finished
+  KernelAbstractions.synchronize(backend)
 
   return nothing
 end
@@ -591,12 +616,26 @@ end
     streamfunctionfrompv!(ψh, qh, params, grid)
 
 Invert the PV to obtain the Fourier transform of the streamfunction `ψh` in each layer from
-`qh` using `ψh = params.S⁻¹ qh`.
+`qh` using `ψh = params.S⁻¹ qh`. The PV inversion kernel is launched over a 2D work layout spanning the wavenumber grid.
 """
 function streamfunctionfrompv!(ψh, qh, params, grid)
-  for j=1:grid.nl, i=1:grid.nkr
-    CUDA.@allowscalar @views ψh[i, j, :] .= params.S⁻¹[i, j] * qh[i, j, :]
-  end
+  # Larger workgroups are generally more efficient. For more generality, we could add an
+  # if statement that falls back to smaller workgroups when nkr or nl is less than 8.
+  workgroup = 8, 8
+
+  # The worksize sets the global index range the kernel iterates over
+  worksize = grid.nkr, grid.nl
+
+  # Instantiate the kernel for the relevant backend device
+  backend = KernelAbstractions.get_backend(ψh)
+  kernel! = PVinversion_kernel!(backend, workgroup, worksize)
+
+  # Launch the kernel
+  S⁻¹, nlayers = params.S⁻¹, params.nlayers
+  kernel!(ψh, qh, S⁻¹, Val(nlayers))
+
+  # Ensure that no other operations occur until the kernel has finished
+  KernelAbstractions.synchronize(backend)
 
   return nothing
 end
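For reviewers who want to sanity-check the new kernel outside the package, here is a minimal standalone sketch (not part of the diff) that runs the kernel on the CPU backend and compares it against the per-wavenumber scalar loop it replaces. The kernel body is copied verbatim from the diff; the grid sizes, the random data, and the `SMatrix` stand-ins for `params.S` are illustrative assumptions.

    using KernelAbstractions, StaticArrays
    using KernelAbstractions.Extras.LoopInfo: @unroll

    # Kernel copied from the diff: computes a[i, j, :] = M[i, j] * b[i, j, :]
    # for every wavenumber (i, j), with the small layer loops unrolled.
    @kernel function PVinversion_kernel!(a, b, M, ::Val{nlayers}) where nlayers
      i, j = @index(Global, NTuple)

      @unroll for k = 1:nlayers
        @inbounds a[i, j, k] = 0
        @unroll for m = 1:nlayers
          @inbounds a[i, j, k] += M[i, j][k, m] * b[i, j, m]
        end
      end
    end

    # Illustrative sizes; in the package these are grid.nkr, grid.nl, params.nlayers
    nkr, nl, nlayers = 16, 16, 3

    M = [@SMatrix(rand(3, 3)) for i in 1:nkr, j in 1:nl]  # stand-in for params.S
    b = rand(nkr, nl, nlayers)                            # stand-in for ψh
    a = similar(b)                                        # stand-in for qh

    # Same launch pattern as in pvfromstreamfunction!
    backend = KernelAbstractions.get_backend(a)
    kernel! = PVinversion_kernel!(backend, (8, 8), (nkr, nl))
    kernel!(a, b, M, Val(nlayers))
    KernelAbstractions.synchronize(backend)

    # Reference: the scalar loop this diff removes
    a_ref = similar(a)
    for j in 1:nl, i in 1:nkr
      a_ref[i, j, :] .= M[i, j] * b[i, j, :]
    end

    @assert a ≈ a_ref

Because `KernelAbstractions.get_backend` dispatches on the array type, the same launch code should run unchanged on the GPU when `a`, `b`, and `M` are device arrays, which is presumably why the old `nlayers > 2` GPU warning can be removed.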