Support for multiple global grids using a switch function #88

Draft: wants to merge 2 commits into base: master
4 changes: 4 additions & 0 deletions src/ImplicitGlobalGrid.jl
@@ -58,4 +58,8 @@ include("select_device.jl")
include("tools.jl")
include("update_halo.jl")

## Multi-instance support
include("instance_switch.jl")
include("init_global_grid_instance.jl")

end
9 changes: 8 additions & 1 deletion src/init_global_grid.jl
@@ -94,7 +94,14 @@ function init_global_grid(nx::Integer, ny::Integer, nz::Integer; dimx::Integer=0
else
if (!MPI.Initialized()) error("MPI has not been initialized beforehand. Remove the argument 'init_MPI=false'."); end # Ensure that MPI is always initialized after init_global_grid().
end
nprocs = MPI.Comm_size(comm);
# Instance policy: the first initialization call fixes the execution to single- or multi-instance mode; init_global_grid selects single-instance mode.
if !is_instance_number_defined()
set_single_instance()
end
if is_multi_instance()
error("Cannot initialize a single global grid: a multi-instance environment has already been initialized (use init_global_grid_instance instead).")
end
nprocs = MPI.Comm_size(comm)
MPI.Dims_create!(nprocs, dims);
comm_cart = MPI.Cart_create(comm, dims, periods, reorder);
me = MPI.Comm_rank(comm_cart);
115 changes: 115 additions & 0 deletions src/init_global_grid_instance.jl
@@ -0,0 +1,115 @@
export init_global_grid_instance

"""
init_global_grid_instance(nx, ny, nz)
global_grid = init_global_grid_instance(nx, ny, nz; <keyword arguments>)

Initialize a Cartesian grid of MPI processes that implicitly defines a global grid and return it as a `GlobalGrid` instance. Unlike [`init_global_grid`](@ref), MPI is not initialized by default (see `init_MPI`) and the new grid only becomes the focused global grid if `switch=true`.

# Arguments
- {`nx`|`ny`|`nz`}`::Integer`: the number of elements of the local grid in dimension {x|y|z}.
- {`dimx`|`dimy`|`dimz`}`::Integer=0`: the desired number of processes in dimension {x|y|z}. By default, (value `0`) the process topology is created as compact as possible with the given constraints. This is handled by the MPI implementation which is installed on your system. For more information, refer to the specifications of `MPI_Dims_create` in the corresponding documentation.
- {`periodx`|`periody`|`periodz`}`::Integer=0`: whether the grid is periodic (`1`) or not (`0`) in dimension {x|y|z}.
- `quiet::Bool=false`: whether to suppress printing information like the size of the global grid (`true`) or not (`false`).
!!! note "Advanced keyword arguments"
- `overlaps::Tuple{Int,Int,Int}=(2,2,2)`: the number of elements adjacent local grids overlap in dimension x, y and z. By default (value `(2,2,2)`), an array `A` of size (`nx`, `ny`, `nz`) on process 1 (`A_1`) overlaps the corresponding array `A` on process 2 (`A_2`) by `2` indices if the two processes are adjacent. E.g., if `overlaps[1]=2` and process 2 is the right neighbor of process 1 in dimension x, then `A_1[end-1:end,:,:]` overlaps `A_2[1:2,:,:]`. That means, after every call `update_halo!(A)`, we have `all(A_1[end-1:end,:,:] .== A_2[1:2,:,:])` (`A_1[end,:,:]` is the halo of process 1 and `A_2[1,:,:]` is the halo of process 2). The analog applies for the dimensions y and z.
- `halowidths::Tuple{Int,Int,Int}=max.(1,overlaps.÷2)`: the default width of an array's halo in dimension x, y and z (must be at least 1). The default can be overwritten per array in the function [`update_halo!`](@ref).
- `disp::Integer=1`: the displacement argument to `MPI.Cart_shift` in order to determine the neighbors.
- `reorder::Integer=1`: the reorder argument to `MPI.Cart_create` in order to create the Cartesian process topology.
- `comm::MPI.Comm=MPI.COMM_WORLD`: the input communicator argument to `MPI.Cart_create` in order to create the Cartesian process topology.
- `init_MPI::Bool=false`: whether to initialize MPI (`true`) or not (`false`). CAUTION: unlike in [`init_global_grid`](@ref), this defaults to `false`.
- `device_type::String="auto"`: the type of the device to be used if available: `"CUDA"`, `"AMDGPU"`, `"none"` or `"auto"`. Set `device_type="none"` if you want to use only CPUs on a system having also GPUs. If `device_type` is `"auto"` (default), it is automatically determined, depending on which of the modules used for programming the devices (CUDA.jl or AMDGPU.jl) was imported before ImplicitGlobalGrid; if both were imported, an error will be given if `device_type` is set as `"auto"`.
- `select_device::Bool=true`: whether to automatically select the device (GPU) (`true`) or not (`false`) if CUDA or AMDGPU was imported and `device_type` is not `"none"`. If `true`, it selects the device corresponding to the node-local MPI rank. This method of device selection suits both single and multi-device compute nodes and is recommended in general. It is also the default method of device selection of the *function* [`select_device`](@ref).
For more information, refer to the documentation of MPI.jl / MPI.
- `switch::Bool=false`: whether to make this new grid the focused global grid (`true`) or not (`false`).

# Return values
- `global_grid`: a `GlobalGrid` struct representing the newly created implicit global grid instance.
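
# Examples
A minimal sketch of the intended usage (grid sizes are illustrative; assumes a run with a single MPI process):

    gg_a = init_global_grid_instance(64, 64, 8; init_MPI=true)   # create grid A (initializes MPI here)
    gg_b = init_global_grid_instance(32, 32, 8)                   # create grid B (MPI is already initialized)
    switch(gg_a)                                                   # focus grid A
    A = zeros(64, 64, 8)
    update_halo!(A)                                                # halo update on the focused grid (A)
    update_halo!(gg_b, zeros(32, 32, 8))                           # halo update on grid B without changing the focus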

See also: [`finalize_global_grid`](@ref), [`init_global_grid`](@ref)
"""
function init_global_grid_instance(nx::Integer, ny::Integer, nz::Integer; dimx::Integer=0, dimy::Integer=0, dimz::Integer=0, periodx::Integer=0, periody::Integer=0, periodz::Integer=0, overlaps::Tuple{Int,Int,Int}=(2, 2, 2), halowidths::Tuple{Int,Int,Int}=max.(1, overlaps .÷ 2), disp::Integer=1, reorder::Integer=1, comm::MPI.Comm=MPI.COMM_WORLD, init_MPI::Bool=false, device_type::String=DEVICE_TYPE_AUTO, select_device::Bool=true, quiet::Bool=false, switch::Bool=false)::GlobalGrid

set_cuda_loaded()
set_cuda_functional()
set_amdgpu_loaded()
set_amdgpu_functional()
nxyz = [nx, ny, nz];
dims = [dimx, dimy, dimz];
periods = [periodx, periody, periodz];
overlaps = [overlaps...];
halowidths = [halowidths...];
cuda_enabled = false
amdgpu_enabled = false
cudaaware_MPI = [false, false, false]
amdgpuaware_MPI = [false, false, false]
loopvectorization = [false, false, false]
if haskey(ENV, "IGG_CUDAAWARE_MPI") cudaaware_MPI .= (parse(Int64, ENV["IGG_CUDAAWARE_MPI"]) > 0); end
if haskey(ENV, "IGG_ROCMAWARE_MPI") amdgpuaware_MPI .= (parse(Int64, ENV["IGG_ROCMAWARE_MPI"]) > 0); end
if haskey(ENV, "IGG_LOOPVECTORIZATION") loopvectorization .= (parse(Int64, ENV["IGG_LOOPVECTORIZATION"]) > 0); end
if none(cudaaware_MPI)
if haskey(ENV, "IGG_CUDAAWARE_MPI_DIMX") cudaaware_MPI[1] = (parse(Int64, ENV["IGG_CUDAAWARE_MPI_DIMX"]) > 0); end
if haskey(ENV, "IGG_CUDAAWARE_MPI_DIMY") cudaaware_MPI[2] = (parse(Int64, ENV["IGG_CUDAAWARE_MPI_DIMY"]) > 0); end
if haskey(ENV, "IGG_CUDAAWARE_MPI_DIMZ") cudaaware_MPI[3] = (parse(Int64, ENV["IGG_CUDAAWARE_MPI_DIMZ"]) > 0); end
end
if none(amdgpuaware_MPI)
if haskey(ENV, "IGG_ROCMAWARE_MPI_DIMX") amdgpuaware_MPI[1] = (parse(Int64, ENV["IGG_ROCMAWARE_MPI_DIMX"]) > 0); end
if haskey(ENV, "IGG_ROCMAWARE_MPI_DIMY") amdgpuaware_MPI[2] = (parse(Int64, ENV["IGG_ROCMAWARE_MPI_DIMY"]) > 0); end
if haskey(ENV, "IGG_ROCMAWARE_MPI_DIMZ") amdgpuaware_MPI[3] = (parse(Int64, ENV["IGG_ROCMAWARE_MPI_DIMZ"]) > 0); end
end
if all(loopvectorization)
if haskey(ENV, "IGG_LOOPVECTORIZATION_DIMX") loopvectorization[1] = (parse(Int64, ENV["IGG_LOOPVECTORIZATION_DIMX"]) > 0); end
if haskey(ENV, "IGG_LOOPVECTORIZATION_DIMY") loopvectorization[2] = (parse(Int64, ENV["IGG_LOOPVECTORIZATION_DIMY"]) > 0); end
if haskey(ENV, "IGG_LOOPVECTORIZATION_DIMZ") loopvectorization[3] = (parse(Int64, ENV["IGG_LOOPVECTORIZATION_DIMZ"]) > 0); end
end
if !(device_type in [DEVICE_TYPE_NONE, DEVICE_TYPE_AUTO, DEVICE_TYPE_CUDA, DEVICE_TYPE_AMDGPU]) error("Argument `device_type`: invalid value obtained ($device_type). Valid values are: $DEVICE_TYPE_CUDA, $DEVICE_TYPE_AMDGPU, $DEVICE_TYPE_NONE, $DEVICE_TYPE_AUTO") end
if ((device_type == DEVICE_TYPE_AUTO) && cuda_loaded() && cuda_functional() && amdgpu_loaded() && amdgpu_functional()) error("Automatic detection of the device type to be used not possible: both CUDA and AMDGPU extensions are loaded and functional. Set keyword argument `device_type` to $DEVICE_TYPE_CUDA or $DEVICE_TYPE_AMDGPU.") end
if (device_type != DEVICE_TYPE_NONE)
if (device_type in [DEVICE_TYPE_CUDA, DEVICE_TYPE_AUTO]) cuda_enabled = cuda_loaded() && cuda_functional() end # NOTE: cuda could be enabled/disabled depending on some additional criteria.
if (device_type in [DEVICE_TYPE_AMDGPU, DEVICE_TYPE_AUTO]) amdgpu_enabled = amdgpu_loaded() && amdgpu_functional() end # NOTE: amdgpu could be enabled/disabled depending on some additional criteria.
end
if (any(nxyz .< 1)) error("Invalid arguments: nx, ny, and nz cannot be less than 1."); end
if (any(dims .< 0)) error("Invalid arguments: dimx, dimy, and dimz cannot be negative."); end
if (any(periods .∉ ((0,1),))) error("Invalid arguments: periodx, periody, and periodz must be either 0 or 1."); end
if (any(halowidths .< 1)) error("Invalid arguments: halowidths cannot be less than 1."); end
if (nx==1) error("Invalid arguments: nx can never be 1.") end
if (ny==1 && nz>1) error("Invalid arguments: ny cannot be 1 if nz is greater than 1.") end
if (any((nxyz .== 1) .& (dims .>1 ))) error("Incoherent arguments: if nx, ny, or nz is 1, then the corresponding dimx, dimy or dimz must not be set (or set 0 or 1)."); end
if (any((nxyz .< 2 .* overlaps .- 1) .& (periods .> 0))) error("Incoherent arguments: if nx, ny, or nz is smaller than 2*overlaps[1]-1, 2*overlaps[2]-1 or 2*overlaps[3]-1, respectively, then the corresponding periodx, periody or periodz must not be set (or set 0)."); end
if (any((overlaps .> 0) .& (halowidths .> overlaps.÷2))) error("Incoherent arguments: if overlap is greater than 0, then halowidth cannot be greater than overlap÷2, in each dimension."); end
dims[(nxyz.==1).&(dims.==0)] .= 1; # Setting any of nxyz to 1, means that the corresponding dimension must also be 1 in the global grid. Thus, the corresponding dims entry must be 1.
if (init_MPI) # NOTE: init MPI only, once the input arguments have been checked.
if (MPI.Initialized()) error("MPI is already initialized. Set the argument 'init_MPI=false'."); end
MPI.Init();
else
if (!MPI.Initialized()) error("MPI has not been initialized beforehand. Set the argument 'init_MPI=true' or initialize MPI before calling this function."); end # Ensure that MPI is always initialized after init_global_grid_instance().
end
# Instance policy: the first initialization call fixes the execution to single- or multi-instance mode; init_global_grid_instance selects multi-instance mode.
if !is_instance_number_defined()
set_multi_instance()
end
if !is_multi_instance()
error("Cannot create an additional grid instance: a single global grid has already been initialized (use init_global_grid_instance exclusively instead of init_global_grid).")
end
nprocs = MPI.Comm_size(comm);
MPI.Dims_create!(nprocs, dims);
comm_cart = MPI.Cart_create(comm, dims, periods, reorder);
me = MPI.Comm_rank(comm_cart);
coords = MPI.Cart_coords(comm_cart);
neighbors = fill(MPI.PROC_NULL, NNEIGHBORS_PER_DIM, NDIMS_MPI);
for i = 1:NDIMS_MPI
neighbors[:,i] .= MPI.Cart_shift(comm_cart, i-1, disp);
end
nxyz_g = dims.*(nxyz.-overlaps) .+ overlaps.*(periods.==0); # E.g. for dimension x with ol=2 and periodx=0: dimx*(nx-2)+2
new_gg = GlobalGrid(nxyz_g, nxyz, dims, overlaps, halowidths, nprocs, me, coords, neighbors, periods, disp, reorder, comm_cart, cuda_enabled, amdgpu_enabled, cudaaware_MPI, amdgpuaware_MPI, loopvectorization, quiet);
if switch
set_global_grid(new_gg)
end
cuda_support_string = (cuda_enabled && all(cudaaware_MPI)) ? "CUDA-aware" : (cuda_enabled && any(cudaaware_MPI)) ? "CUDA(-aware)" : (cuda_enabled) ? "CUDA" : "";
amdgpu_support_string = (amdgpu_enabled && all(amdgpuaware_MPI)) ? "AMDGPU-aware" : (amdgpu_enabled && any(amdgpuaware_MPI)) ? "AMDGPU(-aware)" : (amdgpu_enabled) ? "AMDGPU" : "";
gpu_support_string = join(filter(!isempty, [cuda_support_string, amdgpu_support_string]), ", ");
support_string = isempty(gpu_support_string) ? "none" : gpu_support_string;
if (!quiet && me==0) println("Made a new global grid: $(nxyz_g[1])x$(nxyz_g[2])x$(nxyz_g[3]) (nprocs: $nprocs, dims: $(dims[1])x$(dims[2])x$(dims[3]); device support: $support_string), multi-instance is enabled" * (switch ? ", switched to new grid" : "")); end
if ((cuda_enabled || amdgpu_enabled) && select_device) _select_device() end
return new_gg; # Return the full GlobalGrid struct so that it can be stored and focused later via switch().
end
62 changes: 62 additions & 0 deletions src/instance_switch.jl
@@ -0,0 +1,62 @@
export switch, update_halo!

# Once multi-instance mode is enabled in an execution, it won't be disabled.
# The same applies to single-instance mode.
let
global is_instance_number_defined, is_multi_instance, set_multi_instance, set_single_instance

undefined :: Bool = true
multi_instance :: Bool = false
is_instance_number_defined() :: Bool = !undefined
is_multi_instance() :: Bool = (if undefined error("Undefined instance policy: it has yet to be set to single or multi")
else multi_instance end)

set_multi_instance() = (undefined = false; multi_instance = true; nothing)
set_single_instance() = (undefined = false; multi_instance = false; nothing)


### Below is an alternative way of interacting with the grids.
# It is essentially a globally accessible table of global grids for storing the instantiated grids.
# Using it is completely optional (a usage sketch follows after this `let` block).

global add_gg_to_table, gg_table_get, gg_table_erase

gg_table :: Vector{GlobalGrid} = []
# Adds a global grid, returns its ID (position) in the table
add_gg_to_table(gg :: GlobalGrid) :: Int = (push!(gg_table, gg); length(gg_table))
# Returns a global grid at position ID
gg_table_get(id :: Int) :: GlobalGrid = (if (length(gg_table) >= id > 0)
gg_table[id]
else error("Bad global grid ID") end)
gg_table_erase(id::Int) :: GlobalGrid = (if (length(gg_table) >= id > 0)
gg_table[id] = GLOBAL_GRID_NULL
else error("Bad global grid ID") end)
end
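
# A usage sketch for the optional table API (illustrative only; not executed as part of this file):
#
#     gg = init_global_grid_instance(64, 64, 8; init_MPI=true)
#     id = add_gg_to_table(gg)        # store the grid; `id` is its position in the table
#     switch(gg_table_get(id))        # retrieve the grid later and focus it
#     gg_table_erase(id)              # replace the entry with GLOBAL_GRID_NULL when it is no longer needed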

"""
switch(global_grid)

Change the focused global grid to `global_grid` and return the previously focused one.
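
# Examples
A minimal sketch (assumes `gg_a` and `gg_b` were created with [`init_global_grid_instance`](@ref)):

    switch(gg_a)             # focus grid A
    previous = switch(gg_b)  # focus grid B; `previous` is grid A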
"""
function switch(global_grid::GlobalGrid)::GlobalGrid

if !is_multi_instance()
error("Illegal switch: this execution environment has been initialised in a single global instance regime")
end
gg = get_global_grid()
set_global_grid(global_grid)
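# If the previously focused grid is the null grid (nprocs <= 0), no grid was focused before; set up the timing functions on this first switch.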
if gg.nprocs <= 0
init_timing_functions()
end
return gg
end

"""
update_halo!(global_grid_instance, A...)

Additional method that adds the possibility to specify the global grid instance on which the halo update is performed; the previously focused grid is restored afterwards.
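
# Examples
A minimal sketch (assumes `gg_b` was created with [`init_global_grid_instance`](@ref) and `B` is an array matching its local grid size):

    update_halo!(gg_b, B)   # update the halo of B on grid `gg_b`, then restore the previously focused grid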
"""
function update_halo!(global_grid_instance :: GlobalGrid, A::Union{GGArray, GGField, GGFieldConvertible}...)

old = switch(global_grid_instance)
update_halo!(A...)
switch(old)
return nothing
end
68 changes: 68 additions & 0 deletions test/test_multiple_ggrids.jl
@@ -0,0 +1,68 @@
push!(LOAD_PATH, "../src")
using Test
import MPI, CUDA, AMDGPU
using ImplicitGlobalGrid; GG = ImplicitGlobalGrid
import ImplicitGlobalGrid: @require


## Test setup (NOTE: Testset "2. initialization including MPI" completes the test setup as it initializes MPI and must therefore mandatorily be at the 2nd position). NOTE: these tests require nprocs == 1.
p0 = MPI.PROC_NULL
nx = 40;
ny = 40;
nz = 10;

@testset "$(basename(@__FILE__))" begin
@testset "1. pre-MPI_Init-exception" begin
@require !GG.grid_is_initialized()
@test_throws ErrorException init_global_grid(nx, ny, nz, quiet=true, init_MPI=false); # Error: init_MPI=false while MPI has not been initialized before.
@test !GG.grid_is_initialized()
end;

@testset "2. initialization including MPI" begin
# NOTE: init_MPI defaults to false for init_global_grid_instance; here it is explicitly set to true.
global_grid_A = init_global_grid_instance(nx, ny, nz, dimx=1, dimy=1, dimz=1, quiet=true, init_MPI=true);

@testset "NOT initialized" begin
@test !GG.grid_is_initialized()
end

nullgrid = switch(global_grid_A)

@testset "get switched grid (NULL GRID) from switch" begin
@test nullgrid.nprocs <= 0
end

@testset "initialized" begin
@test GG.grid_is_initialized()
@test MPI.Initialized()
end;

@testset "values in global grid (A)" begin
@test GG.global_grid().nxyz_g == [nx, ny, nz]
@test GG.global_grid().nxyz == [nx, ny, nz]
@test GG.global_grid().overlaps == [2, 2, 2]
@test GG.global_grid().halowidths== [1, 1, 1]
@test GG.global_grid().neighbors == [p0 p0 p0; p0 p0 p0]
@test GG.global_grid().periods == [0, 0, 0]
@test GG.global_grid().disp == 1
@test GG.global_grid().reorder == 1
@test GG.global_grid().quiet == true
end

global_grid_B = init_global_grid_instance(nx, ny, nz, dimx=1, dimy=1, dimz=1, periodx=1, periodz=1, quiet=true, init_MPI=false)
switch(global_grid_B)

@testset "values in global grid (B)" begin
@test GG.global_grid().periods == [1, 0, 1]
end
finalize_global_grid(finalize_MPI=false);

@testset "single instance cannot be called if multiple instance was called before" begin
@test_throws ErrorException init_global_grid(nx, ny, nz, init_MPI = false)
end
end;

end;

## Test tear down
MPI.Finalize()
31 changes: 31 additions & 0 deletions test/test_multiple_ggrids_initwhensingle.jl
@@ -0,0 +1,31 @@
push!(LOAD_PATH, "../src")
using Test
import MPI, CUDA, AMDGPU
using ImplicitGlobalGrid; GG = ImplicitGlobalGrid
import ImplicitGlobalGrid: @require


## Test setup (NOTE: MPI is initialized by the init_global_grid call at the beginning of the testset). NOTE: these tests require nprocs == 1.
p0 = MPI.PROC_NULL
nx = 40;
ny = 40;
nz = 10;

@testset "$(basename(@__FILE__))" begin
init_global_grid(nx, ny, nz, dimx=1, dimy=1, dimz=1, quiet=true, init_MPI=true)

@testset "initialized" begin
@test GG.grid_is_initialized()
@test MPI.Initialized()
end

@testset "multiple instance cannot be called if single instance was called before" begin
@test_throws ErrorException init_global_grid_instance(nx, ny, nz, dimx=1, dimy=1, dimz=1, periodx=1, periodz=1, quiet=true, init_MPI=false)
end

finalize_global_grid(finalize_MPI=false)

end;

## Test tear down
MPI.Finalize()