From 900f4fd21010fddf34fc20d8afe754e3e6d2f49d Mon Sep 17 00:00:00 2001 From: Julian Samaroo Date: Mon, 6 Jul 2020 13:16:02 -0500 Subject: [PATCH] Changes for the package merge (#1) * Update docs and README --- Manifest.toml | 12 ++++++------ Project.toml | 4 ++-- README.md | 32 ++++++++++++++++---------------- deps/deps.jl | 8 +++++++- deps/hsa/build.jl | 3 --- deps/rocm-external/build.jl | 2 +- docs/make.jl | 2 +- docs/src/index.md | 29 +++++++++++++++++++++++++++++ src/AMDGPU.jl | 26 ++++++++++++++------------ src/array.jl | 8 ++++---- src/device/gcn/hostcall.jl | 4 ++-- src/executable.jl | 2 +- src/execution.jl | 2 +- test/hsa/global.jl | 1 - 14 files changed, 84 insertions(+), 51 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index 51e71634d..00adb361b 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -49,11 +49,11 @@ version = "2.0.1" [[GPUCompiler]] deps = ["DataStructures", "InteractiveUtils", "LLVM", "Libdl", "TimerOutputs", "UUIDs"] -git-tree-sha1 = "95aa07bfda5c80ccd57b038570ef79f663ce531f" +git-tree-sha1 = "3226928dacf00271c95f8d4325cbae34d60878d5" repo-rev = "jps/gcn-workaround-allocas" repo-url = "https://github.com/JuliaGPU/GPUCompiler.jl.git" uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" -version = "0.4.0" +version = "0.5.0" [[InteractiveUtils]] deps = ["Markdown"] @@ -61,9 +61,9 @@ uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" [[LLVM]] deps = ["CEnum", "Libdl", "Printf", "Unicode"] -git-tree-sha1 = "d9c6e1efcaa6c2fcd043da812a62b3e489a109a3" +git-tree-sha1 = "a662366a5d485dee882077e8da3e1a95a86d097f" uuid = "929cbde3-209d-540e-8aea-75f648917ca0" -version = "1.7.0" +version = "2.0.0" [[Libdl]] uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" @@ -86,9 +86,9 @@ deps = ["Base64"] uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" [[OrderedCollections]] -git-tree-sha1 = "12ce190210d278e12644bcadf5b21cbdcf225cd3" +git-tree-sha1 = "293b70ac1780f9584c89268a6e2a560d938a7065" uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" -version = "1.2.0" +version = "1.3.0" [[Printf]] deps = ["Unicode"] diff --git a/Project.toml b/Project.toml index 0ef5b5279..1fedcbe55 100644 --- a/Project.toml +++ b/Project.toml @@ -21,8 +21,8 @@ Adapt = "0.4, 1.0, 2.0" BinaryProvider = "0.5" CEnum = "0.2, 0.3, 0.4" GPUArrays = "2" -GPUCompiler = "0.4" -LLVM = "1.5" +GPUCompiler = "0.4, 0.5" +LLVM = "2" Requires = "1" Setfield = "0.5, 0.6" julia = "1.4" diff --git a/README.md b/README.md index 9a2345c00..fa3428c3c 100644 --- a/README.md +++ b/README.md @@ -1,19 +1,19 @@ -# AMDGPUnative.jl +# AMDGPU.jl -*Support for compiling and executing native Julia kernels on AMD GPUs.* +*AMD GPU (ROCm) programming in Julia* | **Documentation** | **Build Status** | |:---------------------------------------:|:-------------------------------------------------------------:| | [![][docs-master-img]][docs-master-url] | [![][gitlab-img]][gitlab-url] [![][codecov-img]][codecov-url] | -[gitlab-img]: https://gitlab.com/JuliaGPU/AMDGPUnative.jl/badges/master/pipeline.svg -[gitlab-url]: https://gitlab.com/JuliaGPU/AMDGPUnative.jl/commits/master +[gitlab-img]: https://gitlab.com/JuliaGPU/AMDGPU.jl/badges/master/pipeline.svg +[gitlab-url]: https://gitlab.com/JuliaGPU/AMDGPU.jl/commits/master -[codecov-img]: https://codecov.io/gh/JuliaGPU/AMDGPUnative.jl/branch/master/graph/badge.svg -[codecov-url]: https://codecov.io/gh/JuliaGPU/AMDGPUnative.jl +[codecov-img]: https://codecov.io/gh/JuliaGPU/AMDGPU.jl/branch/master/graph/badge.svg +[codecov-url]: https://codecov.io/gh/JuliaGPU/AMDGPU.jl [docs-master-img]: 
https://img.shields.io/badge/docs-master-blue.svg
-[docs-master-url]: https://juliagpu.gitlab.io/AMDGPUnative.jl/
+[docs-master-url]: https://juliagpu.gitlab.io/AMDGPU.jl/

@@ -23,24 +23,24 @@
 The package can be installed with the Julia package manager.
 From the Julia REPL, type `]` to enter the Pkg REPL mode and run:
 
 ```
-pkg> add AMDGPUnative
+pkg> add AMDGPU
 ```
 
 Or, equivalently, via the `Pkg` API:
 
 ```julia
-julia> import Pkg; Pkg.add("AMDGPUnative")
+julia> import Pkg; Pkg.add("AMDGPU")
 ```
 
 ## Project Status
 
-The package is tested against, and being developed for, Julia `1.3` and above.
+The package is tested against, and being developed for, Julia `1.4` and above.
 Only 64-bit Linux is supported and working at this time, until ROCm is ported
 to other platforms. It is recommended to use a version of Julia with LLVM 9.0
 or higher.
 
 This package is under active maintenance and is reasonably complete; however,
 not all features (and especially performance) are up to par
-with CUDAnative.
+with CUDA.jl.
 
 ### Supported Functionality

@@ -48,8 +48,8 @@
 |:---|:---:|:---|
 | Host-side kernel launches | :heavy_check_mark: | See #58 |
 | Dynamic parallelism | :x: |
-| Local (shared) memory | :x: |
-| Coarse-grained memory | :x: |
+| Local (shared) memory | :heavy_check_mark: |
+| Coarse-grained memory | :heavy_check_mark: |
 | Page-locked (pinned) memory | :x: |
 
 ## Questions and Contributions
 
 Usage questions can be posted on the [Julia Discourse
 forum](https://discourse.julialang.org/c/domain/gpu) under the GPU domain
 and/or in the #gpu channel of the [Julia Slack](https://julialang.org/community/).
 
 Contributions are very welcome, as are feature requests and suggestions. Please open an
-[issue](https://github.com/JuliaGPU/AMDGPUnative.jl/issues) if you encounter any problems.
+[issue](https://github.com/JuliaGPU/AMDGPU.jl/issues) if you encounter any problems.
 
 ## Acknowledgment
 
-AMDGPUnative would not have been possible without the work by Tim Besard and
+AMDGPU would not have been possible without the work by Tim Besard and
 contributors to [CUDAnative.jl](https://github.com/JuliaGPU/CUDAnative.jl) and
 [LLVM.jl](https://github.com/maleadt/LLVM.jl).
 
 ## License
 
-AMDGPUnative.jl is licensed under the [MIT License](LICENSE.md).
+AMDGPU.jl is licensed under the [MIT License](LICENSE.md).
diff --git a/deps/deps.jl b/deps/deps.jl
index da543b85f..2a4d69434 100644
--- a/deps/deps.jl
+++ b/deps/deps.jl
@@ -4,6 +4,9 @@ const hsa_ext = joinpath(@__DIR__, "hsa", "ext.jl")
 if isfile(hsa_ext)
     include(hsa_ext)
 end
+if !isdefined(@__MODULE__, :hsa_configured)
+    const hsa_configured = false
+end
 if !hsa_configured
     const hsa_configured = false
     const libhsaruntime_version = v"0.0"
@@ -19,6 +22,9 @@ const libs_ext = joinpath(@__DIR__, "rocm-external", "ext.jl")
 if isfile(libs_ext)
     include(libs_ext)
 end
+if !isdefined(@__MODULE__, :ext_libs_configured)
+    const ext_libs_configured = false
+end
 if !ext_libs_configured
     # default (non-functional) values for critical variables,
     # making it possible to _load_ the package at all times.
@@ -30,4 +36,4 @@ if !ext_libs_configured
     const libmiopen = nothing
 end
 
-const configured = hsa_configured && ext_libs_configured
+const configured = hsa_configured
diff --git a/deps/hsa/build.jl b/deps/hsa/build.jl
index 2d6dd6022..8757799f1 100644
--- a/deps/hsa/build.jl
+++ b/deps/hsa/build.jl
@@ -146,9 +146,6 @@ function main()
     end
 
     # find the ld.lld program for linking kernels
-    # NOTE: This isn't needed by HSARuntime.jl directly, but other packages
-    # (like AMDGPUnative.jl) will want it to be available, so we find it for
-    # them
     ld_path = find_ld_lld()
     if ld_path == ""
         build_error("Couldn't find ld.lld.")
diff --git a/deps/rocm-external/build.jl b/deps/rocm-external/build.jl
index c5fa4c16e..883c09bcd 100644
--- a/deps/rocm-external/build.jl
+++ b/deps/rocm-external/build.jl
@@ -28,7 +28,7 @@ end
 
 ## main
 
-config_path = joinpath(@__DIR__, "ext.jl")
+const config_path = joinpath(@__DIR__, "ext.jl")
 const previous_config_path = config_path * ".bak"
 
 function write_ext(config)
diff --git a/docs/make.jl b/docs/make.jl
index dc99dd7db..d37937248 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -1,4 +1,4 @@
-using Documenter, AMDGPUnative
+using Documenter, AMDGPU
 
 makedocs(
     sitename="AMDGPU.jl",
diff --git a/docs/src/index.md b/docs/src/index.md
index 765eb5fea..337e38b9b 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -28,3 +28,32 @@ Julia support for programming AMD GPUs is currently provided by the [AMDGPU.jl p
 * An interface for compiling and running kernels written in Julia through LLVM's AMDGPU backend.
 * An array type implementing the [GPUArrays.jl](https://github.com/JuliaGPU/GPUArrays.jl) interface, providing high-level array operations.
+## Requirements
+* [ROCR](https://github.com/RadeonOpenCompute/ROCR-Runtime)
+* [ROCT](https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface)
+* Recent Linux kernel with AMDGPU and HSA enabled
+
+### Setup Instructions
+Currently, the requirements for getting everything working properly are poorly
+documented in the upstream docs for any distro other than Ubuntu, so here is a
+list of the requirements found through the process of making this work:
+
+Make sure /dev/kfd has a group other than root that you can add your user to.
+I recommend adding your user to the "video" group and setting the ownership of
+/dev/kfd to root:video with 660 permissions (see the sketch at the end of this
+section).
+
+You will need the correct libraries in your LD_LIBRARY_PATH or standard
+library locations:
+* libhsa-runtime64.so
+* libhsakmt.so
+
+In terms of Linux kernel versions, pick the newest one you can. If building
+your own kernel, make sure all the regular AMDGPU and HSA options are enabled.
+
+You will also need `ld.lld` installed on your system (provided by LLVM/Clang);
+if you built Julia from source, you should have a copy somewhere in
+`deps/scratch/llvm-*/*/bin/` that you can add to your PATH.
+
+Once all of this is set up properly, you should be able to `] build AMDGPU`
+successfully; after that, if you have a supported GPU attached and enabled,
+`] test AMDGPU` should work exactly as you might expect.
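+For reference, here is a minimal sketch of the permission setup described
+above. It assumes a distro where the "video" group already exists; group and
+device names may differ on your system:
+
+```
+# Add the current user to the "video" group (takes effect at next login).
+sudo usermod -aG video $USER
+
+# Give /dev/kfd root:video ownership with 660 permissions.
+sudo chown root:video /dev/kfd
+sudo chmod 660 /dev/kfd
+
+# Check that the HSA runtime libraries are visible to the loader.
+ldconfig -p | grep -E 'libhsa-runtime64|libhsakmt'
+```
+
+Permissions set this way do not persist across reboots; a udev rule (for
+example, `KERNEL=="kfd", GROUP="video", MODE="0660"` in a file under
+/etc/udev/rules.d/) is the usual way to make them permanent.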
diff --git a/src/AMDGPU.jl b/src/AMDGPU.jl
index c04701500..312b1ed43 100644
--- a/src/AMDGPU.jl
+++ b/src/AMDGPU.jl
@@ -100,21 +100,15 @@ atexit() do
     configured && HSA.shut_down()
 end
 function __init__()
-    # We want to always be able to load the package
-    if !configured
-        @warn("AMDGPU.jl has not been successfully built, and will not work properly.")
-        @warn("Please run Pkg.build(\"AMDGPU\") and restart Julia.")
-        return
-    end
-
-    # Try to load deps if possible
-    try
-        check_deps()
-    catch err
-        @warn """
+    deps_failed() = @warn """
         AMDGPU dependencies have not been built, some functionality may be missing.
         Please run Pkg.build("AMDGPU") and reload AMDGPU.
         """
+
+    # We want to always be able to load the package
+    if !configured
+        deps_failed()
+        return
     end
 
     # Make sure we load the library found by the last `] build`
@@ -130,6 +124,14 @@ function __init__()
         DEFAULT_AGENT[] = first(agents)
     end
 
+    try
+        # Try to load device libs if possible
+        check_deps()
+        @assert configured
+    catch err
+        deps_failed()
+    end
+
     # Load optional OpenCL integrations
     @require OpenCL="08131aa3-fb12-5dee-8b74-c09406e224a2" include("opencl.jl")
 
diff --git a/src/array.jl b/src/array.jl
index 07105de39..953512f92 100644
--- a/src/array.jl
+++ b/src/array.jl
@@ -152,13 +152,13 @@ take ownership of the memory, calling `free` when the array is no longer referen
 """
 function Base.unsafe_wrap(::Union{Type{ROCArray},Type{ROCArray{T}},Type{ROCArray{T,N}}},
                           p::Ptr{T}, dims::NTuple{N,Int};
-                          own::Bool=false, agent::HSAAgent=HSARuntime.get_default_agent()) where {T,N}
+                          own::Bool=false, agent::HSAAgent=get_default_agent()) where {T,N}
     buf = Mem.Buffer(convert(Ptr{Cvoid}, p), prod(dims) * sizeof(T), agent)
     return ROCArray{T, length(dims)}(buf, dims; own=own)
 end
 function Base.unsafe_wrap(Atype::Union{Type{ROCArray},Type{ROCArray{T}},Type{ROCArray{T,1}}},
                           p::Ptr{T}, dim::Integer;
-                          own::Bool=false, agent::HSAAgent=HSARuntime.get_default_agent()) where {T}
+                          own::Bool=false, agent::HSAAgent=get_default_agent()) where {T}
     unsafe_wrap(Atype, p, (dim,); own=own, agent=agent)
 end
 
@@ -221,14 +221,14 @@ Base.cconvert(::Type{Ptr{T}}, x::ROCArray{T}) where T = buffer(x)
 Base.cconvert(::Type{Ptr{Nothing}}, x::ROCArray) = buffer(x)
 
-## interop with AMDGPUnative
+## interop with AMDGPU
 
 function Base.convert(::Type{ROCDeviceArray{T,N,AS.Global}}, a::ROCArray{T,N}) where {T,N}
     ptr = Base.unsafe_convert(Ptr{T}, a.buf)
     ROCDeviceArray{T,N,AS.Global}(a.dims, DevicePtr{T,AS.Global}(ptr+a.offset))
 end
 
-Adapt.adapt_storage(::AMDGPUnative.Adaptor, xs::ROCArray{T,N}) where {T,N} =
+Adapt.adapt_storage(::AMDGPU.Adaptor, xs::ROCArray{T,N}) where {T,N} =
     convert(ROCDeviceArray{T,N,AS.Global}, xs)
 
diff --git a/src/device/gcn/hostcall.jl b/src/device/gcn/hostcall.jl
index dad0d6520..308ae9d0a 100644
--- a/src/device/gcn/hostcall.jl
+++ b/src/device/gcn/hostcall.jl
@@ -37,7 +37,7 @@ function HostCall(RT::Type, AT::Type{<:Tuple}, signal::S;
         buf_len += sizeof(T)
     end
     buf_len = max(sizeof(UInt64), buf_len) # make room for return buffer pointer
-    buf = Mem.alloc(agent, buf_len)
+    buf = Mem.alloc(agent, buf_len; coherent=true)
     buf_ptr = DevicePtr{UInt8,AS.Global}(Base.unsafe_convert(Ptr{UInt8}, buf))
     HostCall{S,RT,AT}(signal, host_sentinel, device_sentinel, buf_ptr, buf_len)
 end
@@ -213,7 +213,7 @@ function HostCall(func, rettype, argtypes; return_task=false,
         @debug "Hostcall: Host function returning value of type $(typeof(ret))"
         try
             ret_len = sizeof(ret)
-            ret_buf = Mem.alloc(agent, ret_len)
+            ret_buf = Mem.alloc(agent, ret_len; coherent=true) # FIXME: Don't be coherent
             ret_buf_ptr = Base.unsafe_convert(Ptr{typeof(ret)}, ret_buf)
             Base.unsafe_store!(ret_buf_ptr, ret)
             ret_buf_ptr = Base.unsafe_convert(Ptr{UInt64}, ret_buf)
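As a quick illustration of the `unsafe_wrap` methods touched above, here is a
hypothetical host-side sketch. It assumes a configured system, and that
`Mem.alloc`, `get_default_agent`, and the `Ptr` conversion on buffers behave
as they are used elsewhere in this patch:

```julia
using AMDGPU

# Allocate a raw 16-byte buffer on the default agent.
agent = get_default_agent()
buf = Mem.alloc(agent, 4 * sizeof(Float32))

# Wrap it as a 4-element ROCArray. With own=true the array takes
# ownership, so the buffer is freed once the array is unreferenced.
ptr = Base.unsafe_convert(Ptr{Float32}, buf)
arr = unsafe_wrap(ROCArray, ptr, (4,); own=true, agent=agent)
```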
diff --git a/src/executable.jl b/src/executable.jl
index 93932c1c2..38ee7f727 100644
--- a/src/executable.jl
+++ b/src/executable.jl
@@ -46,7 +46,7 @@ function HSAExecutable(agent::HSAAgent, data::Vector{UInt8}, symbol::String; glo
     _globals = Dict{Symbol,Any}()
     for (gbl,sz) in globals
-        gbl_buf = Mem.alloc(agent, sz)
+        gbl_buf = Mem.alloc(agent, sz; coherent=true)
         HSA.executable_agent_global_variable_define(executable[], agent.agent,
                                                     string(gbl), gbl_buf.ptr) |> check
         _globals[gbl] = gbl_buf
diff --git a/src/execution.jl b/src/execution.jl
index 1f3ec5545..5edd50bf7 100644
--- a/src/execution.jl
+++ b/src/execution.jl
@@ -257,7 +257,7 @@ AbstractKernel
     call_args = Union{Expr,Symbol}[x[1] for x in zip(args, to_pass) if x[2]]
 
     # replace non-isbits arguments (they should be unused, or compilation would have failed)
-    # alternatively, make HSARuntime allow `launch` with non-isbits arguments.
+    # alternatively, allow `launch` with non-isbits arguments.
     for (i,dt) in enumerate(call_t)
         if !isbitstype(dt)
             call_t[i] = Ptr{Any}
diff --git a/test/hsa/global.jl b/test/hsa/global.jl
index 96f958428..49268a371 100644
--- a/test/hsa/global.jl
+++ b/test/hsa/global.jl
@@ -10,7 +10,6 @@ hk = AMDGPU.rocfunction(kernel, Tuple{Int32})
 exe = hk.mod.exe
 gbl = AMDGPU.get_global(exe.exe, :myglobal)
 gbl_ptr = Base.unsafe_convert(Ptr{Float32}, gbl.ptr)
-@show gbl_ptr
 @test Base.unsafe_load(gbl_ptr) == 0f0
 Base.unsafe_store!(gbl_ptr, 2f0)
 @test Base.unsafe_load(gbl_ptr) == 2f0
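For context, the full pattern this test exercises — compiling a kernel with
`AMDGPU.rocfunction`, looking up one of its globals with `AMDGPU.get_global`,
and accessing it from the host — reads roughly as follows. This is a sketch
assembled from the hunk above; it assumes, as in the full test file, a
`kernel` that declares a `Float32` global named `myglobal`:

```julia
using AMDGPU, Test

# Compile the kernel for an Int32 argument; `kernel` is assumed to be
# defined earlier in the test file and to reference the global `myglobal`.
hk = AMDGPU.rocfunction(kernel, Tuple{Int32})

# Look up the global in the compiled executable.
exe = hk.mod.exe
gbl = AMDGPU.get_global(exe.exe, :myglobal)

# Globals are now allocated coherently (see the executable.jl hunk above),
# so the host can read and write them directly through a raw pointer.
gbl_ptr = Base.unsafe_convert(Ptr{Float32}, gbl.ptr)
@test Base.unsafe_load(gbl_ptr) == 0f0
Base.unsafe_store!(gbl_ptr, 2f0)
@test Base.unsafe_load(gbl_ptr) == 2f0
```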