diff --git a/Manifest.toml b/Manifest.toml
index 7ff48e787..65b01ae3b 100644
--- a/Manifest.toml
+++ b/Manifest.toml
@@ -49,7 +49,9 @@ version = "5.1.0"
 
 [[GPUCompiler]]
 deps = ["DataStructures", "InteractiveUtils", "LLVM", "Libdl", "TimerOutputs", "UUIDs"]
-git-tree-sha1 = "10b1a3aa52de30e9219f3ed147cb09e72cf6d2e8"
+git-tree-sha1 = "e0137fdb7c1d0fe217c39a5a3586a4e10a94ddda"
+repo-rev = "master"
+repo-url = "https://github.com/JuliaGPU/GPUCompiler.jl.git"
 uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
 version = "0.7.0"
 
diff --git a/Project.toml b/Project.toml
index 55699cabc..a5ab38a70 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "AMDGPU"
 uuid = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 authors = ["Julian P Samaroo <jpsamaroo@jpsamaroo.me>"]
-version = "0.1.2"
+version = "0.2.0"
 
 [deps]
 AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c"
diff --git a/deps/deps.jl b/deps/deps.jl
index c6908e09e..780a95dc4 100644
--- a/deps/deps.jl
+++ b/deps/deps.jl
@@ -1,14 +1,13 @@
 # HSA runtime
 ## copied from CUDAdrv/src/CUDAdrv.jl
 const hsa_ext = joinpath(@__DIR__, "hsa", "ext.jl")
-if isfile(hsa_ext)
-    include(hsa_ext)
-end
-if !isdefined(@__MODULE__, :hsa_configured)
+if !isfile(hsa_ext)
+    @warn "Didn't find $hsa_ext, please build AMDGPU.jl"
     const hsa_configured = false
+else
+    include(hsa_ext)
 end
 if !hsa_configured
-    const hsa_configured = false
     const libhsaruntime_version = v"0.0"
     const libhsaruntime_vendor = "none"
     const libhsaruntime_path = nothing
@@ -19,11 +18,11 @@ const device_libs_path = joinpath(@__DIR__, "device-libs", "usr", "lib")
 
 # ROCm External Libraries
 const libs_ext = joinpath(@__DIR__, "rocm-external", "ext.jl")
-if isfile(libs_ext)
-    include(libs_ext)
-end
-if !isdefined(@__MODULE__, :ext_libs_configured)
+if !isfile(libs_ext)
+    @warn "Didn't find $libs_ext, please build AMDGPU.jl"
     const ext_libs_configured = false
+else
+    include(libs_ext)
 end
 if !ext_libs_configured
     # default (non-functional) values for critical variables,
diff --git a/src/AMDGPU.jl b/src/AMDGPU.jl
index 20546fdff..c10edbe20 100644
--- a/src/AMDGPU.jl
+++ b/src/AMDGPU.jl
@@ -22,10 +22,6 @@ export ROCArray, ROCVector, ROCMatrix, ROCVecOrMat
 export roc, roczeros, rocones, rocfill
 export HSAArray
 
-### Binary Dependencies ###
-
-include(joinpath(dirname(@__DIR__), "deps", "deps.jl"))
-
 ### HSA Runtime ###
 
 include(joinpath(@__DIR__, "hsa", "HSA.jl"))
@@ -84,40 +80,41 @@ include("array.jl")
 roc(xs) = adapt(ROCArray{Float32}, xs)
 allowscalar(x::Bool) = nothing
 
-### External Libraries ###
-
-# TODO: add check
-include("hip/HIP.jl")
-librocblas !== nothing     && include("blas/rocBLAS.jl")
-librocfft !== nothing      && include("fft/rocFFT.jl")
-#librocsparse !== nothing  && include("sparse/rocSPARSE.jl")
-#librocalution !== nothing && include("solver/rocALUTION.jl")
-#librocrand !== nothing    && include("rand/rocRAND.jl")
-#libmiopen !== nothing     && include("dnn/MIOpen.jl")
-
 ### Initialization and Shutdown ###
 
-atexit() do
-    configured && HSA.shut_down()
-end
 function __init__()
     deps_failed() = @warn """
         AMDGPU dependencies have not been built, some functionality may be missing.
         Please run Pkg.build("AMDGPU") and reload AMDGPU.
         """
 
+    # Load binary dependencies
+    include(joinpath(dirname(@__DIR__), "deps", "deps.jl"))
+
     # We want to always be able to load the package
     if !configured
         deps_failed()
         return
     end
 
+    # TODO: add check
+    include(joinpath(@__DIR__, "hip", "HIP.jl"))
+    librocblas !== nothing     && include(joinpath(@__DIR__, "blas", "rocBLAS.jl"))
+    librocfft !== nothing      && include(joinpath(@__DIR__, "fft", "rocFFT.jl"))
+    #librocsparse !== nothing  && include("sparse/rocSPARSE.jl")
+    #librocalution !== nothing && include("solver/rocALUTION.jl")
+    #librocrand !== nothing    && include("rand/rocRAND.jl")
+    #libmiopen !== nothing     && include("dnn/MIOpen.jl")
+
     # Make sure we load the library found by the last `] build`
     push!(Libdl.DL_LOAD_PATH, dirname(libhsaruntime_path))
     # TODO: Do the same (if possible) for the debug library
 
     # Initialize the HSA runtime
     HSA.init() |> check
+    atexit() do
+        configured && HSA.shut_down()
+    end
 
     # Populate the default agent
     agents = get_agents(:gpu)
diff --git a/src/array.jl b/src/array.jl
index 953512f92..849e0e1fa 100644
--- a/src/array.jl
+++ b/src/array.jl
@@ -1,128 +1,96 @@
-mutable struct ROCArray{T,N} <: AbstractArray{T,N}
-    size::Dims{N}
-    handle::Ptr{T}
-end
 
-const ROCVector{T} = ROCArray{T,1}
-const ROCMatrix{T} = ROCArray{T,2}
-const ROCVecOrMat{T} = Union{ROCVector{T},ROCMatrix{T}}
+#
+# Device functionality
+#
 
-# TODO: Support non-isbitstype allocations
-function ROCArray(agent::HSAAgent, ::Type{T}, size::NTuple{N,Int}) where {T,N}
-    @assert isbitstype(T) "$T is not a primitive type"
-    @assert all(x->x>0, size) "Invalid array size: $size"
-    region = get_region(agent, :finegrained)
-    nbytes = sizeof(T) * prod(size)
-    handle = Ref{Ptr{T}}()
-    HSA.memory_allocate(region[], nbytes, handle) |> check
-    arr = ROCArray{T,N}(size, handle[])
-    finalizer(arr) do arr
-        HSA.memory_free(arr.handle) |> check
-    end
-    return arr
-end
-ROCArray(::Type{T}, size::NTuple{N,Int}) where {T,N} =
-    ROCArray(DEFAULT_AGENT[], T, size)
+## execution
 
-function ROCArray(agent::HSAAgent, arr::Array{T,N}) where {T,N}
-    rarr = ROCArray(agent, T, size(arr))
-    for idx in eachindex(arr)
-        rarr[idx] = arr[idx]
-    end
-    return rarr
-end
-ROCArray(arr::Array{T,N}) where {T,N} =
-    ROCArray(DEFAULT_AGENT[], arr)
-
-function Array(rarr::ROCArray{T,N}) where {T,N}
-    arr = Array{T}(undef, size(rarr))
-    # FIXME: Use Mem
-    ref_arr = Ref(arr)
-    GC.@preserve ref_arr begin
-        ccall(:memcpy, Cvoid,
-            (Ptr{Cvoid}, Ptr{Cvoid}, Csize_t),
-            ref_arr, rarr.handle, sizeof(arr))
-    end
-    return rarr
-end
+struct ROCArrayBackend <: AbstractGPUBackend end
 
-Base.pointer(arr::ROCArray) = arr.handle
-Base.cconvert(::Type{Ptr{T}}, x::ROCArray{T}) where T = x.handle
-Base.cconvert(::Type{Ptr{Nothing}}, x::ROCArray) = x.handle
-Base.IndexStyle(::Type{<:ROCArray}) = Base.IndexLinear()
-Base.IndexStyle(::ROCArray) = Base.IndexLinear()
-function Base.iterate(A::ROCArray, i=1) # copy-pasta from Base
-    Base.@_inline_meta
-    (i % UInt) - 1 < length(A) ? (@inbounds A[i], i + 1) : nothing
-end
-Base.similar(arr::ROCArray{T,N}) where {T,N} =
-    ROCArray(T, size(arr))
-Base.similar(agent::HSAAgent, arr::ROCArray{T,N}) where {T,N} =
-    ROCArray(agent, T, size(arr))
-Base.similar(arr::ROCArray{T1,N}, ::Type{T2}, dims::Dims) where {T1,N,T2} =
-    similar(DEFAULT_AGENT[], arr, T2, dims)
-function Base.similar(agent::HSAAgent, arr::ROCArray{T1,N}, ::Type{T2}, dims::Dims) where {T1,N,T2}
-    ROCArray(agent, T2, dims)
-end
+struct ROCKernelContext <: AbstractKernelContext end
 
-# copy-pasta from Base
-function Base.stride(arr::ROCArray, i::Int)
-    if i > ndims(arr)
-        return length(ar)
-    end
-    s = 1
-    for n = 1:(i-1)
-        s *= size(arr, n)
-    end
-    return s
+function GPUArrays.gpu_call(::ROCArrayBackend, f, args, threads::Int, blocks::Int;
+                            name::Union{String,Nothing})
+    groupsize, gridsize = threads, blocks*threads
+    wait(@roc groupsize=groupsize gridsize=gridsize f(ROCKernelContext(), args...))
 end
 
-Base.size(arr::ROCArray) = arr.size
-Base.length(arr::ROCArray) = prod(size(arr))
+## on-device
 
-function Base.fill!(arr::ROCArray{T,N}, value::T) where {T,N}
-    for idx in 1:length(arr)
-        arr[idx] = value
-    end
+# indexing
+
+for (f, froc) in (
+        (:blockidx, :blockIdx),
+        (:blockdim, :blockDim),
+        (:threadidx, :threadIdx),
+        (:griddim, :gridDimWG)
+    )
+    @eval GPUArrays.$f(::ROCKernelContext) = AMDGPU.$froc().x
 end
-@inline function Base.getindex(arr::ROCArray{T,N}, idx) where {T,N}
-    @boundscheck checkbounds(arr, idx)
-    Base.unsafe_load(pointer(arr), idx)::T
+
+# math
+
+@inline GPUArrays.cos(ctx::ROCKernelContext, x) = cos(x)
+@inline GPUArrays.sin(ctx::ROCKernelContext, x) = sin(x)
+@inline GPUArrays.sqrt(ctx::ROCKernelContext, x) = sqrt(x)
+@inline GPUArrays.log(ctx::ROCKernelContext, x) = log(x)
+
+# memory
+
+@inline function GPUArrays.LocalMemory(ctx::ROCKernelContext, ::Type{T}, ::Val{dims}, ::Val{id}) where {T,dims,id}
+    ptr = AMDGPU.alloc_special(Val(id), T, AMDGPU.AS.Local, Val(prod(dims)))
+    ROCDeviceArray(dims, ptr)
 end
-@inline function Base.setindex!(arr::ROCArray{T,N}, value, idx) where {T,N}
-    @boundscheck checkbounds(arr, idx)
-    Base.unsafe_store!(pointer(arr), value, idx)
+
+# synchronization
+
+@inline function GPUArrays.synchronize_threads(::ROCKernelContext)
+    AMDGPU.sync_workgroup()
+    return
 end
 
-#=
 
-mutable struct ROCArray{T,N} <: GPUArray{T,N}
-  buf::Mem.Buffer
-  own::Bool
+#
+# Host abstractions
+#
+
 
-  dims::Dims{N}
-  offset::Int
+mutable struct ROCArray{T,N} <: AbstractGPUArray{T,N}
+    buf::Mem.Buffer
+    own::Bool
 
-  function ROCArray{T,N}(buf::Mem.Buffer, dims::Dims{N}; offset::Integer=0, own::Bool=true) where {T,N}
-    xs = new{T,N}(buf, own, dims, offset)
-    if own
-      Mem.retain(buf)
-      finalizer(unsafe_free!, xs)
+    dims::Dims{N}
+    offset::Int
+
+    function ROCArray{T,N}(buf::Mem.Buffer, dims::Dims{N}; offset::Integer=0, own::Bool=true) where {T,N}
+        @assert isbitstype(T) "ROCArray only supports bits types"
+        xs = new{T,N}(buf, own, dims, offset)
+        if own
+            Mem.retain(buf)
+            finalizer(unsafe_free!, xs)
+        end
+        return xs
     end
-    return xs
-  end
 end
 
 function unsafe_free!(xs::ROCArray)
-  Mem.release(xs.buf) && dealloc(xs.buf, prod(xs.dims)*sizeof(eltype(xs)))
-  return
+    Mem.release(xs.buf) && Mem.free(xs.buf)
+    return
 end
 
-## construction
+## aliases
+
+const ROCVector{T} = ROCArray{T,1}
+const ROCMatrix{T} = ROCArray{T,2}
+const ROCVecOrMat{T} = Union{ROCVector{T},ROCMatrix{T}}
+
+## constructors
 
 # type and dimensionality specified, accepting dims as tuples of Ints
-ROCArray{T,N}(::UndefInitializer, dims::Dims{N}) where {T,N} =
-  ROCArray{T,N}(alloc(prod(dims)*sizeof(T)), dims)
+function ROCArray{T,N}(::UndefInitializer, dims::Dims{N}) where {T,N}
+    buf = Mem.alloc(prod(dims)*sizeof(T))
+    ROCArray{T,N}(buf, dims)
+end
 
 # type and dimensionality specified, accepting dims as series of Ints
 ROCArray{T,N}(::UndefInitializer, dims::Integer...) where {T,N} = ROCArray{T,N}(undef, dims)
@@ -130,39 +98,27 @@ ROCArray{T,N}(::UndefInitializer, dims::Integer...) where {T,N} = ROCArray{T,N}(
 # type but not dimensionality specified
 ROCArray{T}(::UndefInitializer, dims::Dims{N}) where {T,N} = ROCArray{T,N}(undef, dims)
 ROCArray{T}(::UndefInitializer, dims::Integer...) where {T} =
-  ROCArray{T}(undef, convert(Tuple{Vararg{Int}}, dims))
+    ROCArray{T}(undef, convert(Tuple{Vararg{Int}}, dims))
+
+# from Base arrays
+function ROCArray{T,N}(x::Array{T,N}, dims::Dims{N}) where {T,N}
+    r = ROCArray{T,N}(undef, size(x))
+    Mem.upload!(r.buf, pointer(x), sizeof(x))
+    return r
+end
+
+# type as first argument
+# FIXME: Remove me!
+#ROCArray(::Type{T}, dims::Dims{N}) where {T,N} = ROCArray{T,N}(undef, dims)
 
 # empty vector constructor
 ROCArray{T,1}() where {T} = ROCArray{T,1}(undef, 0)
 
-
 Base.similar(a::ROCArray{T,N}) where {T,N} = ROCArray{T,N}(undef, size(a))
 Base.similar(a::ROCArray{T}, dims::Base.Dims{N}) where {T,N} = ROCArray{T,N}(undef, dims)
 Base.similar(a::ROCArray, ::Type{T}, dims::Base.Dims{N}) where {T,N} = ROCArray{T,N}(undef, dims)
 
 
-"""
-  unsafe_wrap(::ROCArray, pointer{T}, dims; own=false, agent::HSAAgent)
-
-Wrap a `ROCArray` object around the data at the address given by `pointer`. The pointer
-element type `T` determines the array element type. `dims` is either an integer (for a 1d
-array) or a tuple of the array dimensions. `own` optionally specified whether Julia should
-take ownership of the memory, calling `free` when the array is no longer referenced. The
-`agent` argument determines the ROC agent where the data is allocated in.
-"""
-function Base.unsafe_wrap(::Union{Type{ROCArray},Type{ROCArray{T}},Type{ROCArray{T,N}}},
-                          p::Ptr{T}, dims::NTuple{N,Int};
-                          own::Bool=false, agent::HSAAgent=get_default_agent()) where {T,N}
-  buf = Mem.Buffer(convert(Ptr{Cvoid}, p), prod(dims) * sizeof(T), agent)
-  return ROCArray{T, length(dims)}(buf, dims; own=own)
-end
-function Base.unsafe_wrap(Atype::Union{Type{ROCArray},Type{ROCArray{T}},Type{ROCArray{T,1}}},
-                          p::Ptr{T}, dim::Integer;
-                          own::Bool=false, agent::HSAAgent=get_default_agent()) where {T}
-  unsafe_wrap(Atype, p, (dim,); own=own, agent=agent)
-end
-
-
 ## array interface
 
 Base.elsize(::Type{<:ROCArray{T}}) where {T} = sizeof(T)
@@ -171,14 +127,10 @@ Base.size(x::ROCArray) = x.dims
 Base.sizeof(x::ROCArray) = Base.elsize(x) * length(x)
 
 
-## interop with other arrays
-
-ROCArray{T,N}(xs::AbstractArray{T,N}) where {T,N} =
-  isbits(xs) ?
-    (ROCArray{T,N}(undef, size(xs)) .= xs) :
-    copyto!(ROCArray{T,N}(undef, size(xs)), collect(xs))
+## interop with Julia arrays
 
-ROCArray{T,N}(xs::AbstractArray{S,N}) where {T,N,S} = ROCArray{T,N}((x -> T(x)).(xs))
+ROCArray{T,N}(x::AbstractArray{S,N}) where {T,N,S} =
+    ROCArray{T,N}(convert(Array{T}, x), size(x))
 
 # underspecified constructors
 ROCArray{T}(xs::AbstractArray{S,N}) where {T,N,S} = ROCArray{T,N}(xs)
@@ -193,141 +145,116 @@ ROCArray{T,N}(xs::ROCArray{T,N}) where {T,N} = xs
 
 Base.convert(::Type{T}, x::T) where T <: ROCArray = x
 
-function Base._reshape(parent::ROCArray, dims::Dims)
-  n = length(parent)
-  prod(dims) == n || throw(DimensionMismatch("parent has $n elements, which is incompatible with size $dims"))
-  return ROCArray{eltype(parent),length(dims)}(parent.buf, dims;
-                                              offset=parent.offset, own=parent.own)
-end
-
 
+## broadcast
 
-## interop with C libraries
+using Base.Broadcast: BroadcastStyle, Broadcasted
 
-"""
-  buffer(array::ROCArray [, index])
+struct ROCArrayStyle{N} <: AbstractGPUArrayStyle{N} end
+ROCArrayStyle(::Val{N}) where N = ROCArrayStyle{N}()
+ROCArrayStyle{M}(::Val{N}) where {N,M} = ROCArrayStyle{N}()
 
-Get the native address of a ROCArray, optionally at a given location `index`.
-Equivalent of `Base.pointer` on `Array`s.
-"""
-function buffer(xs::ROCArray, index=1)
-  extra_offset = (index-1) * Base.elsize(xs)
-  Mem.Buffer(xs.buf.ptr + xs.offset + extra_offset,
-             sizeof(xs) - extra_offset,
-             xs.buf.agent)
-end
+BroadcastStyle(::Type{ROCArray{T,N}}) where {T,N} = ROCArrayStyle{N}()
 
-Base.cconvert(::Type{Ptr{T}}, x::ROCArray{T}) where T = buffer(x)
-Base.cconvert(::Type{Ptr{Nothing}}, x::ROCArray) = buffer(x)
+# Allocating the output container
+Base.similar(bc::Broadcasted{ROCArrayStyle{N}}, ::Type{T}) where {N,T} =
+    similar(ROCArray{T}, axes(bc))
+Base.similar(bc::Broadcasted{ROCArrayStyle{N}}, ::Type{T}, dims...) where {N,T} =
+    ROCArray{T}(undef, dims...)
 
 
-## interop with AMDGPU
+## memory operations
 
-function Base.convert(::Type{ROCDeviceArray{T,N,AS.Global}}, a::ROCArray{T,N}) where {T,N}
-  ptr = Base.unsafe_convert(Ptr{T}, a.buf)
-  ROCDeviceArray{T,N,AS.Global}(a.dims, DevicePtr{T,AS.Global}(ptr+a.offset))
+function Base.copyto!(dest::Array{T}, d_offset::Integer,
+                      source::ROCArray{T}, s_offset::Integer,
+                      amount::Integer) where T
+    @boundscheck checkbounds(dest, d_offset+amount-1)
+    @boundscheck checkbounds(source, s_offset+amount-1)
+    Mem.download!(pointer(dest, d_offset),
+                  Mem.view(source.buf, (s_offset-1)*sizeof(T)),
+                  amount*sizeof(T))
+    dest
+end
+function Base.copyto!(dest::ROCArray{T}, d_offset::Integer,
+                      source::Array{T}, s_offset::Integer,
+                      amount::Integer) where T
+    @boundscheck checkbounds(dest, d_offset+amount-1)
+    @boundscheck checkbounds(source, s_offset+amount-1)
+    Mem.upload!(Mem.view(dest.buf, (d_offset-1)*sizeof(T)),
+                pointer(source, s_offset),
+                amount*sizeof(T))
+    dest
+end
+function Base.copyto!(dest::ROCArray{T}, d_offset::Integer,
+                      source::ROCArray{T}, s_offset::Integer,
+                      amount::Integer) where T
+    @boundscheck checkbounds(dest, d_offset+amount-1)
+    @boundscheck checkbounds(source, s_offset+amount-1)
+    Mem.transfer!(Mem.view(dest.buf, (d_offset-1)*sizeof(T)),
+                  Mem.view(source.buf, (s_offset-1)*sizeof(T)),
+                  amount*sizeof(T))
+    dest
 end
 
-Adapt.adapt_storage(::AMDGPU.Adaptor, xs::ROCArray{T,N}) where {T,N} =
-  convert(ROCDeviceArray{T,N,AS.Global}, xs)
-
-
-
-## interop with CPU array
-
-# We don't convert isbits types in `adapt`, since they are already
-# considered GPU-compatible.
-
-Adapt.adapt_storage(::Type{<:ROCArray}, xs::AbstractArray) =
-  isbits(xs) ? xs : convert(ROCArray, xs)
-
-Adapt.adapt_storage(::Type{<:ROCArray{T}}, xs::AbstractArray{<:Real}) where T <: AbstractFloat =
-  isbits(xs) ? xs : convert(ROCArray{T}, xs)
+# TODO: Workaround for hanging copy() broadcast kernel
+function Base.copy(X::ROCArray{T}) where T
+    Xnew = ROCArray{T}(undef, size(X))
+    copyto!(Xnew, 1, X, 1, length(X))
+    Xnew
+end
 
-Adapt.adapt_storage(::Type{<:Array}, xs::ROCArray) = convert(Array, xs)
+## fft
 
-Base.collect(x::ROCArray{T,N}) where {T,N} = copyto!(Array{T,N}(undef, size(x)), x)
+#=
+using AbstractFFTs
 
-function Base.unsafe_copyto!(dest::ROCArray{T}, doffs, src::Array{T}, soffs, n) where T
-  Mem.upload!(buffer(dest, doffs), pointer(src, soffs), n*sizeof(T))
-  return dest
-end
+# defining our own plan type is the easiest way to pass around the plans in FFTW interface
+# without ambiguities
 
-function Base.unsafe_copyto!(dest::Array{T}, doffs, src::ROCArray{T}, soffs, n) where T
-  Mem.download!(pointer(dest, doffs), buffer(src, soffs), n*sizeof(T))
-  return dest
+struct FFTPlan{T}
+    p::T
 end
 
-function Base.unsafe_copyto!(dest::ROCArray{T}, doffs, src::ROCArray{T}, soffs, n) where T
-  Mem.transfer!(buffer(dest, doffs), buffer(src, soffs), n*sizeof(T))
-  return dest
-end
+AbstractFFTs.plan_fft(A::ROCArray; kw_args...) = FFTPlan(plan_fft(A.data; kw_args...))
+AbstractFFTs.plan_fft!(A::ROCArray; kw_args...) = FFTPlan(plan_fft!(A.data; kw_args...))
+AbstractFFTs.plan_bfft!(A::ROCArray; kw_args...) = FFTPlan(plan_bfft!(A.data; kw_args...))
+AbstractFFTs.plan_bfft(A::ROCArray; kw_args...) = FFTPlan(plan_bfft(A.data; kw_args...))
+AbstractFFTs.plan_ifft!(A::ROCArray; kw_args...) = FFTPlan(plan_ifft!(A.data; kw_args...))
+AbstractFFTs.plan_ifft(A::ROCArray; kw_args...) = FFTPlan(plan_ifft(A.data; kw_args...))
 
-function Base.deepcopy_internal(x::ROCArray, dict::IdDict)
-  haskey(dict, x) && return dict[x]::typeof(x)
-  return dict[x] = copy(x)
+function Base.:(*)(plan::FFTPlan, A::ROCArray)
+    x = plan.p * A.data
+    ROCArray(x)
 end
+=#
 
 
-## utilities
+## GPUArrays interfaces
 
-roc(xs) = adapt(ROCArray{Float32}, xs)
-Base.getindex(::typeof(roc), xs...) = ROCArray([xs...])
+GPUArrays.device(x::ROCArray) = x.buf.agent
 
-roczeros(T::Type, dims...) = fill!(ROCArray{T}(undef, dims...), 0)
-rocones(T::Type, dims...) = fill!(ROCArray{T}(undef, dims...), 1)
-roczeros(dims...) = roczeros(Float32, dims...)
-rocones(dims...) = rocones(Float32, dims...)
-rocfill(v, dims...) = fill!(ROCArray{typeof(v)}(undef, dims...), v)
-rocfill(v, dims::Dims) = fill!(ROCArray{typeof(v)}(undef, dims...), v)
+GPUArrays.backend(::Type{<:ROCArray}) = ROCArrayBackend()
 
-# optimized implementation of `fill!` for types that are directly supported by memset
-const MemsetTypes = Dict(1=>UInt8, 2=>UInt16, 4=>UInt32)
-const MemsetCompatTypes = Union{UInt8, Int8,
-                                UInt16, Int16, Float16,
-                                UInt32, Int32, Float32}
-function Base.fill!(A::ROCArray{T}, x) where T <: MemsetCompatTypes
-  y = reinterpret(MemsetTypes[sizeof(T)], convert(T, x))
-  Mem.set!(buffer(A), y, length(A))
-  A
+function Base.convert(::Type{ROCDeviceArray{T,N,AS.Global}}, a::ROCArray{T,N}) where {T,N}
+    ptr = Base.unsafe_convert(Ptr{T}, a.buf)
+    ROCDeviceArray{T,N,AS.Global}(a.dims, AMDGPU.DevicePtr{T,AS.Global}(ptr+a.offset))
 end
+Adapt.adapt_storage(::AMDGPU.Adaptor, x::ROCArray{T,N}) where {T,N} =
+    convert(ROCDeviceArray{T,N,AS.Global}, x)
 
-
-## generic linear algebra routines
-
-function LinearAlgebra.tril!(A::ROCMatrix{T}, d::Integer = 0) where T
-  function kernel!(_A, _d)
-    li = (blockIdx().x - 1) * blockDim().x + threadIdx().x
-    m, n = size(_A)
-    if 0 < li <= m*n
-      i, j = Tuple(CartesianIndices(_A)[li])
-      if i < j - _d
-        _A[i, j] = 0
-      end
-    end
-    return nothing
-  end
-
-  grid, group = rocdims(A)
-  @roc gridsize=grid groupsize=group kernel!(A, d)
-  return A
+function GPUArrays.unsafe_reinterpret(::Type{T}, A::ROCArray, size::NTuple{N, Integer}; own=A.own) where {T, N}
+    ptr = convert(AMDGPU.DevicePtr{T,AS.Global}, A.buf.ptr)
+    buf = Mem.Buffer(ptr, A.buf.bytesize, A.buf.agent, A.buf.coherent)
+    ROCArray{T,N}(buf, size; offset=A.offset, own=own)
 end
+Base.unsafe_convert(::Type{Ptr{T}}, x::ROCArray{T}) where T =
+    Base.unsafe_convert(Ptr{T}, x.buf)
 
-function LinearAlgebra.triu!(A::ROCMatrix{T}, d::Integer = 0) where T
-  function kernel!(_A, _d)
-    li = (blockIdx().x - 1) * blockDim().x + threadIdx().x
-    m, n = size(_A)
-    if 0 < li <= m*n
-      i, j = Tuple(CartesianIndices(_A)[li])
-      if j < i + _d
-        _A[i, j] = 0
-      end
+#=
+function GPUArrays.mapreducedim!(f, op, R::ROCArray, A::AbstractArray, init=nothing)
+    if init !== nothing
+        fill!(R, init)
     end
-    return nothing
-  end
-
-  grid, group = rocdims(A)
-  @roc gridsize=grid groupsize=group kernel!(A, d)
-  return A
+    @allowscalar Base.mapreducedim!(f, op, R.data, A)
 end
 =#
diff --git a/src/blas/highlevel.jl b/src/blas/highlevel.jl
index af2c543d1..d67957256 100644
--- a/src/blas/highlevel.jl
+++ b/src/blas/highlevel.jl
@@ -43,7 +43,7 @@ function LinearAlgebra.BLAS.dotu(DX::ROCArray{T}, DY::ROCArray{T}) where T<:Unio
     dotu(n, DX, 1, DY, 1)
 end
 
-#LinearAlgebra.norm(x::ROCBLASArray) = nrm2(x)
+# FIXME: LinearAlgebra.norm(x::ROCBLASArray) = nrm2(x)
 LinearAlgebra.BLAS.asum(x::ROCBLASArray) = asum(length(x), x, 1)
 
 function LinearAlgebra.axpy!(alpha::Number, x::ROCArray{T}, y::ROCArray{T}) where T<:ROCBLASFloat
@@ -51,8 +51,10 @@ function LinearAlgebra.axpy!(alpha::Number, x::ROCArray{T}, y::ROCArray{T}) wher
     axpy!(length(x), convert(T,alpha), x, 1, y, 1)
 end
 
+#= FIXME
 Base.argmin(xs::ROCBLASArray{<:ROCBLASReal}) = iamin(xs)
 Base.argmax(xs::ROCBLASArray{<:ROCBLASReal}) = iamax(xs)
+=#
 
 
 
diff --git a/src/blas/librocblas.jl b/src/blas/librocblas.jl
index 3f109f7aa..456525ebd 100644
--- a/src/blas/librocblas.jl
+++ b/src/blas/librocblas.jl
@@ -64,7 +64,7 @@ function rocblas_dscal(handle, n, alpha::Cdouble, x::ROCArray, incx)
         @check ccall((:rocblas_dscal, "librocblas"),
                      rocblas_status_t,
                      (rocblas_handle, rocblas_int, Ptr{Cdouble}, Ptr{Cdouble}, rocblas_int),
-                     handle, n, ref_alpha, x.handle, incx)
+                     handle, n, ref_alpha, pointer(x), incx)
     end
 end
 function rocblas_sscal(handle, n, alpha::Cfloat, x::ROCArray, incx)
@@ -73,7 +73,7 @@ function rocblas_sscal(handle, n, alpha::Cfloat, x::ROCArray, incx)
         @check ccall((:rocblas_sscal, "librocblas"),
                      rocblas_status_t,
                      (rocblas_handle, rocblas_int, Ptr{Cfloat}, Ptr{Cfloat}, rocblas_int),
-                     handle, n, ref_alpha, x.handle, incx)
+                     handle, n, ref_alpha, pointer(x), incx)
     end
 end
 
@@ -81,39 +81,39 @@ function rocblas_dcopy(handle, n, x::ROCArray, incx, y::ROCArray, incy)
     @check ccall((:rocblas_dcopy, "librocblas"),
                  rocblas_status_t,
                  (rocblas_handle, rocblas_int, Ptr{Cdouble}, rocblas_int, Ptr{Cdouble}, rocblas_int),
-                 handle, n, x.handle, incx, y.handle, incy)
+                 handle, n, pointer(x), incx, pointer(y), incy)
 end
 function rocblas_scopy(handle, n, x::ROCArray, incx, y::ROCArray, incy)
     @check ccall((:rocblas_scopy, "librocblas"),
                  rocblas_status_t,
                  (rocblas_handle, rocblas_int, Ptr{Cfloat}, rocblas_int, Ptr{Cfloat}, rocblas_int),
-                 handle, n, x.handle, incx, y.handle, incy)
+                 handle, n, pointer(x), incx, pointer(y), incy)
 end
 
 function rocblas_ddot(handle, n, x::ROCArray, incx, y::ROCArray, incy, result)
     @check ccall((:rocblas_ddot, "librocblas"),
                  rocblas_status_t,
                  (rocblas_handle, rocblas_int, Ptr{Cdouble}, rocblas_int, Ptr{Cdouble}, rocblas_int, Ptr{Cdouble}),
-                 handle, n, x.handle, incx, y.handle, incy, result)
+                 handle, n, pointer(x), incx, pointer(y), incy, result)
 end
 function rocblas_sdot(handle, n, x::ROCArray, incx, y::ROCArray, incy, result)
     @check ccall((:rocblas_sdot, "librocblas"),
                  rocblas_status_t,
                  (rocblas_handle, rocblas_int, Ptr{Cfloat}, rocblas_int, Ptr{Cfloat}, rocblas_int, Ptr{Cfloat}),
-                 handle, n, x.handle, incx, y.handle, incy, result)
+                 handle, n, pointer(x), incx, pointer(y), incy, result)
 end
 
 function rocblas_dswap(handle, n, x::ROCArray, incx, y::ROCArray, incy)
     @check ccall((:rocblas_dswap, "librocblas"),
                  rocblas_status_t,
                  (rocblas_handle, rocblas_int, Ptr{Cdouble}, rocblas_int, Ptr{Cdouble}, rocblas_int),
-                 handle, n, x.handle, incx, y.handle, incy)
+                 handle, n, pointer(x), incx, pointer(y), incy)
 end
 function rocblas_sswap(handle, n, x::ROCArray, incx, y::ROCArray, incy)
     @check ccall((:rocblas_sswap, "librocblas"),
                  rocblas_status_t,
                  (rocblas_handle, rocblas_int, Ptr{Cfloat}, rocblas_int, Ptr{Cfloat}, rocblas_int),
-                 handle, n, x.handle, incx, y.handle, incy)
+                 handle, n, pointer(x), incx, pointer(y), incy)
 end
 
 ## Level 2 BLAS
@@ -125,7 +125,7 @@ function rocblas_dgemv(handle, trans::rocblas_operation_t, m::rocblas_int, n::ro
         @check ccall((:rocblas_dgemv, "librocblas"),
                      rocblas_status_t,
                      (rocblas_handle, rocblas_operation_t, rocblas_int, rocblas_int, Ptr{Cdouble}, Ptr{Cdouble}, rocblas_int, Ptr{Cdouble},rocblas_int, Ptr{Cdouble}, Ptr{Cdouble}, rocblas_int),
-                     handle, trans, m, n, ref_alpha, A.handle, lda, x.handle, incx, ref_beta, y.handle, incy)
+                     handle, trans, m, n, ref_alpha, pointer(A), lda, pointer(x), incx, ref_beta, pointer(y), incy)
     end
 end
 function rocblas_sgemv(handle, trans::rocblas_operation_t, m::rocblas_int, n::rocblas_int, alpha::Cfloat, A::ROCMatrix, lda::rocblas_int, x::ROCVector, incx::rocblas_int, beta::Cfloat, y::ROCVector, incy::rocblas_int)
@@ -135,6 +135,6 @@ function rocblas_sgemv(handle, trans::rocblas_operation_t, m::rocblas_int, n::ro
         @check ccall((:rocblas_sgemv, "librocblas"),
                      rocblas_status_t,
                      (rocblas_handle, rocblas_operation_t, rocblas_int, rocblas_int, Ptr{Cfloat}, Ptr{Cfloat}, rocblas_int, Ptr{Cfloat},rocblas_int, Ptr{Cfloat}, Ptr{Cfloat}, rocblas_int),
-                     handle, trans, m, n, ref_alpha, A.handle, lda, x.handle, incx, ref_beta, y.handle, incy)
+                     handle, trans, m, n, ref_alpha, pointer(A), lda, pointer(x), incx, ref_beta, pointer(y), incy)
     end
 end
diff --git a/src/blas/wrappers.jl b/src/blas/wrappers.jl
index d4e50e38d..cbc8d06f3 100644
--- a/src/blas/wrappers.jl
+++ b/src/blas/wrappers.jl
@@ -237,6 +237,7 @@ function axpy!(alpha::Ta,
     y
 end
 
+#= FIXME
 ## iamax
 # TODO: fix iamax in julia base
 for (fname, elty) in ((:rocblasIdamax,:Float64),
@@ -278,6 +279,7 @@ for (fname, elty) in ((:rocblasIdamin,:Float64),
     end
 end
 iamin(dx::ROCArray) = iamin(length(dx), dx, 1)
+=#
 
 # Level 2
 ## mv
diff --git a/src/device/array.jl b/src/device/array.jl
index 1c7bb7b56..fe921872b 100644
--- a/src/device/array.jl
+++ b/src/device/array.jl
@@ -63,7 +63,6 @@ Base.length(g::ROCDeviceArray) = prod(g.shape)
 Base.unsafe_convert(::Type{DevicePtr{T,A}}, a::ROCDeviceArray{T,N,A}) where {T,A,N} = pointer(a)
 
 # indexing
-# FIXME: Boundschecking
 
 @inline function Base.getindex(A::ROCDeviceArray{T}, index::Integer) where {T}
     @boundscheck checkbounds(A, index)
@@ -78,6 +77,8 @@ end
     return A
 end
 
+Base.IndexStyle(::Type{<:ROCDeviceArray}) = Base.IndexLinear()
+
 # other
 
 Base.show(io::IO, a::ROCDeviceVector) =
diff --git a/src/device/gcn/indexing.jl b/src/device/gcn/indexing.jl
index fad70e9b1..7a6f65e04 100644
--- a/src/device/gcn/indexing.jl
+++ b/src/device/gcn/indexing.jl
@@ -110,7 +110,7 @@ for (dim,off) in ((:x,1), (:y,2), (:z,3))
     # Grid dimension (in workgroups)
     fn_wg = Symbol("gridDimWG_$dim")
     fn_wi_idx = Symbol("workitemIdx_$dim")
-    @eval @inline $fn_wg() = $fn()/$fn_wi_idx()
+    @eval @inline $fn_wg() = div($fn(), $fn_wi_idx())
 end
 
 """
diff --git a/src/fft/fft.jl b/src/fft/fft.jl
index 8c4743078..c5c6fc18d 100644
--- a/src/fft/fft.jl
+++ b/src/fft/fft.jl
@@ -220,7 +220,7 @@ function create_plan(xtype::rocfft_transform_type, xdims, T, inplace, region)
     end
     rocfft_plan_get_work_buffer_size(handle_ref[], worksize_ref)
     # TODO allow empty array in ../array.jl
-    workarea = worksize_ref[]>0 ? ROCArray(Int8, (Int(worksize_ref[]),)) : ROCArray(Int8, (1,))
+    workarea = worksize_ref[]>0 ? ROCArray{Int8}(undef, (Int(worksize_ref[]),)) : ROCArray{Int8}(undef, (1,))
     return handle_ref[], workarea
 end
 
@@ -275,7 +275,7 @@ end
 # FIXME: plan_inv methods allocate needlessly (to provide type parameters and normalization function)
 # Perhaps use FakeArray types to avoid this.
 function plan_inv(p::cROCFFTPlan{T,ROCFFT_FORWARD,inplace,N}) where {T<:rocfftComplexes,N,inplace}
-    X = ROCArray(T, p.sz)
+    X = ROCArray{T}(undef, p.sz)
     xtype = rocfft_transform_type_complex_inverse
     pp = create_plan(xtype, p.sz, T, inplace, p.region)
     ScaledPlan(cROCFFTPlan{T,ROCFFT_INVERSE,inplace,N}(pp..., X, p.sz, xtype, p.region),
@@ -283,7 +283,7 @@ function plan_inv(p::cROCFFTPlan{T,ROCFFT_FORWARD,inplace,N}) where {T<:rocfftCo
 end
 
 function plan_inv(p::cROCFFTPlan{T,ROCFFT_INVERSE,inplace,N}) where {T<:rocfftComplexes,N,inplace}
-    X = ROCArray(T, p.sz)
+    X = ROCArray{T}(undef, p.sz)
     xtype = rocfft_transform_type_complex_forward
     pp = create_plan(xtype, p.sz, T, inplace, p.region)
     ScaledPlan(cROCFFTPlan{T,ROCFFT_FORWARD,inplace,N}(pp..., X, p.sz, xtype, p.region),
@@ -291,8 +291,8 @@ function plan_inv(p::cROCFFTPlan{T,ROCFFT_INVERSE,inplace,N}) where {T<:rocfftCo
 end
 
 function plan_inv(p::rROCFFTPlan{T,ROCFFT_FORWARD,inplace,N}) where {T<:rocfftReals,N,inplace}
-    X = ROCArray(complex(T), p.osz)
-    Y = ROCArray(T, p.sz)
+    X = ROCArray{complex(T)}(undef, p.osz)
+    Y = ROCArray{T}(undef, p.sz)
     xtype = rocfft_transform_type_real_inverse
     pp = create_plan(xtype, p.sz, T, inplace, p.region)
     scale = normalization(Y, p.region)
@@ -300,7 +300,7 @@ function plan_inv(p::rROCFFTPlan{T,ROCFFT_FORWARD,inplace,N}) where {T<:rocfftRe
 end
 
 function plan_inv(p::rROCFFTPlan{T,ROCFFT_INVERSE,inplace,N}) where {T<:rocfftComplexes,N,inplace}
-    X = ROCArray(real(T), p.osz)
+    X = ROCArray{real(T)}(undef, p.osz)
     xtype = rocfft_transform_type_real_forward
     pp = create_plan(xtype, p.osz, T, inplace, p.region)
     scale = normalization(X, p.region)
@@ -332,24 +332,24 @@ function assert_applicable(p::ROCFFTPlan{T,K}, X::ROCArray{T}, Y::ROCArray{Ty})
 end
 
 function unsafe_execute!(plan::cROCFFTPlan{T,K,true,N}, X::ROCArray{T,N}) where {T,K,N}
-    rocfft_execute(plan, [X.handle,], C_NULL, plan.execution_info)
+    rocfft_execute(plan, [pointer(X),], C_NULL, plan.execution_info)
 end
 
 function unsafe_execute!(plan::cROCFFTPlan{T,K,false,N}, X::ROCArray{T,N}, Y::ROCArray{T}) where {T,N,K}
     Xcopy = copy(X) # since input array can also be modified
-    rocfft_execute(plan, [Xcopy.handle,], [Y.handle,], plan.execution_info)
+    rocfft_execute(plan, [pointer(Xcopy),], [pointer(Y),], plan.execution_info)
 end
 
 function unsafe_execute!(plan::rROCFFTPlan{T,ROCFFT_FORWARD,false,N}, X::ROCArray{T,N}, Y::ROCArray{<:rocfftComplexes,N}) where {T<:rocfftReals,N}
     @assert plan.xtype == rocfft_transform_type_real_forward
     Xcopy = copy(X)
-    rocfft_execute(plan, [Xcopy.handle,], [Y.handle,], plan.execution_info)
+    rocfft_execute(plan, [pointer(Xcopy),], [pointer(Y),], plan.execution_info)
 end
 
 function unsafe_execute!(plan::rROCFFTPlan{T,ROCFFT_INVERSE,false,N}, X::ROCArray{T,N}, Y::ROCArray{<:rocfftReals,N}) where {T<:rocfftComplexes,N}
     @assert plan.xtype == rocfft_transform_type_real_inverse
     Xcopy = copy(X)
-    rocfft_execute(plan, [Xcopy.handle,], [Y.handle,], plan.execution_info)
+    rocfft_execute(plan, [pointer(Xcopy),], [pointer(Y),], plan.execution_info)
 end
 
 
@@ -369,18 +369,18 @@ function Base.:(*)(p::cROCFFTPlan{T,K,true,N}, x::ROCArray{T,N}) where {T,K,N}
 end
 
 function Base.:(*)(p::cROCFFTPlan{T,K,false,N}, x::ROCArray{T,N}) where {T,K,N}
-    y = ROCArray(T, p.osz)
+    y = ROCArray{T}(undef, p.osz)
     mul!(y, p, x)
 end
 
 function Base.:(*)(p::rROCFFTPlan{T,ROCFFT_FORWARD,false,N}, x::ROCArray{T,N}) where {T<:rocfftReals,N}
     @assert p.xtype == rocfft_transform_type_real_forward
-    y = ROCArray(complex(T), p.osz)
+    y = ROCArray{complex(T)}(undef, p.osz)
     mul!(y, p, x)
 end
 
 function Base.:(*)(p::rROCFFTPlan{T,ROCFFT_INVERSE,false,N}, x::ROCArray{T,N}) where {T<:rocfftComplexes,N}
     @assert p.xtype == rocfft_transform_type_real_inverse
-    y = ROCArray(real(T), p.osz)
+    y = ROCArray{real(T)}(undef, p.osz)
     mul!(y, p, x)
 end
diff --git a/src/memory.jl b/src/memory.jl
index 464a1896d..13b2bcd4a 100644
--- a/src/memory.jl
+++ b/src/memory.jl
@@ -26,7 +26,7 @@ Base.unsafe_convert(::Type{Ptr{T}}, buf::Buffer) where {T} = convert(Ptr{T}, buf
 
 function view(buf::Buffer, bytes::Int)
     bytes > buf.bytesize && throw(BoundsError(buf, bytes))
-    return Buffer(buf.ptr+bytes, buf.bytesize-bytes, buf.agent)
+    return Buffer(buf.ptr+bytes, buf.bytesize-bytes, buf.agent, buf.coherent)
 end
 
 ## refcounting
diff --git a/test/rocarray/blas.jl b/test/rocarray/blas.jl
index 93dc12d57..6fa7e04e3 100644
--- a/test/rocarray/blas.jl
+++ b/test/rocarray/blas.jl
@@ -1,4 +1,7 @@
+@testset "BLAS" begin
+
 using AMDGPU.rocBLAS
+using AMDGPU.HIP
 import .rocBLAS: rocblas_int
 
 handle = rocBLAS.handle()
@@ -12,8 +15,8 @@ end
     for T in (Float32, Float64)
         A = rand(T, 8, 8)
         x = rand(T, 8)
-        RA = ROCArray(agent, A)
-        Rx = ROCArray(agent, x)
+        RA = ROCArray(A)
+        Rx = ROCArray(x)
         Rb = RA*Rx
         _b = Array(Rb)
         @test isapprox(A*x, _b)
@@ -24,12 +27,13 @@ end
     @testset "scal()" begin
         for T in (Float32, Float64)
             A = rand(T, 8)
-            RA = ROCArray(agent, A)
+            RA = ROCArray(A)
             if T === Float32
                 rocBLAS.rocblas_sscal(handle, 8, 5f0, RA, 1)
             else
                 rocBLAS.rocblas_dscal(handle, 8, 5.0, RA, 1)
             end
+            HIP.hipDeviceSynchronize()
             _A = Array(RA)
             @test isapprox(A .* 5, _A)
         end
@@ -38,13 +42,14 @@ end
         for T in (Float32, Float64)
             A = rand(T, 8)
             B = rand(T, 8)
-            RA = ROCArray(agent, A)
-            RB = ROCArray(agent, B)
+            RA = ROCArray(A)
+            RB = ROCArray(B)
             if T === Float32
                 rocBLAS.rocblas_scopy(handle, 8, RA, 1, RB, 1)
             else
                 rocBLAS.rocblas_dcopy(handle, 8, RA, 1, RB, 1)
             end
+            HIP.hipDeviceSynchronize()
             _A = Array(RA)
             _B = Array(RB)
             @test isapprox(A, _A)
@@ -56,14 +61,15 @@ end
             A = rand(T, 8)
             B = rand(T, 8)
             result = zeros(T, 8)
-            RA = ROCArray(agent, A)
-            RB = ROCArray(agent, B)
+            RA = ROCArray(A)
+            RB = ROCArray(B)
             result = Ref{T}(zero(T))
             if T === Float32
                 rocBLAS.rocblas_sdot(handle, 8, RA, 1, RB, 1, result)
             else
                 rocBLAS.rocblas_ddot(handle, 8, RA, 1, RB, 1, result)
             end
+            HIP.hipDeviceSynchronize()
             @test isapprox(LinearAlgebra.dot(A,B), result[])
         end
     end
@@ -72,13 +78,14 @@ end
             A = rand(T, 8)
             B = rand(T, 8)
             result = zeros(T, 8)
-            RA = ROCArray(agent, A)
-            RB = ROCArray(agent, B)
+            RA = ROCArray(A)
+            RB = ROCArray(B)
             if T === Float32
                 rocBLAS.rocblas_sswap(handle, 8, RA, 1, RB, 1)
             else
                 rocBLAS.rocblas_dswap(handle, 8, RA, 1, RB, 1)
             end
+            HIP.hipDeviceSynchronize()
             _A = Array(RA)
             _B = Array(RB)
             @test isapprox(A, _B)
@@ -91,11 +98,11 @@ end
     @testset "gemv()" begin
         for T in (Float32, Float64)
             A = rand(T, 8, 4)
-            RA = ROCArray(agent, A)
+            RA = ROCArray(A)
             x = rand(T, 4)
-            Rx = ROCArray(agent, x)
+            Rx = ROCArray(x)
             y = zeros(T, 8)
-            Ry = ROCArray(agent, y)
+            Ry = ROCArray(y)
             op = rocBLAS.ROCBLAS_OPERATION_NONE
             m, n = rocblas_int.(size(A))
             lda = m
@@ -105,6 +112,7 @@ end
             else
                 rocBLAS.rocblas_dgemv(handle, op, m, n, 5.0, RA, lda, Rx, incx, 0.0, Ry, incy)
             end
+            HIP.hipDeviceSynchronize()
             _A = Array(RA)
             _x = Array(Rx)
             _y = Array(Ry)
@@ -112,3 +120,5 @@ end
         end
     end
 end
+
+end # testset BLAS
diff --git a/test/rocarray/fft.jl b/test/rocarray/fft.jl
index ca071a68e..9c4da0395 100644
--- a/test/rocarray/fft.jl
+++ b/test/rocarray/fft.jl
@@ -1,6 +1,7 @@
 @testset "FFT" begin
 
-using AMDGPU: rocFFT, HIP
+using AMDGPU.rocFFT
+using AMDGPU.HIP
 using FFTW
 
 N1 = 8
@@ -22,7 +23,7 @@ end
 
 function out_of_place(X::AbstractArray{T,N}) where {T <: Complex,N}
     fftw_X = fft(X)
-    d_X = ROCArray(T, size(X))
+    d_X = ROCArray{T}(undef, size(X))
     copyto!(d_X, X)
     p = plan_fft(d_X)
     Y = zeros(T, p.osz)
@@ -43,7 +44,7 @@ end
 
 function in_place(X::AbstractArray{T,N}) where {T <: Complex,N}
     fftw_X = fft(X)
-    d_X = ROCArray(T, size(X))
+    d_X = ROCArray{T}(undef, size(X))
     copyto!(d_X, X)
     p = plan_fft!(d_X)
     p * d_X
@@ -58,7 +59,7 @@ end
 
 function batched(X::AbstractArray{T,N},region) where {T <: Complex,N}
     fftw_X = fft(X,region)
-    d_X = ROCArray(T, size(X))
+    d_X = ROCArray{T}(undef, size(X))
     copyto!(d_X, X)
     p = plan_fft!(d_X,region)
     p * d_X
@@ -157,7 +158,7 @@ end # testset Complex
 
 function out_of_place(X::AbstractArray{T,N}) where {T <: Real,N}
     fftw_X = rfft(X)
-    d_X = ROCArray(T, size(X))
+    d_X = ROCArray{T}(undef, size(X))
     copyto!(d_X, X)
     p = plan_rfft(d_X)
     d_Y = p * d_X
@@ -182,7 +183,7 @@ end
 
 function batched(X::AbstractArray{T,N},region) where {T <: Real,N}
     fftw_X = rfft(X,region)
-    d_X = ROCArray(T, size(X))
+    d_X = ROCArray{T}(undef, size(X))
     copyto!(d_X, X)
     p = plan_rfft(d_X,region)
     d_Y = p * d_X
@@ -232,7 +233,7 @@ end
     dims = (N1,N2,N3)
     for region in [(1,2),(2,3),(1,3)]
         X = rand(T, dims)
-        batched(X,region)
+        @test_skip batched(X,region)
     end
 
     X = rand(T, dims)
@@ -243,7 +244,7 @@ end
     dims = (N1,N2,N3,N4)
     for region in [(1,2),(1,4),(3,4)]
         X = rand(T, dims)
-        batched(X,region)
+        @test_skip batched(X,region)
     end
     for region in [(1,3),(2,3),(2,4)]
         X = rand(T, dims)
diff --git a/test/runtests.jl b/test/runtests.jl
index 9911663b3..1d43ff307 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -7,23 +7,20 @@ using InteractiveUtils
 using SpecialFunctions
 using Test
 
-#using Pkg
-#Pkg.add(PackageSpec(;name="GPUCompiler",rev="master"))
+using Random
+Random.seed!(1)
 
 include("util.jl")
 
-# copy-pasta from GPUArrays/src/testsuite.jl
-convert_array(f, x) = f(x)
-convert_array(f, x::Base.RefValue) = x[]
-function compare(f, AT::Type{ROCArray}, xs...; kwargs...)
-    cpu_in = convert_array.(copy, xs)
-    gpu_in = convert_array.(AT, xs)
-    cpu_out = f(cpu_in...; kwargs...)
-    gpu_out = f(gpu_in...; kwargs...)
-    collect(cpu_out) ≈ collect(gpu_out)
-end
-# copy-pasta from CuArrays/test/runtests.jl
-testf(f, xs...; kwargs...) = compare(f, ROCArray, xs...; kwargs...)
+# GPUArrays has a testsuite that isn't part of the main package.
+# Include it directly.
+import GPUArrays
+gpuarrays = pathof(GPUArrays)
+gpuarrays_root = dirname(dirname(gpuarrays))
+include(joinpath(gpuarrays_root, "test", "testsuite.jl"))
+
+import AMDGPU: allowscalar, @allowscalar
+allowscalar(false)
 
 agent_name = AMDGPU.get_name(get_default_agent())
 agent_isa = get_first_isa(get_default_agent())
@@ -49,7 +46,7 @@ if AMDGPU.configured
             include("codegen/synchronization.jl")
             include("codegen/trap.jl")
         end
-        @testset "Device" begin
+        @testset "Device Functions" begin
             include("device/launch.jl")
             include("device/vadd.jl")
             include("device/memory.jl")
@@ -62,6 +59,16 @@ if AMDGPU.configured
             include("device/exceptions.jl")
         end
         @testset "ROCArray" begin
+            @testset "GPUArrays test suite" begin
+                #TestSuite.test(ROCArray)
+                for name in keys(TestSuite.tests)
+                    occursin("broadcast", name) && continue
+                    name == "linear algebra" && continue
+                    @testset "$name" begin
+                        TestSuite.tests[name](ROCArray)
+                    end
+                end
+            end
             @testset "ROCm External Libraries" begin
                 isdefined(AMDGPU, :rocBLAS) ? include("rocarray/blas.jl") : @test_skip "rocBLAS"
                 isdefined(AMDGPU, :rocFFT) ? include("rocarray/fft.jl") : @test_skip "rocFFT"