From daf72125210e2b28e1d39330cd0e72af24424358 Mon Sep 17 00:00:00 2001 From: Arno Strouwen Date: Sat, 23 Sep 2023 02:15:16 +0200 Subject: [PATCH] format --- benchmarks/applelu.jl | 28 +++++++++--- benchmarks/cudalu.jl | 20 ++++++--- benchmarks/lu.jl | 29 +++++++++--- benchmarks/metallu.jl | 20 ++++++--- docs/src/solvers/solvers.md | 18 ++++---- ext/LinearSolveBlockDiagonalsExt.jl | 2 +- ext/LinearSolveKernelAbstractionsExt.jl | 2 +- ext/LinearSolveMKLExt.jl | 59 ++++++++++++++++--------- ext/LinearSolveMetalExt.jl | 2 +- src/LinearSolve.jl | 5 ++- src/appleaccelerate.jl | 56 ++++++++++++++--------- src/extension_algs.jl | 2 +- test/basictests.jl | 9 ++-- test/gpu/cuda.jl | 4 +- test/resolve.jl | 11 +++-- 15 files changed, 180 insertions(+), 87 deletions(-) diff --git a/benchmarks/applelu.jl b/benchmarks/applelu.jl index 58ef00b2c..c346ff0c9 100644 --- a/benchmarks/applelu.jl +++ b/benchmarks/applelu.jl @@ -18,7 +18,13 @@ function luflop(m, n = m; innerflop = 2) end end -algs = [LUFactorization(), GenericLUFactorization(), RFLUFactorization(), AppleAccelerateLUFactorization(), MetalLUFactorization()] +algs = [ + LUFactorization(), + GenericLUFactorization(), + RFLUFactorization(), + AppleAccelerateLUFactorization(), + MetalLUFactorization(), +] res = [Float32[] for i in 1:length(algs)] ns = 4:8:500 @@ -28,10 +34,14 @@ for i in 1:length(ns) rng = MersenneTwister(123) global A = rand(rng, Float32, n, n) global b = rand(rng, Float32, n) - global u0= rand(rng, Float32, n) - + global u0 = rand(rng, Float32, n) + for j in 1:length(algs) - bt = @belapsed solve(prob, $(algs[j])).u setup=(prob = LinearProblem(copy(A), copy(b); u0 = copy(u0), alias_A=true, alias_b=true)) + bt = @belapsed solve(prob, $(algs[j])).u setup=(prob = LinearProblem(copy(A), + copy(b); + u0 = copy(u0), + alias_A = true, + alias_b = true)) push!(res[j], luflop(n) / bt / 1e9) end end @@ -41,11 +51,17 @@ __parameterless_type(T) = Base.typename(T).wrapper parameterless_type(x) = __parameterless_type(typeof(x)) parameterless_type(::Type{T}) where {T} = __parameterless_type(T) -p = plot(ns, res[1]; ylabel = "GFLOPs", xlabel = "N", title = "GFLOPs for NxN LU Factorization", label = string(Symbol(parameterless_type(algs[1]))), legend=:outertopright) +p = plot(ns, + res[1]; + ylabel = "GFLOPs", + xlabel = "N", + title = "GFLOPs for NxN LU Factorization", + label = string(Symbol(parameterless_type(algs[1]))), + legend = :outertopright) for i in 2:length(res) plot!(p, ns, res[i]; label = string(Symbol(parameterless_type(algs[i])))) end p savefig("metallubench.png") -savefig("metallubench.pdf") \ No newline at end of file +savefig("metallubench.pdf") diff --git a/benchmarks/cudalu.jl b/benchmarks/cudalu.jl index b0186f2ec..c1b89fc1c 100644 --- a/benchmarks/cudalu.jl +++ b/benchmarks/cudalu.jl @@ -28,10 +28,14 @@ for i in 1:length(ns) rng = MersenneTwister(123) global A = rand(rng, Float32, n, n) global b = rand(rng, Float32, n) - global u0= rand(rng, Float32, n) - + global u0 = rand(rng, Float32, n) + for j in 1:length(algs) - bt = @belapsed solve(prob, $(algs[j])).u setup=(prob = LinearProblem(copy(A), copy(b); u0 = copy(u0), alias_A=true, alias_b=true)) + bt = @belapsed solve(prob, $(algs[j])).u setup=(prob = LinearProblem(copy(A), + copy(b); + u0 = copy(u0), + alias_A = true, + alias_b = true)) push!(res[j], luflop(n) / bt / 1e9) end end @@ -41,11 +45,17 @@ __parameterless_type(T) = Base.typename(T).wrapper parameterless_type(x) = __parameterless_type(typeof(x)) parameterless_type(::Type{T}) where {T} = __parameterless_type(T) -p = 
plot(ns, res[1]; ylabel = "GFLOPs", xlabel = "N", title = "GFLOPs for NxN LU Factorization", label = string(Symbol(parameterless_type(algs[1]))), legend=:outertopright) +p = plot(ns, + res[1]; + ylabel = "GFLOPs", + xlabel = "N", + title = "GFLOPs for NxN LU Factorization", + label = string(Symbol(parameterless_type(algs[1]))), + legend = :outertopright) for i in 2:length(res) plot!(p, ns, res[i]; label = string(Symbol(parameterless_type(algs[i])))) end p savefig("cudaoffloadlubench.png") -savefig("cudaoffloadlubench.pdf") \ No newline at end of file +savefig("cudaoffloadlubench.pdf") diff --git a/benchmarks/lu.jl b/benchmarks/lu.jl index c4f9a5da1..7ab624b45 100644 --- a/benchmarks/lu.jl +++ b/benchmarks/lu.jl @@ -18,7 +18,14 @@ function luflop(m, n = m; innerflop = 2) end end -algs = [LUFactorization(), GenericLUFactorization(), RFLUFactorization(), MKLLUFactorization(), FastLUFactorization(), SimpleLUFactorization()] +algs = [ + LUFactorization(), + GenericLUFactorization(), + RFLUFactorization(), + MKLLUFactorization(), + FastLUFactorization(), + SimpleLUFactorization(), +] res = [Float64[] for i in 1:length(algs)] ns = 4:8:500 @@ -28,10 +35,14 @@ for i in 1:length(ns) rng = MersenneTwister(123) global A = rand(rng, n, n) global b = rand(rng, n) - global u0= rand(rng, n) - + global u0 = rand(rng, n) + for j in 1:length(algs) - bt = @belapsed solve(prob, $(algs[j])).u setup=(prob = LinearProblem(copy(A), copy(b); u0 = copy(u0), alias_A=true, alias_b=true)) + bt = @belapsed solve(prob, $(algs[j])).u setup=(prob = LinearProblem(copy(A), + copy(b); + u0 = copy(u0), + alias_A = true, + alias_b = true)) push!(res[j], luflop(n) / bt / 1e9) end end @@ -41,11 +52,17 @@ __parameterless_type(T) = Base.typename(T).wrapper parameterless_type(x) = __parameterless_type(typeof(x)) parameterless_type(::Type{T}) where {T} = __parameterless_type(T) -p = plot(ns, res[1]; ylabel = "GFLOPs", xlabel = "N", title = "GFLOPs for NxN LU Factorization", label = string(Symbol(parameterless_type(algs[1]))), legend=:outertopright) +p = plot(ns, + res[1]; + ylabel = "GFLOPs", + xlabel = "N", + title = "GFLOPs for NxN LU Factorization", + label = string(Symbol(parameterless_type(algs[1]))), + legend = :outertopright) for i in 2:length(res) plot!(p, ns, res[i]; label = string(Symbol(parameterless_type(algs[i])))) end p savefig("lubench.png") -savefig("lubench.pdf") \ No newline at end of file +savefig("lubench.pdf") diff --git a/benchmarks/metallu.jl b/benchmarks/metallu.jl index a9614e7a4..cc7416871 100644 --- a/benchmarks/metallu.jl +++ b/benchmarks/metallu.jl @@ -28,10 +28,14 @@ for i in 1:length(ns) rng = MersenneTwister(123) global A = rand(rng, Float32, n, n) global b = rand(rng, Float32, n) - global u0= rand(rng, Float32, n) - + global u0 = rand(rng, Float32, n) + for j in 1:length(algs) - bt = @belapsed solve(prob, $(algs[j])).u setup=(prob = LinearProblem(copy(A), copy(b); u0 = copy(u0), alias_A=true, alias_b=true)) + bt = @belapsed solve(prob, $(algs[j])).u setup=(prob = LinearProblem(copy(A), + copy(b); + u0 = copy(u0), + alias_A = true, + alias_b = true)) GC.gc() push!(res[j], luflop(n) / bt / 1e9) end @@ -42,11 +46,17 @@ __parameterless_type(T) = Base.typename(T).wrapper parameterless_type(x) = __parameterless_type(typeof(x)) parameterless_type(::Type{T}) where {T} = __parameterless_type(T) -p = plot(ns, res[1]; ylabel = "GFLOPs", xlabel = "N", title = "GFLOPs for NxN LU Factorization", label = string(Symbol(parameterless_type(algs[1]))), legend=:outertopright) +p = plot(ns, + res[1]; + ylabel = "GFLOPs", 
+    xlabel = "N",
+    title = "GFLOPs for NxN LU Factorization",
+    label = string(Symbol(parameterless_type(algs[1]))),
+    legend = :outertopright)
 for i in 2:length(res)
     plot!(p, ns, res[i]; label = string(Symbol(parameterless_type(algs[i]))))
 end
 p

 savefig("metal_large_lubench.png")
-savefig("metal_large_lubench.pdf")
\ No newline at end of file
+savefig("metal_large_lubench.pdf")
diff --git a/docs/src/solvers/solvers.md b/docs/src/solvers/solvers.md
index fe8a8b6e4..76ad95fc2 100644
--- a/docs/src/solvers/solvers.md
+++ b/docs/src/solvers/solvers.md
@@ -14,15 +14,15 @@ but one may need to change this to receive more performance or precision. If mo
 re precision is necessary, `QRFactorization()` and `SVDFactorization()` are the best choices,
 with SVD being the slowest but most precise.

-For efficiency, `RFLUFactorization` is the fastest for dense LU-factorizations until around 
+For efficiency, `RFLUFactorization` is the fastest for dense LU-factorizations until around
 150x150 matrices, though this can be dependent on the exact details of the hardware. After this
 point, `MKLLUFactorization` is usually faster on most hardware. Note that on Mac computers,
 `AppleAccelerateLUFactorization` is generally the fastest. `LUFactorization` will
-use your base system BLAS which can be fast or slow depending on the hardware configuration. 
+use your base system BLAS which can be fast or slow depending on the hardware configuration.
 `SimpleLUFactorization` will be fast only on very small matrices but can cut down on compile times.

 For very large dense factorizations, offloading to the GPU can be preferred. Metal.jl can be used
-on Mac hardware to offload, and has a cutoff point of being faster at around size 20,000 x 20,000 
+on Mac hardware to offload, and has a cutoff point of being faster at around size 20,000 x 20,000
 matrices (and only supports Float32). `CudaOffloadFactorization` can be more efficient at a much
 smaller cutoff, possibly around size 1,000 x 1,000 matrices, though this is highly dependent on
 the chosen GPU hardware. `CudaOffloadFactorization` requires a CUDA-compatible NVIDIA GPU.
@@ -31,9 +31,9 @@ CUDA offload supports Float64 but most consumer GPU hardware will be much faster
 this is only recommended for Float32 matrices.

 !!! note
-    
-    Performance details for dense LU-factorizations can be highly dependent on the hardware configuration.
-    For details see [this issue](https://github.com/SciML/LinearSolve.jl/issues/357).
+
+    Performance details for dense LU-factorizations can be highly dependent on the hardware configuration.
+    For details see [this issue](https://github.com/SciML/LinearSolve.jl/issues/357).

 If one is looking to best optimize their system, we suggest running the performance
 tuning benchmark.
@@ -65,7 +65,7 @@ The interface is detailed [here](@ref custom).
 ### Lazy SciMLOperators

 If the linear operator is given as a lazy non-concrete operator, such as a `FunctionOperator`,
-then using a Krylov method is preferred in order to not concretize the matrix. 
+then using a Krylov method is preferred in order to not concretize the matrix.
 Krylov.jl generally outperforms IterativeSolvers.jl and KrylovKit.jl, and is compatible with
 CPUs and GPUs, and thus is the generally preferred form for Krylov methods. The
 choice of Krylov method should be the one most constrained to the type of operator one
@@ -73,11 +73,11 @@ has, for example if positive definite then `Krylov_CG()`, but if no good propert
 use `Krylov_GMRES()`.

 !!! tip
-    
+
     If your materialized operator is a uniform block diagonal matrix, then you can use
     `SimpleGMRES(; blocksize = <known block size>)` to further improve performance.
     This often shows up in Neural Networks where the Jacobian wrt the Inputs (almost always)
-    is a Uniform Block Diagonal matrix of Block Size = size of the input divided by the 
+    is a Uniform Block Diagonal matrix of Block Size = size of the input divided by the
     batch size.

 ## Full List of Methods
diff --git a/ext/LinearSolveBlockDiagonalsExt.jl b/ext/LinearSolveBlockDiagonalsExt.jl
index 1e9b053eb..4b1827200 100644
--- a/ext/LinearSolveBlockDiagonalsExt.jl
+++ b/ext/LinearSolveBlockDiagonalsExt.jl
@@ -4,7 +4,7 @@ using LinearSolve, BlockDiagonals

 function LinearSolve.init_cacheval(alg::SimpleGMRES{false}, A::BlockDiagonal, b, args...;
     kwargs...)
-    @assert ndims(A) == 2 "ndims(A) == $(ndims(A)). `A` must have ndims == 2."
+    @assert ndims(A)==2 "ndims(A) == $(ndims(A)). `A` must have ndims == 2."
     # We need to perform this check even when `zeroinit == true`, since the type of the
     # cache is dependent on whether we are able to use the specialized dispatch.
     bsizes = blocksizes(A)
diff --git a/ext/LinearSolveKernelAbstractionsExt.jl b/ext/LinearSolveKernelAbstractionsExt.jl
index ba620382f..52a0a10e0 100644
--- a/ext/LinearSolveKernelAbstractionsExt.jl
+++ b/ext/LinearSolveKernelAbstractionsExt.jl
@@ -9,7 +9,7 @@ using GPUArraysCore
 function LinearSolve._fast_sym_givens!(c, s, R, nr::Int, inner_iter::Int, bsize::Int,
     Hbis)
     backend = get_backend(Hbis)
     kernel! = __fast_sym_givens_kernel!(backend)
-    kernel!(c[inner_iter], s[inner_iter], R[nr + inner_iter], Hbis; ndrange=bsize)
+    kernel!(c[inner_iter], s[inner_iter], R[nr + inner_iter], Hbis; ndrange = bsize)
     return c, s, R
 end
diff --git a/ext/LinearSolveMKLExt.jl b/ext/LinearSolveMKLExt.jl
index 1ed917f58..a094abe7e 100644
--- a/ext/LinearSolveMKLExt.jl
+++ b/ext/LinearSolveMKLExt.jl
@@ -2,49 +2,60 @@ module LinearSolveMKLExt

 using MKL_jll
 using LinearAlgebra: BlasInt, LU
-using LinearAlgebra.LAPACK: require_one_based_indexing, chkfinite, chkstride1,
-    @blasfunc, chkargsok
+using LinearAlgebra.LAPACK: require_one_based_indexing,
+    chkfinite, chkstride1,
+    @blasfunc, chkargsok
 using LinearAlgebra

 const usemkl = MKL_jll.is_available()

 using LinearSolve
 using LinearSolve: ArrayInterface, MKLLUFactorization, @get_cacheval, LinearCache,
     SciMLBase

-function getrf!(A::AbstractMatrix{<:Float64}; ipiv = similar(A, BlasInt, min(size(A,1),size(A,2))), info = Ref{BlasInt}(), check = false)
+function getrf!(A::AbstractMatrix{<:Float64};
+    ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2))),
+    info = Ref{BlasInt}(),
+    check = false)
     require_one_based_indexing(A)
     check && chkfinite(A)
     chkstride1(A)
     m, n = size(A)
-    lda = max(1,stride(A, 2))
+    lda = max(1, stride(A, 2))
     if isempty(ipiv)
-        ipiv = similar(A, BlasInt, min(size(A,1),size(A,2)))
+        ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2)))
     end
     ccall((@blasfunc(dgetrf_), MKL_jll.libmkl_rt), Cvoid,
-        (Ref{BlasInt}, Ref{BlasInt}, Ptr{Float64},
+        (Ref{BlasInt}, Ref{BlasInt}, Ptr{Float64},
             Ref{BlasInt}, Ptr{BlasInt}, Ptr{BlasInt}),
-        m, n, A, lda, ipiv, info)
+        m, n, A, lda, ipiv, info)
     chkargsok(info[])
     A, ipiv, info[], info #Error code is stored in LU factorization type
 end

-function getrf!(A::AbstractMatrix{<:Float32}; ipiv = similar(A, BlasInt, min(size(A,1),size(A,2))), info = Ref{BlasInt}(), check = false)
+function getrf!(A::AbstractMatrix{<:Float32};
+    ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2))),
+    info = Ref{BlasInt}(),
+    check = false)
require_one_based_indexing(A) check && chkfinite(A) chkstride1(A) m, n = size(A) - lda = max(1,stride(A, 2)) + lda = max(1, stride(A, 2)) if isempty(ipiv) - ipiv = similar(A, BlasInt, min(size(A,1),size(A,2))) + ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2))) end ccall((@blasfunc(sgetrf_), MKL_jll.libmkl_rt), Cvoid, - (Ref{BlasInt}, Ref{BlasInt}, Ptr{Float32}, + (Ref{BlasInt}, Ref{BlasInt}, Ptr{Float32}, Ref{BlasInt}, Ptr{BlasInt}, Ptr{BlasInt}), - m, n, A, lda, ipiv, info) + m, n, A, lda, ipiv, info) chkargsok(info[]) A, ipiv, info[], info #Error code is stored in LU factorization type end -function getrs!(trans::AbstractChar, A::AbstractMatrix{<:Float64}, ipiv::AbstractVector{BlasInt}, B::AbstractVecOrMat{<:Float64}; info = Ref{BlasInt}()) +function getrs!(trans::AbstractChar, + A::AbstractMatrix{<:Float64}, + ipiv::AbstractVector{BlasInt}, + B::AbstractVecOrMat{<:Float64}; + info = Ref{BlasInt}()) require_one_based_indexing(A, ipiv, B) LinearAlgebra.LAPACK.chktrans(trans) chkstride1(A, B, ipiv) @@ -57,14 +68,19 @@ function getrs!(trans::AbstractChar, A::AbstractMatrix{<:Float64}, ipiv::Abstrac end nrhs = size(B, 2) ccall(("dgetrs_", MKL_jll.libmkl_rt), Cvoid, - (Ref{UInt8}, Ref{BlasInt}, Ref{BlasInt}, Ptr{Float64}, Ref{BlasInt}, - Ptr{BlasInt}, Ptr{Float64}, Ref{BlasInt}, Ptr{BlasInt}, Clong), - trans, n, size(B,2), A, max(1,stride(A,2)), ipiv, B, max(1,stride(B,2)), info, 1) + (Ref{UInt8}, Ref{BlasInt}, Ref{BlasInt}, Ptr{Float64}, Ref{BlasInt}, + Ptr{BlasInt}, Ptr{Float64}, Ref{BlasInt}, Ptr{BlasInt}, Clong), + trans, n, size(B, 2), A, max(1, stride(A, 2)), ipiv, B, max(1, stride(B, 2)), info, + 1) LinearAlgebra.LAPACK.chklapackerror(BlasInt(info[])) B end -function getrs!(trans::AbstractChar, A::AbstractMatrix{<:Float32}, ipiv::AbstractVector{BlasInt}, B::AbstractVecOrMat{<:Float32}; info = Ref{BlasInt}()) +function getrs!(trans::AbstractChar, + A::AbstractMatrix{<:Float32}, + ipiv::AbstractVector{BlasInt}, + B::AbstractVecOrMat{<:Float32}; + info = Ref{BlasInt}()) require_one_based_indexing(A, ipiv, B) LinearAlgebra.LAPACK.chktrans(trans) chkstride1(A, B, ipiv) @@ -77,9 +93,10 @@ function getrs!(trans::AbstractChar, A::AbstractMatrix{<:Float32}, ipiv::Abstrac end nrhs = size(B, 2) ccall(("sgetrs_", MKL_jll.libmkl_rt), Cvoid, - (Ref{UInt8}, Ref{BlasInt}, Ref{BlasInt}, Ptr{Float32}, Ref{BlasInt}, - Ptr{BlasInt}, Ptr{Float32}, Ref{BlasInt}, Ptr{BlasInt}, Clong), - trans, n, size(B,2), A, max(1,stride(A,2)), ipiv, B, max(1,stride(B,2)), info, 1) + (Ref{UInt8}, Ref{BlasInt}, Ref{BlasInt}, Ptr{Float32}, Ref{BlasInt}, + Ptr{BlasInt}, Ptr{Float32}, Ref{BlasInt}, Ptr{BlasInt}, Clong), + trans, n, size(B, 2), A, max(1, stride(A, 2)), ipiv, B, max(1, stride(B, 2)), info, + 1) LinearAlgebra.LAPACK.chklapackerror(BlasInt(info[])) B end @@ -125,4 +142,4 @@ function SciMLBase.solve!(cache::LinearCache, alg::MKLLUFactorization; =# end -end \ No newline at end of file +end diff --git a/ext/LinearSolveMetalExt.jl b/ext/LinearSolveMetalExt.jl index 4ee07dc39..2e963f514 100644 --- a/ext/LinearSolveMetalExt.jl +++ b/ext/LinearSolveMetalExt.jl @@ -28,4 +28,4 @@ function SciMLBase.solve!(cache::LinearCache, alg::MetalLUFactorization; SciMLBase.build_linear_solution(alg, y, nothing, cache) end -end \ No newline at end of file +end diff --git a/src/LinearSolve.jl b/src/LinearSolve.jl index 644c86288..7e96f0203 100644 --- a/src/LinearSolve.jl +++ b/src/LinearSolve.jl @@ -28,8 +28,9 @@ PrecompileTools.@recompile_invalidations begin import InteractiveUtils using LinearAlgebra: BlasInt, LU - using 
LinearAlgebra.LAPACK: require_one_based_indexing, chkfinite, chkstride1, - @blasfunc, chkargsok + using LinearAlgebra.LAPACK: require_one_based_indexing, + chkfinite, chkstride1, + @blasfunc, chkargsok import GPUArraysCore import Preferences diff --git a/src/appleaccelerate.jl b/src/appleaccelerate.jl index 0b7ea7cae..f1310ec16 100644 --- a/src/appleaccelerate.jl +++ b/src/appleaccelerate.jl @@ -2,7 +2,7 @@ using LinearAlgebra using Libdl # For now, only use BLAS from Accelerate (that is to say, vecLib) -global const libacc = "/System/Library/Frameworks/Accelerate.framework/Accelerate" +const global libacc = "/System/Library/Frameworks/Accelerate.framework/Accelerate" """ ```julia @@ -26,43 +26,53 @@ function appleaccelerate_isavailable() return true end -function aa_getrf!(A::AbstractMatrix{<:Float64}; ipiv = similar(A, Cint, min(size(A,1),size(A,2))), info = Ref{Cint}(), check = false) +function aa_getrf!(A::AbstractMatrix{<:Float64}; + ipiv = similar(A, Cint, min(size(A, 1), size(A, 2))), + info = Ref{Cint}(), + check = false) require_one_based_indexing(A) check && chkfinite(A) chkstride1(A) m, n = size(A) - lda = max(1,stride(A, 2)) + lda = max(1, stride(A, 2)) if isempty(ipiv) - ipiv = similar(A, Cint, min(size(A,1),size(A,2))) + ipiv = similar(A, Cint, min(size(A, 1), size(A, 2))) end ccall(("dgetrf_", libacc), Cvoid, - (Ref{Cint}, Ref{Cint}, Ptr{Float64}, + (Ref{Cint}, Ref{Cint}, Ptr{Float64}, Ref{Cint}, Ptr{Cint}, Ptr{Cint}), - m, n, A, lda, ipiv, info) + m, n, A, lda, ipiv, info) info[] < 0 && throw(ArgumentError("Invalid arguments sent to LAPACK dgetrf_")) A, ipiv, BlasInt(info[]), info #Error code is stored in LU factorization type end -function aa_getrf!(A::AbstractMatrix{<:Float32}; ipiv = similar(A, Cint, min(size(A,1),size(A,2))), info = Ref{Cint}(), check = false) +function aa_getrf!(A::AbstractMatrix{<:Float32}; + ipiv = similar(A, Cint, min(size(A, 1), size(A, 2))), + info = Ref{Cint}(), + check = false) require_one_based_indexing(A) check && chkfinite(A) chkstride1(A) m, n = size(A) - lda = max(1,stride(A, 2)) + lda = max(1, stride(A, 2)) if isempty(ipiv) - ipiv = similar(A, Cint, min(size(A,1),size(A,2))) + ipiv = similar(A, Cint, min(size(A, 1), size(A, 2))) end ccall(("sgetrf_", libacc), Cvoid, - (Ref{Cint}, Ref{Cint}, Ptr{Float32}, + (Ref{Cint}, Ref{Cint}, Ptr{Float32}, Ref{Cint}, Ptr{Cint}, Ptr{Cint}), - m, n, A, lda, ipiv, info) + m, n, A, lda, ipiv, info) info[] < 0 && throw(ArgumentError("Invalid arguments sent to LAPACK dgetrf_")) A, ipiv, BlasInt(info[]), info #Error code is stored in LU factorization type end -function aa_getrs!(trans::AbstractChar, A::AbstractMatrix{<:Float64}, ipiv::AbstractVector{Cint}, B::AbstractVecOrMat{<:Float64}; info = Ref{Cint}()) +function aa_getrs!(trans::AbstractChar, + A::AbstractMatrix{<:Float64}, + ipiv::AbstractVector{Cint}, + B::AbstractVecOrMat{<:Float64}; + info = Ref{Cint}()) require_one_based_indexing(A, ipiv, B) LinearAlgebra.LAPACK.chktrans(trans) chkstride1(A, B, ipiv) @@ -75,14 +85,19 @@ function aa_getrs!(trans::AbstractChar, A::AbstractMatrix{<:Float64}, ipiv::Abst end nrhs = size(B, 2) ccall(("dgetrs_", libacc), Cvoid, - (Ref{UInt8}, Ref{Cint}, Ref{Cint}, Ptr{Float64}, Ref{Cint}, - Ptr{Cint}, Ptr{Float64}, Ref{Cint}, Ptr{Cint}, Clong), - trans, n, size(B,2), A, max(1,stride(A,2)), ipiv, B, max(1,stride(B,2)), info, 1) + (Ref{UInt8}, Ref{Cint}, Ref{Cint}, Ptr{Float64}, Ref{Cint}, + Ptr{Cint}, Ptr{Float64}, Ref{Cint}, Ptr{Cint}, Clong), + trans, n, size(B, 2), A, max(1, stride(A, 2)), ipiv, B, max(1, stride(B, 2)), 
info, + 1) LinearAlgebra.LAPACK.chklapackerror(BlasInt(info[])) B end -function aa_getrs!(trans::AbstractChar, A::AbstractMatrix{<:Float32}, ipiv::AbstractVector{Cint}, B::AbstractVecOrMat{<:Float32}; info = Ref{Cint}()) +function aa_getrs!(trans::AbstractChar, + A::AbstractMatrix{<:Float32}, + ipiv::AbstractVector{Cint}, + B::AbstractVecOrMat{<:Float32}; + info = Ref{Cint}()) require_one_based_indexing(A, ipiv, B) LinearAlgebra.LAPACK.chktrans(trans) chkstride1(A, B, ipiv) @@ -95,9 +110,10 @@ function aa_getrs!(trans::AbstractChar, A::AbstractMatrix{<:Float32}, ipiv::Abst end nrhs = size(B, 2) ccall(("sgetrs_", libacc), Cvoid, - (Ref{UInt8}, Ref{Cint}, Ref{Cint}, Ptr{Float32}, Ref{Cint}, - Ptr{Cint}, Ptr{Float32}, Ref{Cint}, Ptr{Cint}, Clong), - trans, n, size(B,2), A, max(1,stride(A,2)), ipiv, B, max(1,stride(B,2)), info, 1) + (Ref{UInt8}, Ref{Cint}, Ref{Cint}, Ptr{Float32}, Ref{Cint}, + Ptr{Cint}, Ptr{Float32}, Ref{Cint}, Ptr{Cint}, Clong), + trans, n, size(B, 2), A, max(1, stride(A, 2)), ipiv, B, max(1, stride(B, 2)), info, + 1) LinearAlgebra.LAPACK.chklapackerror(BlasInt(info[])) B end @@ -109,7 +125,7 @@ function LinearSolve.init_cacheval(alg::AppleAccelerateLUFactorization, A, b, u, maxiters::Int, abstol, reltol, verbose::Bool, assumptions::OperatorAssumptions) luinst = ArrayInterface.lu_instance(convert(AbstractMatrix, A)) - LU(luinst.factors,similar(A, Cint, 0), luinst.info), Ref{Cint}() + LU(luinst.factors, similar(A, Cint, 0), luinst.info), Ref{Cint}() end function SciMLBase.solve!(cache::LinearCache, alg::AppleAccelerateLUFactorization; diff --git a/src/extension_algs.jl b/src/extension_algs.jl index 8d3e3134e..dfd6aa5b9 100644 --- a/src/extension_algs.jl +++ b/src/extension_algs.jl @@ -357,4 +357,4 @@ MetalLUFactorization() A wrapper over Apple's Metal GPU library. Direct calls to Metal in a way that pre-allocates workspace to avoid allocations and automatically offloads to the GPU. """ -struct MetalLUFactorization <: AbstractFactorization end \ No newline at end of file +struct MetalLUFactorization <: AbstractFactorization end diff --git a/test/basictests.jl b/test/basictests.jl index 42f283173..1cf3b2f49 100644 --- a/test/basictests.jl +++ b/test/basictests.jl @@ -426,7 +426,6 @@ end @testset "DirectLdiv!" 
begin function get_operator(A, u; add_inverse = true) - function f(u, p, t) println("using FunctionOperator OOP mul") A * u @@ -479,7 +478,9 @@ end end # testset # https://github.com/SciML/LinearSolve.jl/issues/347 -A = rand(4, 4); b = rand(4); u0 = zeros(4); +A = rand(4, 4); +b = rand(4); +u0 = zeros(4); lp = LinearProblem(A, b; u0 = view(u0, :)); truesol = solve(lp, LUFactorization()) krylovsol = solve(lp, KrylovJL_GMRES()) @@ -505,7 +506,7 @@ using BlockDiagonals prob1 = LinearProblem(Array(A), b, x1) prob2 = LinearProblem(Array(A), b, x2) - test_interface(SimpleGMRES(; blocksize=2), prob1, prob2) + test_interface(SimpleGMRES(; blocksize = 2), prob1, prob2) - @test solve(prob1, SimpleGMRES(; blocksize=2)).u ≈ solve(prob2, SimpleGMRES()).u + @test solve(prob1, SimpleGMRES(; blocksize = 2)).u ≈ solve(prob2, SimpleGMRES()).u end diff --git a/test/gpu/cuda.jl b/test/gpu/cuda.jl index 042576300..7402275eb 100644 --- a/test/gpu/cuda.jl +++ b/test/gpu/cuda.jl @@ -67,7 +67,7 @@ using BlockDiagonals prob1 = LinearProblem(A, b, x1) prob2 = LinearProblem(A, b, x2) - test_interface(SimpleGMRES(; blocksize=2), prob1, prob2) + test_interface(SimpleGMRES(; blocksize = 2), prob1, prob2) - @test solve(prob1, SimpleGMRES(; blocksize=2)).u ≈ solve(prob2, SimpleGMRES()).u + @test solve(prob1, SimpleGMRES(; blocksize = 2)).u ≈ solve(prob2, SimpleGMRES()).u end diff --git a/test/resolve.jl b/test/resolve.jl index d49d5b1b6..ca5147b7e 100644 --- a/test/resolve.jl +++ b/test/resolve.jl @@ -2,9 +2,14 @@ using LinearSolve, LinearAlgebra, SparseArrays, InteractiveUtils, Test for alg in subtypes(LinearSolve.AbstractFactorization) @show alg - if !(alg in [DiagonalFactorization, CudaOffloadFactorization, AppleAccelerateLUFactorization, MetalLUFactorization]) && - (!(alg == AppleAccelerateLUFactorization) || LinearSolve.appleaccelerate_isavailable()) - + if !(alg in [ + DiagonalFactorization, + CudaOffloadFactorization, + AppleAccelerateLUFactorization, + MetalLUFactorization, + ]) && + (!(alg == AppleAccelerateLUFactorization) || + LinearSolve.appleaccelerate_isavailable()) A = [1.0 2.0; 3.0 4.0] alg in [KLUFactorization, UMFPACKFactorization, SparspakFactorization] && (A = sparse(A))
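
The benchmark scripts touched above (benchmarks/lu.jl, applelu.jl, cudalu.jl, metallu.jl) all share one timing pattern: `@belapsed` with a `setup` block that rebuilds the `LinearProblem` from fresh copies, with `alias_A = true` and `alias_b = true` so the solver may factorize in place, and the elapsed time converted to GFLOPs. A minimal sketch of one such measurement for a single size and algorithm, using the leading-order `2n^3/3` flop count in place of the full `luflop` helper (which the hunks above only reference):

```julia
using BenchmarkTools, LinearSolve, Random

rng = MersenneTwister(123)
n = 100
A = rand(rng, n, n)
b = rand(rng, n)
u0 = rand(rng, n)
alg = LUFactorization()

# `setup` hands each sample a fresh problem; the aliasing flags let the
# LU factorization overwrite that copy instead of allocating another one.
bt = @belapsed solve(prob, $alg).u setup=(prob = LinearProblem(copy($A),
    copy($b); u0 = copy($u0), alias_A = true, alias_b = true))

gflops = (2 / 3 * n^3) / bt / 1e9  # leading-order flop count for dense LU
```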
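The decision rules in the docs/src/solvers/solvers.md hunk can be condensed into a small dispatcher. This is a sketch under the docs' own rules of thumb, not a tuned policy: the 150x150 crossover is hardware-dependent, the `default_dense_alg` helper is hypothetical, and `MKLLUFactorization` assumes MKL is usable on the host (on Macs the docs recommend `AppleAccelerateLUFactorization` instead).

```julia
using LinearSolve

# Hypothetical rule-of-thumb dispatcher based on the docs section above.
function default_dense_alg(A)
    if size(A, 1) <= 150
        RFLUFactorization()   # fastest for small dense matrices
    else
        MKLLUFactorization()  # usually faster past ~150x150
    end
end

A = rand(200, 200)
b = rand(200)
sol = solve(LinearProblem(A, b), default_dense_alg(A))
sol.u  # the solution vector
```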
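The `getrf!`/`getrs!` pairs in ext/LinearSolveMKLExt.jl and src/appleaccelerate.jl both implement the standard two-step LAPACK protocol: `*getrf_` overwrites `A` with its LU factors and fills a pivot vector, then `*getrs_` back-substitutes a right-hand side against those factors. The same protocol can be exercised through LinearAlgebra's stock LAPACK bindings, shown here instead of the extensions' internal helpers (which are not exported):

```julia
using LinearAlgebra

A = rand(4, 4)
b = rand(4)

F = copy(A)  # getrf! overwrites its argument with the L and U factors
F, ipiv, info = LinearAlgebra.LAPACK.getrf!(F)
info == 0 || error("LU factorization failed with code $info")

x = LinearAlgebra.LAPACK.getrs!('N', F, ipiv, copy(b))  # solves A*x = b
x ≈ A \ b  # true
```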
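Finally, the `SimpleGMRES(; blocksize = <known block size>)` tip from the docs corresponds directly to the new BlockDiagonals tests in test/basictests.jl and test/gpu/cuda.jl. A sketch along the same lines, assuming BlockDiagonals.jl is available; the `2I` shift is only there to keep the random blocks well-conditioned:

```julia
using LinearSolve, BlockDiagonals, LinearAlgebra

# A uniform block diagonal matrix: four 2x2 blocks, so blocksize = 2.
A = BlockDiagonal([rand(2, 2) + 2I for _ in 1:4])
b = rand(8)
prob = LinearProblem(Array(A), b)

sol_block = solve(prob, SimpleGMRES(; blocksize = 2))  # specialized path
sol_plain = solve(prob, SimpleGMRES())
sol_block.u ≈ sol_plain.u  # mirrors the equivalence asserted by the tests
```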