diff --git a/README.md b/README.md index 86857dbf3..2d2abf35b 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # Krylov.jl: A Julia basket of hand-picked Krylov methods | **Documentation** | **CI** | **Coverage** | **DOI** | **Downloads** | -|:-----------------:|:-------------------------------:|:------------:|:-------:|:-------------:| +|:-----------------:|:------:|:------------:|:-------:|:-------------:| | [![docs-stable][docs-stable-img]][docs-stable-url] [![docs-dev][docs-dev-img]][docs-dev-url] | [![build-gh][build-gh-img]][build-gh-url] [![build-cirrus][build-cirrus-img]][build-cirrus-url] | [![codecov][codecov-img]][codecov-url] | [![doi][doi-img]][doi-url] | [![downloads][downloads-img]][downloads-url] | [docs-stable-img]: https://img.shields.io/badge/docs-stable-blue.svg diff --git a/docs/make.jl b/docs/make.jl index 132b8bbd2..0595a9398 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -12,6 +12,7 @@ makedocs( pages = ["Home" => "index.md", "API" => "api.md", "Krylov processes" => "processes.md", + "Block Krylov processes" => "block_processes.md", "Krylov methods" => ["Hermitian positive definite linear systems" => "solvers/spd.md", "Hermitian indefinite linear systems" => "solvers/sid.md", "Non-Hermitian square linear systems" => "solvers/unsymmetric.md", diff --git a/docs/src/block_processes.md b/docs/src/block_processes.md new file mode 100644 index 000000000..d6e3e5995 --- /dev/null +++ b/docs/src/block_processes.md @@ -0,0 +1,227 @@ +# [Block Krylov processes](@id block-krylov-processes) + +## [Block Hermitian Lanczos](@id block-hermitian-lanczos) + +If the vector $b$ in the [Hermitian Lanczos](@ref hermitian-lanczos) process is replaced by a matrix $B$ with $p$ columns, we can derive the block Hermitian Lanczos process. + +![block_hermitian_lanczos](./graphics/block_hermitian_lanczos.png) + +After $k$ iterations of the block Hermitian Lanczos process, the situation may be summarized as +```math +\begin{align*} + A V_k &= V_{k+1} T_{k+1,k}, \\ + V_k^H V_k &= I_{pk}, +\end{align*} +``` +where $V_k$ is an orthonormal basis of the block Krylov subspace $\mathcal{K}_k^{\square}(A,B)$, +```math +T_{k+1,k} = +\begin{bmatrix} + \Omega_1 & \Psi_2^H & & \\ + \Psi_2 & \Omega_2 & \ddots & \\ + & \ddots & \ddots & \Psi_k^H \\ + & & \Psi_k & \Omega_k \\ + & & & \Psi_{k+1} +\end{bmatrix}. +``` + +The function [`hermitian_lanczos`](@ref hermitian_lanczos(::Any, ::AbstractMatrix{FC}, ::Int) where FC <: (Union{Complex{T}, T} where T <: AbstractFloat)) returns $V_{k+1}$, $\Psi_1$ and $T_{k+1,k}$. + +```@docs +hermitian_lanczos(::Any, ::AbstractMatrix{FC}, ::Int) where FC <: (Union{Complex{T}, T} where T <: AbstractFloat) +``` + +## [Block Non-Hermitian Lanczos](@id block-nonhermitian-lanczos) + +If the vectors $b$ and $c$ in the [non-Hermitian Lanczos](@ref nonhermitian-lanczos) process are replaced by matrices $B$ and $C$ with both $p$ columns, we can derive the block non-Hermitian Lanczos process. + +![block_nonhermitian_lanczos](./graphics/block_nonhermitian_lanczos.png) + +After $k$ iterations of the block non-Hermitian Lanczos process, the situation may be summarized as +```math +\begin{align*} + A V_k &= V_{k+1} T_{k+1,k}, \\ + A^H U_k &= U_{k+1} T_{k,k+1}^H, \\ + V_k^H U_k &= U_k^H V_k = I_{pk}, +\end{align*} +``` +where $V_k$ and $U_k$ are bases of the block Krylov subspaces $\mathcal{K}^{\square}_k(A,B)$ and $\mathcal{K}^{\square}_k (A^H,C)$, respectively, +```math +T_{k+1,k} = +\begin{bmatrix} + \Omega_1 & \Phi_2 & & \\ + \Psi_2 & \Omega_2 & \ddots & \\ + & \ddots & \ddots & \Phi_k \\ + & & \Psi_k & \Omega_k \\ + & & & \Psi_{k+1} +\end{bmatrix} +, \qquad +T_{k,k+1}^H = +\begin{bmatrix} + \Omega_1^H & \Psi_2^H & & \\ + \Phi_2^H & \Omega_2^H & \ddots & \\ + & \ddots & \ddots & \Psi_k^H \\ + & & \Phi_k^H & \Omega_k^H \\ + & & & \Phi_{k+1}^H +\end{bmatrix}. +``` + +The function [`nonhermitian_lanczos`](@ref nonhermitian_lanczos(::Any, ::AbstractMatrix{FC}, ::AbstractMatrix{FC}, ::Int) where FC <: (Union{Complex{T}, T} where T <: AbstractFloat)) returns $V_{k+1}$, $\Psi_1$, $T_{k+1,k}$, $U_{k+1}$ $\Phi_1^H$ and $T_{k,k+1}^H$. + +```@docs +nonhermitian_lanczos(::Any, ::AbstractMatrix{FC}, ::AbstractMatrix{FC}, ::Int) where FC <: (Union{Complex{T}, T} where T <: AbstractFloat) +``` + +## [Block Arnoldi](@id block-arnoldi) + +If the vector $b$ in the [Arnoldi](@ref arnoldi) process is replaced by a matrix $B$ with $p$ columns, we can derive the block Arnoldi process. + +![block_arnoldi](./graphics/block_arnoldi.png) + +After $k$ iterations of the block Arnoldi process, the situation may be summarized as +```math +\begin{align*} + A V_k &= V_{k+1} H_{k+1,k}, \\ + V_k^H V_k &= I_{pk}, +\end{align*} +``` +where $V_k$ is an orthonormal basis of the block Krylov subspace $\mathcal{K}_k^{\square}(A,B)$, +```math +H_{k+1,k} = +\begin{bmatrix} + \Psi_{1,1}~ & \Psi_{1,2}~ & \ldots & \Psi_{1,k} \\ + \Psi_{2,1}~ & \ddots~ & \ddots & \vdots \\ + & \ddots~ & \ddots & \Psi_{k-1,k} \\ + & & \Psi_{k,k-1} & \Psi_{k,k} \\ + & & & \Psi_{k+1,k} +\end{bmatrix}. +``` + +The function [`arnoldi`](@ref arnoldi(::Any, ::AbstractMatrix{FC}, ::Int) where FC <: (Union{Complex{T}, T} where T <: AbstractFloat)) returns $V_{k+1}$, $\Gamma$, and $H_{k+1,k}$. + +```@docs +arnoldi(::Any, ::AbstractMatrix{FC}, ::Int) where FC <: (Union{Complex{T}, T} where T <: AbstractFloat) +``` + +## [Block Golub-Kahan](@id block-golub-kahan) + +If the vector $b$ in the [Golub-Kahan](@ref golub-kahan) process is replaced by a matrix $B$ with $p$ columns, we can derive the block Golub-Kahan process. + +![block_golub_kahan](./graphics/block_golub_kahan.png) + +After $k$ iterations of the block Golub-Kahan process, the situation may be summarized as +```math +\begin{align*} + A V_k &= U_{k+1} B_k, \\ + A^H U_{k+1} &= V_{k+1} L_{k+1}^H, \\ + V_k^H V_k &= U_k^H U_k = I_{pk}, +\end{align*} +``` +where $V_k$ and $U_k$ are bases of the block Krylov subspaces $\mathcal{K}_k^{\square}(A^HA,A^HB)$ and $\mathcal{K}_k^{\square}(AA^H,B)$, respectively, +```math +B_k = +\begin{bmatrix} + \Omega_1 & & & \\ + \Psi_2 & \Omega_2 & & \\ + & \ddots & \ddots & \\ + & & \Psi_k & \Omega_k \\ + & & & \Psi_{k+1} \\ +\end{bmatrix} +, \qquad +L_{k+1}^H = +\begin{bmatrix} + \Omega_1^H & \Psi_2^H & & & \\ + & \Omega_2^H & \ddots & & \\ + & & \ddots & \Psi_k^H & \\ + & & & \Omega_k^H & \Psi_{k+1}^H \\ + & & & & \Omega_{k+1}^H \\ +\end{bmatrix}. +``` + +The function [`golub_kahan`](@ref golub_kahan(::Any, ::AbstractMatrix{FC}, ::Int) where FC <: (Union{Complex{T}, T} where T <: AbstractFloat)) returns $V_{k+1}$, $U_{k+1}$, $\Psi_1$ and $L_{k+1}$. + +```@docs +golub_kahan(::Any, ::AbstractMatrix{FC}, ::Int) where FC <: (Union{Complex{T}, T} where T <: AbstractFloat) +``` + +## [Block Saunders-Simon-Yip](@id block-saunders-simon-yip) + +If the vectors $b$ and $c$ in the [Saunders-Simon-Yip](@ref saunders-simon-yip) process are replaced by matrices $B$ and $C$ with both $p$ columns, we can derive the block Saunders-Simon-Yip process. + +![block_saunders_simon_yip](./graphics/block_saunders_simon_yip.png) + +After $k$ iterations of the block Saunders-Simon-Yip process, the situation may be summarized as +```math +\begin{align*} + A U_k &= V_{k+1} T_{k+1,k}, \\ + A^H V_k &= U_{k+1} T_{k,k+1}^H, \\ + V_k^H V_k &= U_k^H U_k = I_{pk}, +\end{align*} +``` +where $\begin{bmatrix} V_k & 0 \\ 0 & U_k \end{bmatrix}$ is an orthonormal basis of the block Krylov subspace $\mathcal{K}^{\square}_k \left(\begin{bmatrix} 0 & A \\ A^H & 0 \end{bmatrix}, \begin{bmatrix} B & 0 \\ 0 & C \end{bmatrix}\right)$, +```math +T_{k+1,k} = +\begin{bmatrix} + \Omega_1 & \Phi_2 & & \\ + \Psi_2 & \Omega_2 & \ddots & \\ + & \ddots & \ddots & \Phi_k \\ + & & \Psi_k & \Omega_k \\ + & & & \Psi_{k+1} +\end{bmatrix} +, \qquad +T_{k,k+1}^H = +\begin{bmatrix} + \Omega_1^H & \Psi_2^H & & \\ + \Phi_2^H & \Omega_2^H & \ddots & \\ + & \ddots & \ddots & \Psi_k^H \\ + & & \Phi_k^H & \Omega_k^H \\ + & & & \Phi_{k+1}^H +\end{bmatrix}. +``` + +The function [`saunders_simon_yip`](@ref saunders_simon_yip(::Any, ::AbstractMatrix{FC}, ::AbstractMatrix{FC}, ::Int) where FC <: (Union{Complex{T}, T} where T <: AbstractFloat)) returns $V_{k+1}$, $\Psi_1$, $T_{k+1,k}$, $U_{k+1}$, $\Phi_1^H$ and $T_{k,k+1}^H$. + +```@docs +saunders_simon_yip(::Any, ::AbstractMatrix{FC}, ::AbstractMatrix{FC}, ::Int) where FC <: (Union{Complex{T}, T} where T <: AbstractFloat) +``` + +## [Block Montoison-Orban](@id block-montoison-orban) + +If the vectors $b$ and $c$ in the [Montoison-Orban](@ref montoison-orban) process are replaced by matrices $D$ and $C$ with both $p$ columns, we can derive the block Montoison-Orban process. + +![block_montoison_orban](./graphics/block_montoison_orban.png) + +After $k$ iterations of the block Montoison-Orban process, the situation may be summarized as +```math +\begin{align*} + A U_k &= V_{k+1} H_{k+1,k}, \\ + B V_k &= U_{k+1} F_{k+1,k}, \\ + V_k^H V_k &= U_k^H U_k = I_{pk}, +\end{align*} +``` +where $\begin{bmatrix} V_k & 0 \\ 0 & U_k \end{bmatrix}$ is an orthonormal basis of the block Krylov subspace $\mathcal{K}^{\square}_k \left(\begin{bmatrix} 0 & A \\ B & 0 \end{bmatrix}, \begin{bmatrix} D & 0 \\ 0 & C \end{bmatrix}\right)$, +```math +H_{k+1,k} = +\begin{bmatrix} + \Psi_{1,1}~ & \Psi_{1,2}~ & \ldots & \Psi_{1,k} \\ + \Psi_{2,1}~ & \ddots~ & \ddots & \vdots \\ + & \ddots~ & \ddots & \Psi_{k-1,k} \\ + & & \Psi_{k,k-1} & \Psi_{k,k} \\ + & & & \Psi_{k+1,k} +\end{bmatrix} +, \qquad +F_{k+1,k} = +\begin{bmatrix} + \Phi_{1,1}~ & \Phi_{1,2}~ & \ldots & \Phi_{1,k} \\ + \Phi_{2,1}~ & \ddots~ & \ddots & \vdots \\ + & \ddots~ & \ddots & \Phi_{k-1,k} \\ + & & \Phi_{k,k-1} & \Phi_{k,k} \\ + & & & \Phi_{k+1,k} +\end{bmatrix}. +``` + +The function [`montoison_orban`](@ref montoison_orban(::Any, ::Any, ::AbstractMatrix{FC}, ::AbstractMatrix{FC}, ::Int) where FC <: (Union{Complex{T}, T} where T <: AbstractFloat)) returns $V_{k+1}$, $\Gamma$, $H_{k+1,k}$, $U_{k+1}$, $\Lambda$, and $F_{k+1,k}$. + +```@docs +montoison_orban(::Any, ::Any, ::AbstractMatrix{FC}, ::AbstractMatrix{FC}, ::Int) where FC <: (Union{Complex{T}, T} where T <: AbstractFloat) +``` diff --git a/docs/src/graphics/block_arnoldi.png b/docs/src/graphics/block_arnoldi.png new file mode 100644 index 000000000..5248046ea Binary files /dev/null and b/docs/src/graphics/block_arnoldi.png differ diff --git a/docs/src/graphics/block_golub_kahan.png b/docs/src/graphics/block_golub_kahan.png new file mode 100644 index 000000000..e749edc54 Binary files /dev/null and b/docs/src/graphics/block_golub_kahan.png differ diff --git a/docs/src/graphics/block_hermitian_lanczos.png b/docs/src/graphics/block_hermitian_lanczos.png new file mode 100644 index 000000000..d16647a84 Binary files /dev/null and b/docs/src/graphics/block_hermitian_lanczos.png differ diff --git a/docs/src/graphics/block_montoison_orban.png b/docs/src/graphics/block_montoison_orban.png new file mode 100644 index 000000000..48d4d754c Binary files /dev/null and b/docs/src/graphics/block_montoison_orban.png differ diff --git a/docs/src/graphics/block_nonhermitian_lanczos.png b/docs/src/graphics/block_nonhermitian_lanczos.png new file mode 100644 index 000000000..a18052224 Binary files /dev/null and b/docs/src/graphics/block_nonhermitian_lanczos.png differ diff --git a/docs/src/graphics/block_saunders_simon_yip.png b/docs/src/graphics/block_saunders_simon_yip.png new file mode 100644 index 000000000..96dcc4615 Binary files /dev/null and b/docs/src/graphics/block_saunders_simon_yip.png differ diff --git a/src/Krylov.jl b/src/Krylov.jl index 05069d14b..e2eb70952 100644 --- a/src/Krylov.jl +++ b/src/Krylov.jl @@ -5,7 +5,10 @@ using LinearAlgebra, SparseArrays, Printf include("krylov_utils.jl") include("krylov_stats.jl") include("krylov_solvers.jl") + +include("processes_utils.jl") include("krylov_processes.jl") +include("block_krylov_processes.jl") include("cg.jl") include("cr.jl") diff --git a/src/block_krylov_processes.jl b/src/block_krylov_processes.jl new file mode 100644 index 000000000..3848de063 --- /dev/null +++ b/src/block_krylov_processes.jl @@ -0,0 +1,666 @@ +""" + V, Ψ, T = hermitian_lanczos(A, B, k; algo="householder") + +#### Input arguments + +* `A`: a linear operator that models an Hermitian matrix of dimension n; +* `B`: a matrix of size n × p; +* `k`: the number of iterations of the block Hermitian Lanczos process. + +#### Keyword arguments + +* `algo`: the algorithm to perform reduced QR factorizations ("gs", "mgs", "givens" or "householder"). + +#### Output arguments + +* `V`: a dense n × p(k+1) matrix; +* `Ψ`: a dense p × p upper triangular matrix such that V₁Ψ = B; +* `T`: a sparse p(k+1) × pk block tridiagonal matrix with a bandwidth p. +""" +function hermitian_lanczos(A, B::AbstractMatrix{FC}, k::Int; algo::String="householder") where FC <: FloatOrComplex + m, n = size(A) + t, p = size(B) + + nnzT = p*p + (k-1)*p*(2*p+1) + div(p*(p+1), 2) + colptr = zeros(Int, p*k+1) + rowval = zeros(Int, nnzT) + nzval = zeros(FC, nnzT) + + colptr[1] = 1 + for j = 1:k*p + pos = colptr[j] + for i = max(1, j-p):j+p + rowval[pos] = i + pos += 1 + end + colptr[j+1] = pos + end + + V = zeros(FC, n, (k+1)*p) + T = SparseMatrixCSC((k+1)*p, k*p, colptr, rowval, nzval) + + α = -one(FC) + β = one(FC) + q = zeros(FC, n, p) + ψ₁ = zeros(FC, p, p) + Ωᵢ = Ψᵢ = Ψᵢ₊₁ = zeros(FC, p, p) + + for i = 1:k + pos1 = (i-1)*p + 1 + pos2 = i*p + pos3 = pos1 + p + pos4 = pos2 + p + vᵢ = view(V,:,pos1:pos2) + vᵢ₊₁ = view(V,:,pos3:pos4) + + if i == 1 + q .= B + reduced_qr!(q, ψ₁, algo) + vᵢ .= q + end + + mul!(q, A, vᵢ) + + if i ≥ 2 + pos5 = pos1 - p + pos6 = pos2 - p + vᵢ₋₁ = view(V,:,pos5:pos6) + mul!(q, vᵢ₋₁, Ψᵢ', α, β) # q = q - vᵢ₋₁ * Ψᵢᴴ + end + + mul!(Ωᵢ, vᵢ', q) # Ωᵢ = vᵢᴴ * q + mul!(q, vᵢ, Ωᵢ, α, β) # q = q - vᵢ * Ωᵢᴴ + + # Store the block Ωᵢ in Tₖ₊₁.ₖ + for ii = 1:p + indi = pos1+ii-1 + for jj = 1:p + indj = pos1+jj-1 + T[indi,indj] = Ωᵢ[ii,jj] + end + end + + reduced_qr!(q, Ψᵢ₊₁, algo) + vᵢ₊₁ .= q + + # Store the block Ψᵢ₊₁ in Tₖ₊₁.ₖ + for ii = 1:p + indi = pos3+ii-1 + for jj = 1:p + indj = pos1+jj-1 + (ii ≤ jj) && (T[indi,indj] = Ψᵢ₊₁[ii,jj]) + (ii ≤ jj) && (i < k) && (T[indj,indi] = conj(Ψᵢ₊₁[ii,jj])) + end + end + end + return V, ψ₁, T +end + +""" + V, Ψ, T, U, Φᴴ, Tᴴ = nonhermitian_lanczos(A, B, C, k) + +#### Input arguments + +* `A`: a linear operator that models a square matrix of dimension n; +* `B`: a matrix of size n × p; +* `C`: a matrix of size n × p; +* `k`: the number of iterations of the block non-Hermitian Lanczos process. + +#### Output arguments + +* `V`: a dense n × p(k+1) matrix; +* `Ψ`: a dense p × p upper triangular matrix such that V₁Ψ = B; +* `T`: a sparse p(k+1) × pk block tridiagonal matrix with a bandwidth p; +* `U`: a dense n × p(k+1) matrix; +* `Φᴴ`: a dense p × p upper triangular matrix such that U₁Φᴴ = C; +* `Tᴴ`: a sparse p(k+1) × pk block tridiagonal matrix with a bandwidth p. +""" +function nonhermitian_lanczos(A, B::AbstractMatrix{FC}, C::AbstractMatrix{FC}, k::Int) where FC <: FloatOrComplex + m, n = size(A) + t, p = size(B) + Aᴴ = A' + pivoting = VERSION < v"1.9" ? Val{false}() : NoPivot() + + nnzT = p*p + (k-1)*p*(2*p+1) + div(p*(p+1), 2) + colptr = zeros(Int, p*k+1) + rowval = zeros(Int, nnzT) + nzval_T = zeros(FC, nnzT) + nzval_Tᴴ = zeros(FC, nnzT) + + colptr[1] = 1 + for j = 1:k*p + pos = colptr[j] + for i = max(1, j-p):j+p + rowval[pos] = i + pos += 1 + end + colptr[j+1] = pos + end + + V = zeros(FC, n, (k+1)*p) + U = zeros(FC, n, (k+1)*p) + T = SparseMatrixCSC((k+1)*p, k*p, colptr, rowval, nzval_T) + Tᴴ = SparseMatrixCSC((k+1)*p, k*p, colptr, rowval, nzval_Tᴴ) + + α = -one(FC) + β = one(FC) + qᵥ = zeros(FC, n, p) + qᵤ = zeros(FC, n, p) + D = Ωᵢ = zeros(FC, p, p) + Ψ₁ = zeros(FC, p, p) + Φ₁ᴴ = zeros(FC, p, p) + + local Φᵢ, Ψᵢ + + for i = 1:k + pos1 = (i-1)*p + 1 + pos2 = i*p + pos3 = pos1 + p + pos4 = pos2 + p + vᵢ = view(V,:,pos1:pos2) + vᵢ₊₁ = view(V,:,pos3:pos4) + uᵢ = view(U,:,pos1:pos2) + uᵢ₊₁ = view(U,:,pos3:pos4) + + if i == 1 + mul!(D, C', B) # D = Cᴴ * B + F = lu(D, pivoting) + Φᵢ, Ψᵢ = F.L, F.U # Φᵢ = F.P' * Φᵢ with pivoting + Ψ₁ .= Ψᵢ + Φ₁ᴴ .= Φᵢ' + # vᵢ .= (Ψᵢ' \ B')' + # uᵢ .= (Φᵢ \ C')' + ldiv!(vᵢ', UpperTriangular(Ψᵢ)', B') + ldiv!(uᵢ', LowerTriangular(Φᵢ), C') + end + + mul!(qᵥ, A, vᵢ) + mul!(qᵤ, Aᴴ, uᵢ) + + if i ≥ 2 + pos5 = pos1 - p + pos6 = pos2 - p + vᵢ₋₁ = view(V,:,pos5:pos6) + uᵢ₋₁ = view(U,:,pos5:pos6) + + mul!(qᵥ, vᵢ₋₁, Φᵢ, α, β) # qᵥ = qᵥ - vᵢ₋₁ * Φᵢ + mul!(qᵤ, uᵢ₋₁, Ψᵢ', α, β) # qᵤ = qᵤ - uᵢ₋₁ * Ψᵢᴴ + end + + mul!(Ωᵢ, uᵢ', qᵥ) + mul!(qᵥ, vᵢ, Ωᵢ, α, β) # qᵥ = qᵥ - vᵢ * Ωᵢ + mul!(qᵤ, uᵢ, Ωᵢ', α, β) # qᵤ = qᵤ - uᵢ * Ωᵢᴴ + + # Store the block Ωᵢ in Tₖ₊₁.ₖ + for ii = 1:p + indi = pos1+ii-1 + for jj = 1:p + indj = pos1+jj-1 + T[indi,indj] = Ωᵢ[ii,jj] + Tᴴ[indi,indj] = conj(Ωᵢ[jj,ii]) + end + end + + mul!(D, qᵤ', qᵥ) # D = qᵤᴴ * qᵥ + F = lu(D, pivoting) + Φᵢ₊₁, Ψᵢ₊₁ = F.L, F.U # Φᵢ₊₁ = F.P' * Φᵢ₊₁ with pivoting + # vᵢ₊₁ .= (Ψᵢ₊₁' \ qᵥ')' + # uᵢ₊₁ .= (Φᵢ₊₁ \ qᵤ')' + ldiv!(vᵢ₊₁', UpperTriangular(Ψᵢ₊₁)', qᵥ') + ldiv!(uᵢ₊₁', LowerTriangular(Φᵢ₊₁), qᵤ') + Φᵢ = Φᵢ₊₁ + Ψᵢ = Ψᵢ₊₁ + + # Store the blocks Ψᵢ₊₁ and Φᵢ₊₁ in Tₖ₊₁.ₖ + for ii = 1:p + indi = pos3+ii-1 + for jj = 1:p + indj = pos1+jj-1 + (ii ≤ jj) && (T[indi,indj] = Ψᵢ₊₁[ii,jj]) + (ii ≤ jj) && (Tᴴ[indi,indj] = conj(Φᵢ₊₁[jj,ii])) + (ii ≤ jj) && (i < k) && (Tᴴ[indj,indi] = conj(Ψᵢ₊₁[ii,jj])) + (ii ≤ jj) && (i < k) && (T[indj,indi] = Φᵢ₊₁[jj,ii]) + end + end + end + return V, Ψ₁, T, U, Φ₁ᴴ, Tᴴ +end + +""" + V, Γ, H = arnoldi(A, B, k; algo="householder", reorthogonalization=false) + +#### Input arguments + +* `A`: a linear operator that models a square matrix of dimension n; +* `B`: a matrix of size n × p; +* `k`: the number of iterations of the block Arnoldi process. + +#### Keyword arguments + +* `algo`: the algorithm to perform reduced QR factorizations ("gs", "mgs", "givens" or "householder"). +* `reorthogonalization`: reorthogonalize the new matrices of the block Krylov basis against all previous matrices. + +#### Output arguments + +* `V`: a dense n × p(k+1) matrix; +* `Γ`: a dense p × p upper triangular matrix such that V₁Γ = B; +* `H`: a dense p(k+1) × pk block upper Hessenberg matrix with a lower bandwidth p. +""" +function arnoldi(A, B::AbstractMatrix{FC}, k::Int; algo::String="householder", reorthogonalization::Bool=false) where FC <: FloatOrComplex + m, n = size(A) + t, p = size(B) + + V = zeros(FC, n, (k+1)*p) + H = zeros(FC, (k+1)*p, k*p) + + α = -one(FC) + β = one(FC) + q = zeros(FC, n, p) + Γ = zeros(FC, p, p) + Ψᵢⱼ = Ψtmp = zeros(FC, p, p) + + for j = 1:k + pos1 = (j-1)*p + 1 + pos2 = j*p + pos3 = pos1 + p + pos4 = pos2 + p + vⱼ = view(V,:,pos1:pos2) + vⱼ₊₁ = view(V,:,pos3:pos4) + + if j == 1 + q .= B + reduced_qr!(q, Γ, algo) + vⱼ .= q + end + + mul!(q, A, vⱼ) + + for i = 1:j + pos5 = (i-1)*p + 1 + pos6 = i*p + vᵢ = view(V,:,pos5:pos6) + mul!(Ψᵢⱼ, vᵢ', q) # Ψᵢⱼ = vᵢᴴ * q + mul!(q, vᵢ, Ψᵢⱼ, α, β) # q = q - vᵢ * Ψᵢⱼ + H[pos5:pos6,pos1:pos2] .= Ψᵢⱼ + end + + if reorthogonalization + for i = 1:j + pos5 = (i-1)*p + 1 + pos6 = i*p + vᵢ = view(V,:,pos5:pos6) + mul!(Ψtmp, vᵢ', q) # Ψtmp = vᵢᴴ * q + mul!(q, vᵢ, Ψtmp, α, β) # q = q - vᵢ * Ψtmp + Hᵢⱼ = view(H,pos5:pos6,pos1:pos2) + Hᵢⱼ .= Hᵢⱼ .+ Ψtmp + end + end + + Ψ = view(H, pos3:pos4,pos1:pos2) + reduced_qr!(q, Ψ, algo) + vⱼ₊₁ .= q + end + return V, Γ, H +end + +""" + V, U, Ψ, L = golub_kahan(A, B, k; algo="householder") + +#### Input arguments + +* `A`: a linear operator that models a matrix of dimension m × n; +* `B`: a matrix of size m × p; +* `k`: the number of iterations of the block Golub-Kahan process. + +#### Keyword argument + +* `algo`: the algorithm to perform reduced QR factorizations ("gs", "mgs", "givens" or "householder"). + +#### Output arguments + +* `V`: a dense n × p(k+1) matrix; +* `U`: a dense m × p(k+1) matrix; +* `Ψ`: a dense p × p upper triangular matrix such that U₁Ψ = B; +* `L`: a sparse p(k+1) × p(k+1) block lower bidiagonal matrix with a lower bandwidth p. +""" +function golub_kahan(A, B::AbstractMatrix{FC}, k::Int; algo::String="householder") where FC <: FloatOrComplex + m, n = size(A) + t, p = size(B) + Aᴴ = A' + + nnzL = p*k*(p+1) + div(p*(p+1), 2) + colptr = zeros(Int, p*(k+1)+1) + rowval = zeros(Int, nnzL) + nzval = zeros(FC, nnzL) + + colptr[1] = 1 + for j = 1:(k+1)*p + pos = colptr[j] + for i = j:min((k+1)*p,j+p) + rowval[pos] = i + pos += 1 + end + colptr[j+1] = pos + end + + V = zeros(FC, n, (k+1)*p) + U = zeros(FC, m, (k+1)*p) + L = SparseMatrixCSC((k+1)*p, (k+1)*p, colptr, rowval, nzval) + + α = -one(FC) + β = one(FC) + qᵥ = zeros(FC, n, p) + qᵤ = zeros(FC, m, p) + Ψ₁ = zeros(FC, p, p) + Ψᵢ₊₁ = TΩᵢ = TΩᵢ₊₁ = zeros(FC, p, p) + + for i = 1:k + pos1 = (i-1)*p + 1 + pos2 = i*p + pos3 = pos1 + p + pos4 = pos2 + p + vᵢ = view(V,:,pos1:pos2) + vᵢ₊₁ = view(V,:,pos3:pos4) + uᵢ = view(U,:,pos1:pos2) + uᵢ₊₁ = view(U,:,pos3:pos4) + + if i == 1 + qᵤ .= B + reduced_qr!(qᵤ, Ψ₁, algo) + uᵢ .= qᵤ + + mul!(qᵥ, Aᴴ, uᵢ) + reduced_qr!(qᵥ, TΩᵢ, algo) + vᵢ .= qᵥ + + # Store the block Ω₁ in Lₖ₊₁.ₖ₊₁ + for ii = 1:p + indi = pos1+ii-1 + for jj = 1:p + indj = pos1+jj-1 + (ii ≤ jj) && (L[indj,indi] = conj(TΩᵢ[ii,jj])) + end + end + end + + mul!(qᵤ, A, vᵢ) + mul!(qᵤ, uᵢ, TΩᵢ', α, β) # qᵤ = qᵤ - uᵢ * Ωᵢ + + reduced_qr!(qᵤ, Ψᵢ₊₁, algo) + uᵢ₊₁ .= qᵤ + + # Store the block Ψᵢ₊₁ in Lₖ₊₁.ₖ₊₁ + for ii = 1:p + indi = pos3+ii-1 + for jj = 1:p + indj = pos1+jj-1 + (ii ≤ jj) && (L[indi,indj] = Ψᵢ₊₁[ii,jj]) + end + end + + mul!(qᵥ, Aᴴ, uᵢ₊₁) + mul!(qᵥ, vᵢ, Ψᵢ₊₁', α, β) # qᵥ = qᵥ - vᵢ * Ψᵢ₊₁ᴴ + + reduced_qr!(qᵥ, TΩᵢ₊₁, algo) + vᵢ₊₁ .= qᵥ + + # Store the block Ωᵢ₊₁ in Lₖ₊₁.ₖ₊₁ + for ii = 1:p + indi = pos3+ii-1 + for jj = 1:p + indj = pos3+jj-1 + (ii ≤ jj) && (L[indj,indi] = conj(TΩᵢ₊₁[ii,jj])) + end + end + end + return V, U, Ψ₁, L +end + +""" + V, Ψ, T, U, Φᴴ, Tᴴ = saunders_simon_yip(A, B, C, k; algo="householder") + +#### Input arguments + +* `A`: a linear operator that models a matrix of dimension m × n; +* `B`: a matrix of size m × p; +* `C`: a matrix of size n × p; +* `k`: the number of iterations of the block Saunders-Simon-Yip process. + +#### Keyword argument + +* `algo`: the algorithm to perform reduced QR factorizations ("gs", "mgs", "givens" or "householder"). + +#### Output arguments + +* `V`: a dense m × p(k+1) matrix; +* `Ψ`: a dense p × p upper triangular matrix such that V₁Ψ = B; +* `T`: a sparse p(k+1) × pk block tridiagonal matrix with a bandwidth p; +* `U`: a dense n × p(k+1) matrix; +* `Φᴴ`: a dense p × p upper triangular matrix such that U₁Φᴴ = C; +* `Tᴴ`: a sparse p(k+1) × pk block tridiagonal matrix with a bandwidth p. +""" +function saunders_simon_yip(A, B::AbstractMatrix{FC}, C::AbstractMatrix{FC}, k::Int; algo::String="householder") where FC <: FloatOrComplex + m, n = size(A) + t, p = size(B) + Aᴴ = A' + + nnzT = p*p + (k-1)*p*(2*p+1) + div(p*(p+1), 2) + colptr = zeros(Int, p*k+1) + rowval = zeros(Int, nnzT) + nzval_T = zeros(FC, nnzT) + nzval_Tᴴ = zeros(FC, nnzT) + + colptr[1] = 1 + for j = 1:k*p + pos = colptr[j] + for i = max(1, j-p):j+p + rowval[pos] = i + pos += 1 + end + colptr[j+1] = pos + end + + V = zeros(FC, m, (k+1)*p) + U = zeros(FC, n, (k+1)*p) + T = SparseMatrixCSC((k+1)*p, k*p, colptr, rowval, nzval_T) + Tᴴ = SparseMatrixCSC((k+1)*p, k*p, colptr, rowval, nzval_Tᴴ) + + α = -one(FC) + β = one(FC) + qᵥ = zeros(FC, m, p) + qᵤ = zeros(FC, n, p) + Ψ₁ = zeros(FC, p, p) + Φ₁ᴴ = zeros(FC, p, p) + Ωᵢ = Ψᵢ = Ψᵢ₊₁ = TΦᵢ = TΦᵢ₊₁ = zeros(FC, p, p) + + for i = 1:k + pos1 = (i-1)*p + 1 + pos2 = i*p + pos3 = pos1 + p + pos4 = pos2 + p + vᵢ = view(V,:,pos1:pos2) + vᵢ₊₁ = view(V,:,pos3:pos4) + uᵢ = view(U,:,pos1:pos2) + uᵢ₊₁ = view(U,:,pos3:pos4) + + if i == 1 + qᵥ .= B + reduced_qr!(qᵥ, Ψ₁, algo) + vᵢ .= qᵥ + qᵤ .= C + reduced_qr!(qᵤ, Φ₁ᴴ, algo) + uᵢ .= qᵤ + end + + mul!(qᵥ, A, uᵢ) + mul!(qᵤ, Aᴴ, vᵢ) + + if i ≥ 2 + pos5 = pos1 - p + pos6 = pos2 - p + vᵢ₋₁ = view(V,:,pos5:pos6) + uᵢ₋₁ = view(U,:,pos5:pos6) + + mul!(qᵥ, vᵢ₋₁, TΦᵢ', α, β) # qᵥ = qᵥ - vᵢ₋₁ * Φᵢ + for ii = 1:p + indi = pos1+ii-1 + for jj = 1:p + indj = pos5+jj-1 + Ψᵢ[ii,jj] = T[indi,indj] + end + end + mul!(qᵤ, uᵢ₋₁, Ψᵢ', α, β) # qᵤ = qᵤ - uᵢ₋₁ * Ψᵢᴴ + end + + mul!(Ωᵢ, vᵢ', qᵥ) + mul!(qᵥ, vᵢ, Ωᵢ, α, β) # qᵥ = qᵥ - vᵢ * Ωᵢ + mul!(qᵤ, uᵢ, Ωᵢ', α, β) # qᵤ = qᵤ - uᵢ * Ωᵢᴴ + + # Store the block Ωᵢ in Tₖ₊₁.ₖ + for ii = 1:p + indi = pos1+ii-1 + for jj = 1:p + indj = pos1+jj-1 + T[indi,indj] = Ωᵢ[ii,jj] + Tᴴ[indi,indj] = conj(Ωᵢ[jj,ii]) + end + end + + reduced_qr!(qᵥ, Ψᵢ₊₁, algo) + vᵢ₊₁ .= qᵥ + + # Store the block Ψᵢ₊₁ in Tₖ₊₁.ₖ + for ii = 1:p + indi = pos3+ii-1 + for jj = 1:p + indj = pos1+jj-1 + (ii ≤ jj) && (T[indi,indj] = Ψᵢ₊₁[ii,jj]) + (ii ≤ jj) && (i < k) && (Tᴴ[indj,indi] = conj(Ψᵢ₊₁[ii,jj])) + end + end + + reduced_qr!(qᵤ, TΦᵢ₊₁, algo) + uᵢ₊₁ .= qᵤ + + # Store the block Φᵢ₊₁ in Tₖ₊₁.ₖ + for ii = 1:p + indi = pos3+ii-1 + for jj = 1:p + indj = pos1+jj-1 + (ii ≤ jj) && (Tᴴ[indi,indj] = TΦᵢ₊₁[ii,jj]) + (ii ≤ jj) && (i < k) && (T[indj,indi] = conj(TΦᵢ₊₁[ii,jj])) + end + end + end + return V, Ψ₁, T, U, Φ₁ᴴ, Tᴴ +end + +""" + V, Γ, H, U, Λ, F = montoison_orban(A, B, D, C, k; algo="householder", reorthogonalization=false) + +#### Input arguments + +* `A`: a linear operator that models a matrix of dimension m × n; +* `B`: a linear operator that models a matrix of dimension n × m; +* `D`: a matrix of size m × p; +* `C`: a matrix of size n × p; +* `k`: the number of iterations of the block Montoison-Orban process. + +#### Keyword arguments + +* `algo`: the algorithm to perform reduced QR factorizations ("gs", "mgs", "givens" or "householder"). +* `reorthogonalization`: reorthogonalize the new matrices of the block Krylov basis against all previous matrices. + +#### Output arguments + +* `V`: a dense m × p(k+1) matrix; +* `Γ`: a dense p × p upper triangular matrix such that V₁Γ = D; +* `H`: a dense p(k+1) × pk block upper Hessenberg matrix with a lower bandwidth p; +* `U`: a dense n × p(k+1) matrix; +* `Λ`: a dense p × p upper triangular matrix such that U₁Λ = C; +* `F`: a dense p(k+1) × pk block upper Hessenberg matrix with a lower bandwidth p. +""" +function montoison_orban(A, B, D::AbstractMatrix{FC}, C::AbstractMatrix{FC}, k::Int; algo::String="householder", reorthogonalization::Bool=false) where FC <: FloatOrComplex + m, n = size(A) + t, p = size(D) + + V = zeros(FC, m, (k+1)*p) + U = zeros(FC, n, (k+1)*p) + H = zeros(FC, (k+1)*p, k*p) + F = zeros(FC, (k+1)*p, k*p) + + α = -one(FC) + β = one(FC) + qᵥ = zeros(FC, m, p) + qᵤ = zeros(FC, n, p) + Γ = zeros(FC, p, p) + Λ = zeros(FC, p, p) + Ψᵢⱼ = Φᵢⱼ = Ψtmp = Φtmp = zeros(FC, p, p) + + for j = 1:k + pos1 = (j-1)*p + 1 + pos2 = j*p + pos3 = pos1 + p + pos4 = pos2 + p + vⱼ = view(V,:,pos1:pos2) + vⱼ₊₁ = view(V,:,pos3:pos4) + uⱼ = view(U,:,pos1:pos2) + uⱼ₊₁ = view(U,:,pos3:pos4) + + if j == 1 + qᵥ .= D + reduced_qr!(qᵥ, Γ, algo) + vⱼ .= qᵥ + + qᵤ .= C + reduced_qr!(qᵤ, Λ, algo) + uⱼ .= qᵤ + end + + mul!(qᵥ, A, uⱼ) + mul!(qᵤ, B, vⱼ) + + for i = 1:j + pos5 = (i-1)*p + 1 + pos6 = i*p + vᵢ = view(V,:,pos5:pos6) + uᵢ = view(U,:,pos5:pos6) + + mul!(Ψᵢⱼ, vᵢ', qᵥ) # Ψᵢⱼ = vᵢᴴ * qᵥ + mul!(qᵥ, vᵢ, Ψᵢⱼ, α, β) # qᵥ = qᵥ - vᵢ * Ψᵢⱼ + H[pos5:pos6,pos1:pos2] .= Ψᵢⱼ + + mul!(Φᵢⱼ, uᵢ', qᵤ) # Φᵢⱼ = uᵢᴴ * qᵤ + mul!(qᵤ, uᵢ, Φᵢⱼ, α, β) # qᵤ = qᵤ - uᵢ * Φᵢⱼ + F[pos5:pos6,pos1:pos2] .= Φᵢⱼ + end + + if reorthogonalization + for i = 1:j + pos5 = (i-1)*p + 1 + pos6 = i*p + vᵢ = view(V,:,pos5:pos6) + uᵢ = view(U,:,pos5:pos6) + + mul!(Ψtmp, vᵢ', qᵥ) # Ψtmp = vᵢᴴ * qᵥ + mul!(qᵥ, vᵢ, Ψtmp, α, β) # qᵥ = qᵥ - vᵢ * Ψtmp + Hᵢⱼ = view(H,pos5:pos6,pos1:pos2) + Hᵢⱼ .= Hᵢⱼ .+ Ψtmp + + mul!(Φtmp, uᵢ', qᵤ) # Φtmp = uᵢᴴ * qᵤ + mul!(qᵤ, uᵢ, Φtmp, α, β) # qᵤ = qᵤ - uᵢ * Φtmp + Fᵢⱼ = view(F,pos5:pos6,pos1:pos2) + Fᵢⱼ .= Fᵢⱼ .+ Φtmp + end + end + + Ψ = view(H, pos3:pos4,pos1:pos2) + reduced_qr!(qᵥ, Ψ, algo) + vⱼ₊₁ .= qᵥ + + Φ = view(F, pos3:pos4,pos1:pos2) + reduced_qr!(qᵤ, Φ, algo) + uⱼ₊₁ .= qᵤ + end + return V, Γ, H, U, Λ, F +end diff --git a/src/krylov_processes.jl b/src/krylov_processes.jl index ebcf7f513..1541d0ab9 100644 --- a/src/krylov_processes.jl +++ b/src/krylov_processes.jl @@ -30,19 +30,20 @@ function hermitian_lanczos(A, b::AbstractVector{FC}, k::Int) where FC <: FloatOr nzval = zeros(R, 3k-1) colptr[1] = 1 - rowval[1] = 1 - rowval[2] = 2 for i = 1:k + pos = colptr[i] colptr[i+1] = 3i - if i ≥ 2 - pos = colptr[i] + if i == 1 + rowval[pos] = i + rowval[pos+1] = i+1 + else rowval[pos] = i-1 rowval[pos+1] = i rowval[pos+2] = i+1 end end - local β₁ + β₁ = zero(R) V = M(undef, n, k+1) T = SparseMatrixCSC(k+1, k, colptr, rowval, nzval) @@ -98,6 +99,7 @@ end function nonhermitian_lanczos(A, b::AbstractVector{FC}, c::AbstractVector{FC}, k::Int) where FC <: FloatOrComplex m, n = size(A) Aᴴ = A' + R = real(FC) S = ktypeof(b) M = vector_to_matrix(S) @@ -107,19 +109,20 @@ function nonhermitian_lanczos(A, b::AbstractVector{FC}, c::AbstractVector{FC}, k nzval_Tᴴ = zeros(FC, 3k-1) colptr[1] = 1 - rowval[1] = 1 - rowval[2] = 2 for i = 1:k + pos = colptr[i] colptr[i+1] = 3i - if i ≥ 2 - pos = colptr[i] + if i == 1 + rowval[pos] = i + rowval[pos+1] = i+1 + else rowval[pos] = i-1 rowval[pos+1] = i rowval[pos+2] = i+1 end end - local β₁, γ₁ᴴ + β₁ = γ₁ᴴ = zero(R) V = M(undef, n, k+1) U = M(undef, n, k+1) T = SparseMatrixCSC(k+1, k, colptr, rowval, nzval_T) @@ -178,7 +181,7 @@ end * `b`: a vector of length n; * `k`: the number of iterations of the Arnoldi process. -#### Keyword arguments +#### Keyword argument * `reorthogonalization`: reorthogonalize the new vectors of the Krylov basis against all previous vectors. @@ -194,10 +197,11 @@ end """ function arnoldi(A, b::AbstractVector{FC}, k::Int; reorthogonalization::Bool=false) where FC <: FloatOrComplex m, n = size(A) + R = real(FC) S = ktypeof(b) M = vector_to_matrix(S) - local β + β = zero(R) V = M(undef, n, k+1) H = zeros(FC, k+1, k) @@ -261,16 +265,19 @@ function golub_kahan(A, b::AbstractVector{FC}, k::Int) where FC <: FloatOrComple nzval = zeros(R, 2k+1) colptr[1] = 1 - for i = 1:k + for i = 1:k+1 pos = colptr[i] - colptr[i+1] = pos+2 - rowval[pos] = i - rowval[pos+1] = i+1 + if i ≤ k + colptr[i+1] = pos + 2 + rowval[pos] = i + rowval[pos+1] = i+1 + else + colptr[i+1] = pos + 1 + rowval[pos] = i + end end - rowval[2k+1] = k+1 - colptr[k+2] = 2k+2 - local β₁ + β₁ = zero(R) V = M(undef, n, k+1) U = M(undef, m, k+1) L = SparseMatrixCSC(k+1, k+1, colptr, rowval, nzval) @@ -332,6 +339,7 @@ end function saunders_simon_yip(A, b::AbstractVector{FC}, c::AbstractVector{FC}, k::Int) where FC <: FloatOrComplex m, n = size(A) Aᴴ = A' + R = real(FC) S = ktypeof(b) M = vector_to_matrix(S) @@ -341,19 +349,20 @@ function saunders_simon_yip(A, b::AbstractVector{FC}, c::AbstractVector{FC}, k:: nzval_Tᴴ = zeros(FC, 3k-1) colptr[1] = 1 - rowval[1] = 1 - rowval[2] = 2 for i = 1:k + pos = colptr[i] colptr[i+1] = 3i - if i ≥ 2 - pos = colptr[i] + if i == 1 + rowval[pos] = i + rowval[pos+1] = i+1 + else rowval[pos] = i-1 rowval[pos+1] = i rowval[pos+2] = i+1 end end - local β₁, γ₁ᴴ + β₁ = γ₁ᴴ = zero(R) V = M(undef, m, k+1) U = M(undef, n, k+1) T = SparseMatrixCSC(k+1, k, colptr, rowval, nzval_T) @@ -412,7 +421,7 @@ end * `c`: a vector of length n; * `k`: the number of iterations of the Montoison-Orban process. -#### Keyword arguments +#### Keyword argument * `reorthogonalization`: reorthogonalize the new vectors of the Krylov basis against all previous vectors. @@ -431,10 +440,11 @@ end """ function montoison_orban(A, B, b::AbstractVector{FC}, c::AbstractVector{FC}, k::Int; reorthogonalization::Bool=false) where FC <: FloatOrComplex m, n = size(A) + R = real(FC) S = ktypeof(b) M = vector_to_matrix(S) - local β, γ + β = γ = zero(R) V = M(undef, m, k+1) U = M(undef, n, k+1) H = zeros(FC, k+1, k) diff --git a/src/krylov_utils.jl b/src/krylov_utils.jl index 7689c0f94..e06569c4a 100644 --- a/src/krylov_utils.jl +++ b/src/krylov_utils.jl @@ -322,6 +322,47 @@ kaxpby!(n :: Integer, s :: T, x :: AbstractVector{Complex{T}}, dx :: Integer, t kcopy!(n :: Integer, x :: Vector{T}, dx :: Integer, y :: Vector{T}, dy :: Integer) where T <: BLAS.BlasFloat = BLAS.blascopy!(n, x, dx, y, dy) kcopy!(n :: Integer, x :: AbstractVector{T}, dx :: Integer, y :: AbstractVector{T}, dy :: Integer) where T <: FloatOrComplex = copyto!(y, x) +# geqrf -- Computes the QR factorization of a general m-by-n matrix. +# orgqr -- Generates the real orthogonal matrix of the QR factorization formed by geqrf. +# ungqr -- Generates the complex unitary matrix of the QR factorization formed by geqrf. +# ormqr -- Multiplies a real matrix by the orthogonal matrix of the QR factorization formed by geqrf. +# unmqr -- Multiplies a complex matrix by the unitary matrix of the QR factorization formed by geqrf. + +# getrf -- Computes the LU factorization of a general m-by-n matrix. +# getrs -- Solves a system of linear equations with an LU-factored square matrix. +# getri -- Computes the inverse of an LU-factored general matrix. + +# sytrf -- Computes the Bunch-Kaufman factorization of a symmetric matrix. +# sytrs -- Solves a system of linear equations with an LDLᵀ-factored symmetric matrix. +# sytri -- Computes the inverse of a symmetric matrix using LDLᵀ Bunch-Kaufman factorization. + +# hetrf -- Computes the Bunch-Kaufman factorization of a complex Hermitian matrix. +# hetrs -- Solves a system of linear equations with an LDLᴴ-factored Hermitian matrix. +# hetri -- Computes the inverse of a complex Hermitian matrix using LDLᴴ Bunch-Kaufman factorization. + +# potrf -- Computes the Cholesky factorization of a symmetric (Hermitian) positive-definite matrix. +# potrs -- Solves a system of linear equations with a Cholesky-factored symmetric (Hermitian) positive-definite matrix. +# potri -- Computes the inverse of a Cholesky-factored symmetric (Hermitian) positive-definite matrix. + +# trtrs -- Solves a system of linear equations with a triangular matrix. +# trtri -- Computes the inverse of a triangular matrix. + +kgeqrf!(A :: Matrix{T}, tau :: Vector{T}) where T <: BLAS.BlasFloat = LAPACK.geqrf!(A, tau) +korgqr!(A :: Matrix{T}, tau :: Vector{T}) where T <: BLAS.BlasFloat = LAPACK.orgqr!(A, tau) +kormqr!(side :: Char, trans :: Char, A :: Matrix{T}, tau :: Vector{T}, C :: Matrix{T}) where T <: BLAS.BlasFloat = LAPACK.ormqr!(side, trans, A, tau, C) + +macro kgeqrf!(A, tau) + return esc(:(Krylov.kgeqrf!($A, $tau))) +end + +macro korgqr!(A, tau) + return esc(:(Krylov.korgqr!($A, $tau))) +end + +macro kormqr!(side, trans, A, tau, C) + return esc(:(Krylov.kormqr!($side, $trans, $A, $tau, $C))) +end + # the macros are just for readability, so we don't have to write the increments (always equal to 1) macro kdot(n, x, y) return esc(:(Krylov.kdot($n, $x, 1, $y, 1))) diff --git a/src/processes_utils.jl b/src/processes_utils.jl new file mode 100644 index 000000000..057cafb9d --- /dev/null +++ b/src/processes_utils.jl @@ -0,0 +1,187 @@ +# """ +# Gram-Schmidt orthogonalization for a reduced QR decomposition: +# Q, R = gs(A) +# +# Input : +# A an n-by-k matrix, n ≥ k +# +# Output : +# Q an n-by-k orthonormal matrix: QᴴQ = Iₖ +# R an k-by-k upper triangular matrix: QR = A +# """ +function gs(A::AbstractMatrix{FC}) where FC <: FloatOrComplex + n, k = size(A) + Q = copy(A) + R = zeros(FC, k, k) + v = zeros(FC, n) + gs!(Q, R, v) +end + +function gs!(Q::AbstractMatrix{FC}, R::AbstractMatrix{FC}, v::AbstractVector{FC}) where FC <: FloatOrComplex + n, k = size(Q) + aⱼ = v + R .= zero(FC) + for j = 1:k + qⱼ = view(Q,:,j) + aⱼ .= qⱼ + for i = 1:j-1 + qᵢ = view(Q,:,i) + R[i,j] = @kdot(n, qᵢ, aⱼ) # rᵢⱼ = ⟨qᵢ , aⱼ⟩ + @kaxpy!(n, -R[i,j], qᵢ, qⱼ) # qⱼ = qⱼ - rᵢⱼqᵢ + end + R[j,j] = @knrm2(n, qⱼ) # rⱼⱼ = ‖qⱼ‖ + qⱼ ./= R[j,j] # qⱼ = qⱼ / rⱼⱼ + end + return Q, R +end + +# """ +# Modified Gram-Schmidt orthogonalization for a reduced QR decomposition: +# Q, R = mgs(A) +# +# Input : +# A an n-by-k matrix, n ≥ k +# +# # Q an n-by-k orthonormal matrix: QᴴQ = Iₖ +# # R an k-by-k upper triangular matrix: QR = A +# """ +function mgs(A::AbstractMatrix{FC}) where FC <: FloatOrComplex + n, k = size(A) + Q = copy(A) + R = zeros(FC, k, k) + mgs!(Q, R) +end + +function mgs!(Q::AbstractMatrix{FC}, R::AbstractMatrix{FC}) where FC <: FloatOrComplex + n, k = size(Q) + R .= zero(FC) + for i = 1:k + qᵢ = view(Q,:,i) + R[i,i] = @knrm2(n, qᵢ) # rᵢᵢ = ‖qᵢ‖ + qᵢ ./= R[i,i] # qᵢ = qᵢ / rᵢᵢ + for j = i+1:k + qⱼ = view(Q,:,j) + R[i,j] = @kdot(n, qᵢ, qⱼ) # rᵢⱼ = ⟨qᵢ , qⱼ⟩ + @kaxpy!(n, -R[i,j], qᵢ, qⱼ) # qⱼ = qⱼ - rᵢⱼqᵢ + end + end + return Q, R +end + +# Reduced QR factorization with Householder reflections: +# Q, R = householder(A) +# +# Input : +# A an n-by-k matrix, n ≥ k +# +# Output : +# Q an n-by-k orthonormal matrix: QᴴQ = Iₖ +# R an k-by-k upper triangular matrix: QR = A +function householder(A::AbstractMatrix{FC}) where FC <: FloatOrComplex + n, k = size(A) + Q = copy(A) + τ = zeros(FC, k) + R = zeros(FC, k, k) + householder!(Q, R, τ) +end + +function householder!(Q::AbstractMatrix{FC}, R::AbstractMatrix{FC}, τ::AbstractVector{FC}) where FC <: FloatOrComplex + n, k = size(Q) + R .= zero(FC) + @kgeqrf!(Q, τ) + for i = 1:k + for j = i:k + R[i,j] = Q[i,j] + end + end + @korgqr!(Q, τ) + return Q, R +end + +# Reduced QR factorization with Givens reflections: +# Q, R = givens(A) +# +# Input : +# A an n-by-k matrix, n ≥ k +# +# # Q an n-by-k orthonormal matrix: QᴴQ = Iₖ +# # R an k-by-k upper triangular matrix: QR = A +# """ +function givens(A::AbstractMatrix{FC}) where FC <: FloatOrComplex + n, k = size(A) + nr = n*k - div(k*(k+1), 2) + T = real(FC) + Q = copy(A) + R = zeros(FC, k, k) + C = zeros(T, nr) + S = zeros(FC, nr) + givens!(Q, R, C, S) +end + +function givens!(Q::AbstractMatrix{FC}, R::AbstractMatrix{FC}, C::AbstractVector{T}, S::AbstractVector{FC}) where {T <: AbstractFloat, FC <: FloatOrComplex{T}} + n, k = size(Q) + R .= zero(FC) + pos = 0 + for j = 1:k + for i = n-1:-1:j + pos += 1 + C[pos], S[pos], Q[i,j] = sym_givens(Q[i,j], Q[i+1,j]) + if j < k + reflect!(view(Q, i, j+1:k), view(Q, i+1, j+1:k), C[pos], S[pos]) + end + end + end + for j = 1:k + for i = 1:j + R[i,j] = Q[i,j] + end + end + Q .= zero(FC) + for i = 1:k + Q[i,i] = one(FC) + end + for j = k:-1:1 + for i = j:n-1 + reflect!(view(Q, i, j:k), view(Q, i+1, j:k), C[pos], S[pos]) + pos -= 1 + end + end + return Q, R +end + +function reduced_qr!(Q::AbstractMatrix{FC}, R::AbstractMatrix{FC}, algo::String) where FC <: FloatOrComplex + n, k = size(Q) + T = real(FC) + if algo == "gs" + v = zeros(FC, n) + gs!(Q, R, v) + elseif algo == "mgs" + mgs!(Q, R) + elseif algo == "givens" + nr = n*k - div(k*(k+1), 2) + C = zeros(T, nr) + S = zeros(FC, nr) + givens!(Q, R, C, S) + elseif algo == "householder" + τ = zeros(FC, k) + householder!(Q, R, τ) + else + error("$algo is not a supported method to perform a reduced QR.") + end + return Q, R +end + +function reduced_qr(A::AbstractMatrix{FC}, algo::String) where FC <: FloatOrComplex + if algo == "gs" + Q, R = gs(A) + elseif algo == "mgs" + Q, R = mgs(A) + elseif algo == "givens" + Q, R = givens(A) + elseif algo == "householder" + Q, R = householder(A) + else + error("$algo is not a supported method to perform a reduced QR.") + end + return Q, R +end diff --git a/test/runtests.jl b/test/runtests.jl index e2fdd6088..040d2edd1 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,9 +1,12 @@ using Krylov, LinearAlgebra, SparseArrays, Printf, Random, Test import Krylov.KRYLOV_SOLVERS +Random.seed!(666) + include("test_utils.jl") include("test_aux.jl") include("test_stats.jl") +include("test_block_processes.jl") include("test_processes.jl") include("test_minares.jl") diff --git a/test/test_block_processes.jl b/test/test_block_processes.jl new file mode 100644 index 000000000..2297dc238 --- /dev/null +++ b/test/test_block_processes.jl @@ -0,0 +1,234 @@ +@testset "Block processes" begin + m = 250 + n = 500 + k = 20 + p = 4 + s = 4 + verbose = false + + for FC in (Float64, ComplexF64) + nbits_FC = sizeof(FC) + nbits_I = sizeof(Int) + + @testset "Data Type: $FC" begin + + @testset "Block Hermitian Lanczos" begin + A = rand(FC, n, n) + A = A' * A + B = rand(FC, n, p) + + @testset "algo = $algo" for algo in ("householder", "gs", "mgs", "givens") + V, Ψ₁, T = hermitian_lanczos(A, B, k; algo) + + @test norm(V[:,1:p*s]' * V[:,1:p*s] - I) ≤ 1e-4 + @test V[:,1:p] * Ψ₁ ≈ B + @test A * V[:,1:p*k] ≈ V * T + end + + @testset "memory" begin + function storage_block_hermitian_lanczos_bytes(n, p, k) + nnzT = p*p + (k-1)*p*(2*p+1) + div(p*(p+1), 2) + memory = 0 + memory += (p*k+1) * nbits_I # Tₖ₊₁.ₖ -- colptr + memory += nnzT * nbits_I # Tₖ₊₁.ₖ -- rowval + memory += nnzT * nbits_FC # Tₖ₊₁.ₖ -- nzval + memory += p*p * nbits_FC # Ψ₁ + memory += p*p * nbits_FC # Ωᵢ, Ψᵢ and Ψᵢ₊₁ + memory += n*p * (k+1) * nbits_FC # Vₖ₊₁ + memory += p*n * nbits_FC # q + return memory + end + + expected_block_hermitian_lanczos_bytes = storage_block_hermitian_lanczos_bytes(n, p, k) + actual_block_hermitian_lanczos_bytes = @allocated hermitian_lanczos(A, B, k; algo="mgs") + verbose && println("Block Hermitian Lanczos | $FC") + verbose && println(expected_block_hermitian_lanczos_bytes, " ≤ ", actual_block_hermitian_lanczos_bytes, " ≤ ", 1.02 * expected_block_hermitian_lanczos_bytes, " ?") + verbose && println() + @test expected_block_hermitian_lanczos_bytes ≤ actual_block_hermitian_lanczos_bytes ≤ 1.02 * expected_block_hermitian_lanczos_bytes + end + end + + @testset "Block Non-Hermitian Lanczos" begin + A = rand(FC, n, n) + B = rand(FC, n, p) + C = rand(FC, n, p) + V, Ψ₁, T, U, Φ₁ᴴ, Tᴴ = nonhermitian_lanczos(A, B, C, k) + + @test norm(V[:,1:p*s]' * U[:,1:p*s] - I) ≤ 1e-4 + @test norm(U[:,1:p*s]' * V[:,1:p*s] - I) ≤ 1e-4 + @test V[:,1:p] * Ψ₁ ≈ B + @test U[:,1:p] * Φ₁ᴴ ≈ C + @test T[1:k*p,1:k*p] ≈ Tᴴ[1:k*p,1:k*p]' + @test A * V[:,1:k*p] ≈ V * T + @test A' * U[:,1:k*p] ≈ U * Tᴴ + + @testset "memory" begin + # storage_block_nonhermitian_lanczos_bytes(n, p, k) = ... + # + # expected_block_nonhermitian_lanczos_bytes = storage_block_nonhermitian_lanczos_bytes(n, p, k) + # actual_block_nonhermitian_lanczos_bytes = @allocated nonhermitian_lanczos(A, B, C, k) + # verbose && println("Block non-Hermitian Lanczos") + # verbose && println(expected_block_nonhermitian_lanczos_bytes, " ≤ ", actual_block_nonhermitian_lanczos_bytes, " ≤ ", 1.02 * expected_block_nonhermitian_lanczos_bytes, " ?") + # verbose && println() + # @test expected_block_nonhermitian_lanczos_bytes ≤ actual_block_nonhermitian_lanczos_bytes ≤ 1.02 * expected_block_nonhermitian_lanczos_bytes + end + end + + @testset "Block Arnoldi" begin + A = rand(FC, n, n) + B = rand(FC, n, p) + + @testset "algo = $algo" for algo in ("householder", "gs", "mgs", "givens") + @testset "reorthogonalization = $reorthogonalization" for reorthogonalization in (false, true) + V, Γ, H = arnoldi(A, B, k; algo, reorthogonalization) + + @test norm(V[:,1:p*s]' * V[:,1:p*s] - I) ≤ 1e-4 + @test V[:,1:p] * Γ ≈ B + @test A * V[:,1:p*k] ≈ V * H + end + end + + @testset "memory -- reorthogonalization = $reorthogonalization" for reorthogonalization in (false, true) + function storage_block_arnoldi_bytes(n, p, k) + memory = 0 + memory += p*n * (k+1) * nbits_FC # Vₖ₊₁ + memory += n*p * nbits_FC # q + memory += p*k * p*(k+1) * nbits_FC # Hₖ₊₁.ₖ + memory += p*p * nbits_FC # Γ + memory += p*p * nbits_FC # Ψᵢⱼ and Ψtmp + return memory + end + + expected_block_arnoldi_bytes = storage_block_arnoldi_bytes(n, p, k) + actual_block_arnoldi_bytes = @allocated arnoldi(A, B, k; algo="mgs", reorthogonalization) + verbose && println("Block Arnoldi | $FC") + verbose && println(expected_block_arnoldi_bytes, " ≤ ", actual_block_arnoldi_bytes, " ≤ ", 1.02 * expected_block_arnoldi_bytes, " ?") + verbose && println() + @test expected_block_arnoldi_bytes ≤ actual_block_arnoldi_bytes ≤ 1.02 * expected_block_arnoldi_bytes + end + end + + @testset "Block Golub-Kahan" begin + A = rand(FC, m, n) + B = rand(FC, m, p) + + @testset "algo = $algo" for algo in ("householder", "gs", "mgs", "givens") + V, U, Ψ₁, L = golub_kahan(A, B, k; algo) + BL = L[1:(k+1)*p,1:k*p] + + @test norm(V[:,1:p*s]' * V[:,1:p*s] - I) ≤ 1e-4 + @test norm(U[:,1:p*s]' * U[:,1:p*s] - I) ≤ 1e-4 + @test U[:,1:p] * Ψ₁ ≈ B + @test A * V[:,1:k*p] ≈ U * BL + @test A' * U ≈ V * L' + @test A' * A * V[:,1:k*p] ≈ V * L' * BL + @test A * A' * U[:,1:k*p] ≈ U * BL * L[1:k*p,1:k*p]' + end + + @testset "memory" begin + function storage_block_golub_kahan_bytes(m, n, p, k) + nnzL = p*k*(p+1) + div(p*(p+1), 2) + memory = 0 + memory += (p*(k+1)+1) * nbits_I # Lₖ₊₁ -- colptr + memory += nnzL * nbits_I # Lₖ₊₁ -- rowval + memory += nnzL * nbits_FC # Lₖ₊₁ -- nzval + memory += p*p * nbits_FC # Ψ₁ + memory += p*p * nbits_FC # Ψᵢ₊₁, TΩᵢ and TΩᵢ₊₁ + memory += p*(n+m) * (k+1) * nbits_FC # Vₖ₊₁ and Uₖ₊₁ + memory += p*(n+m) * nbits_FC # qᵥ and qᵤ + return memory + end + + expected_block_golub_kahan_bytes = storage_block_golub_kahan_bytes(m, n, p, k) + actual_block_golub_kahan_bytes = @allocated golub_kahan(A, B, k; algo="mgs") + verbose && println("Block Golub-Kahan | $FC") + verbose && println(expected_block_golub_kahan_bytes, " ≤ ", actual_block_golub_kahan_bytes, " ≤ ", 1.02 * expected_block_golub_kahan_bytes, " ?") + verbose && println() + @test expected_block_golub_kahan_bytes ≤ actual_block_golub_kahan_bytes ≤ 1.02 * expected_block_golub_kahan_bytes + end + end + + @testset "Block Saunders-Simon-Yip" begin + A = rand(FC, m, n) + B = rand(FC, m, p) + C = rand(FC, n, p) + + @testset "algo = $algo" for algo in ("householder", "gs", "mgs", "givens") + V, Ψ₁, T, U, Φ₁ᴴ, Tᴴ = saunders_simon_yip(A, B, C, k; algo) + + @test norm(V[:,1:p*s]' * V[:,1:p*s] - I) ≤ 1e-4 + @test norm(U[:,1:p*s]' * U[:,1:p*s] - I) ≤ 1e-4 + @test V[:,1:p] * Ψ₁ ≈ B + @test U[:,1:p] * Φ₁ᴴ ≈ C + @test T[1:k*p,1:k*p] ≈ Tᴴ[1:k*p,1:k*p]' + @test A * U[:,1:k*p] ≈ V * T + @test A' * V[:,1:k*p] ≈ U * Tᴴ + @test A' * A * U[:,1:(k-1)*p] ≈ U * Tᴴ * T[1:k*p,1:(k-1)*p] + @test A * A' * V[:,1:(k-1)*p] ≈ V * T * Tᴴ[1:k*p,1:(k-1)*p] + end + + @testset "memory" begin + function storage_block_saunders_simon_yip_bytes(m, n, p, k) + nnzT = p*p + (k-1)*p*(2*p+1) + div(p*(p+1), 2) + memory = 0 + memory += (p*k+1) * nbits_I # Tₖ₊₁.ₖ and (Tₖ.ₖ₊₁)ᴴ -- colptr + memory += nnzT * nbits_I # Tₖ₊₁.ₖ and (Tₖ.ₖ₊₁)ᴴ -- rowval + memory += 2 * nnzT * nbits_FC # Tₖ₊₁.ₖ and (Tₖ.ₖ₊₁)ᴴ -- nzval + memory += 2 * p*p * nbits_FC # Ψ₁ and Φ₁ᴴ + memory += p*p * nbits_FC # Ωᵢ, Ψᵢ, Ψᵢ₊₁, TΦᵢ and TΦᵢ₊₁ + memory += p*(n+m) * (k+1) * nbits_FC # Vₖ₊₁ and Uₖ₊₁ + memory += p*(n+m) * nbits_FC # qᵥ and qᵤ + end + + expected_block_saunders_simon_yip_bytes = storage_block_saunders_simon_yip_bytes(m, n, p, k) + actual_block_saunders_simon_yip_bytes = @allocated saunders_simon_yip(A, B, C, k; algo="mgs") + verbose && println("Block Saunders-Simon-Yip") + verbose && println(expected_block_saunders_simon_yip_bytes, " ≤ ", actual_block_saunders_simon_yip_bytes, " ≤ ", 1.02 * expected_block_saunders_simon_yip_bytes, " ?") + verbose && println() + @test expected_block_saunders_simon_yip_bytes ≤ actual_block_saunders_simon_yip_bytes ≤ 1.02 * expected_block_saunders_simon_yip_bytes + end + end + + @testset "Block Montoison-Orban" begin + A = rand(FC, m, n) + B = rand(FC, n, m) + D = rand(FC, m, p) + C = rand(FC, n, p) + + @testset "algo = $algo" for algo in ("householder", "gs", "mgs", "givens") + @testset "reorthogonalization = $reorthogonalization" for reorthogonalization in (false, true) + V, Γ, H, U, Λ, F = montoison_orban(A, B, D, C, k; algo, reorthogonalization) + + @test norm(V[:,1:p*s]' * V[:,1:p*s] - I) ≤ 1e-4 + @test norm(U[:,1:p*s]' * U[:,1:p*s] - I) ≤ 1e-4 + @test V[:,1:p] * Γ ≈ D + @test U[:,1:p] * Λ ≈ C + @test A * U[:,1:k*p] ≈ V * H + @test B * V[:,1:k*p] ≈ U * F + @test B * A * U[:,1:(k-1)*p] ≈ U * F * H[1:k*p,1:(k-1)*p] + @test A * B * V[:,1:(k-1)*p] ≈ V * H * F[1:k*p,1:(k-1)*p] + end + end + + @testset "memory -- reorthogonalization = $reorthogonalization" for reorthogonalization in (false, true) + function storage_block_montoison_orban_bytes(m, n, p, k) + memory = 0 + memory += 2 * p*p * nbits_FC # Γ and Λ + memory += p*p * nbits_FC # Ψᵢⱼ, Φᵢⱼ, Ψtmp and Φtmp + memory += 2 * p*k * p*(k+1) * nbits_FC # Hₖ₊₁.ₖ and Fₖ₊₁.ₖ + memory += p*(n+m) * (k+1) * nbits_FC # Vₖ₊₁ and Uₖ₊₁ + memory += p*(n+m)* nbits_FC # qᵥ and qᵤ + return memory + end + + expected_block_montoison_orban_bytes = storage_block_montoison_orban_bytes(m, n, p, k) + actual_block_montoison_orban_bytes = @allocated montoison_orban(A, B, D, C, k; algo="mgs", reorthogonalization) + verbose && println("Block Montoison-Orban | $FC") + verbose && println(expected_block_montoison_orban_bytes, " ≤ ", actual_block_montoison_orban_bytes, " ≤ ", 1.02 * expected_block_montoison_orban_bytes, " ?") + verbose && println() + @test expected_block_montoison_orban_bytes ≤ actual_block_montoison_orban_bytes ≤ 1.02 * expected_block_montoison_orban_bytes + end + end + end + end +end diff --git a/test/test_cr.jl b/test/test_cr.jl index 5ae0774bd..45366ce45 100644 --- a/test/test_cr.jl +++ b/test/test_cr.jl @@ -23,7 +23,6 @@ # Sparse Laplacian A, _ = sparse_laplacian(FC=FC) - Random.seed!(0) b = randn(size(A, 1)) itmax = 0 # case: ‖x*‖ > Δ diff --git a/test/test_processes.jl b/test/test_processes.jl index 237f70ce5..05c0f4162 100644 --- a/test/test_processes.jl +++ b/test/test_processes.jl @@ -20,7 +20,7 @@ end n = 500 k = 20 s = 5 - + for FC in (Float64, ComplexF64) R = real(FC) nbits_FC = sizeof(FC) @@ -28,7 +28,7 @@ end nbits_I = sizeof(Int) @testset "Data Type: $FC" begin - + @testset "Hermitian Lanczos" begin A = rand(FC, n, n) A = A' * A