diff --git a/README.md b/README.md
index 86857dbf3..2d2abf35b 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 # Krylov.jl: A Julia basket of hand-picked Krylov methods
 
 | **Documentation** | **CI** | **Coverage** | **DOI** | **Downloads** |
-|:-----------------:|:-------------------------------:|:------------:|:-------:|:-------------:|
+|:-----------------:|:------:|:------------:|:-------:|:-------------:|
 | [![docs-stable][docs-stable-img]][docs-stable-url] [![docs-dev][docs-dev-img]][docs-dev-url] | [![build-gh][build-gh-img]][build-gh-url] [![build-cirrus][build-cirrus-img]][build-cirrus-url] | [![codecov][codecov-img]][codecov-url] | [![doi][doi-img]][doi-url] | [![downloads][downloads-img]][downloads-url] |
 
 [docs-stable-img]: https://img.shields.io/badge/docs-stable-blue.svg
diff --git a/docs/make.jl b/docs/make.jl
index 132b8bbd2..0595a9398 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -12,6 +12,7 @@ makedocs(
   pages = ["Home" => "index.md",
            "API" => "api.md",
            "Krylov processes" => "processes.md",
+           "Block Krylov processes" => "block_processes.md",
            "Krylov methods" => ["Hermitian positive definite linear systems" => "solvers/spd.md",
                                 "Hermitian indefinite linear systems" => "solvers/sid.md",
                                 "Non-Hermitian square linear systems" => "solvers/unsymmetric.md",
diff --git a/docs/src/block_processes.md b/docs/src/block_processes.md
new file mode 100644
index 000000000..d6e3e5995
--- /dev/null
+++ b/docs/src/block_processes.md
@@ -0,0 +1,227 @@
+# [Block Krylov processes](@id block-krylov-processes)
+
+## [Block Hermitian Lanczos](@id block-hermitian-lanczos)
+
+If the vector $b$ in the [Hermitian Lanczos](@ref hermitian-lanczos) process is replaced by a matrix $B$ with $p$ columns, we can derive the block Hermitian Lanczos process.
+
+![block_hermitian_lanczos](./graphics/block_hermitian_lanczos.png)
+
+After $k$ iterations of the block Hermitian Lanczos process, the situation may be summarized as
+```math
+\begin{align*}
+  A V_k &= V_{k+1} T_{k+1,k}, \\
+  V_k^H V_k &= I_{pk},
+\end{align*}
+```
+where $V_k$ is an orthonormal basis of the block Krylov subspace $\mathcal{K}_k^{\square}(A,B)$,
+```math
+T_{k+1,k} =
+\begin{bmatrix}
+  \Omega_1 & \Psi_2^H &        &          \\
+  \Psi_2   & \Omega_2 & \ddots &          \\
+           & \ddots   & \ddots & \Psi_k^H \\
+           &          & \Psi_k & \Omega_k \\
+           &          &        & \Psi_{k+1}
+\end{bmatrix}.
+```
+
+The function [`hermitian_lanczos`](@ref hermitian_lanczos(::Any, ::AbstractMatrix{FC}, ::Int) where FC <: (Union{Complex{T}, T} where T <: AbstractFloat)) returns $V_{k+1}$, $\Psi_1$ and $T_{k+1,k}$.
+
+```@docs
+hermitian_lanczos(::Any, ::AbstractMatrix{FC}, ::Int) where FC <: (Union{Complex{T}, T} where T <: AbstractFloat)
+```
+
+## [Block Non-Hermitian Lanczos](@id block-nonhermitian-lanczos)
+
+If the vectors $b$ and $c$ in the [non-Hermitian Lanczos](@ref nonhermitian-lanczos) process are replaced by matrices $B$ and $C$ with both $p$ columns, we can derive the block non-Hermitian Lanczos process.
+
+![block_nonhermitian_lanczos](./graphics/block_nonhermitian_lanczos.png)
+
+After $k$ iterations of the block non-Hermitian Lanczos process, the situation may be summarized as
+```math
+\begin{align*}
+  A V_k &= V_{k+1} T_{k+1,k}, \\
+  A^H U_k &= U_{k+1} T_{k,k+1}^H, \\
+  V_k^H U_k &= U_k^H V_k = I_{pk},
+\end{align*}
+```
+where $V_k$ and $U_k$ are bases of the block Krylov subspaces $\mathcal{K}^{\square}_k(A,B)$ and $\mathcal{K}^{\square}_k (A^H,C)$, respectively,
+```math
+T_{k+1,k} =
+\begin{bmatrix}
+  \Omega_1 & \Phi_2   &        &          \\
+  \Psi_2   & \Omega_2 & \ddots &          \\
+           & \ddots   & \ddots & \Phi_k   \\
+           &          & \Psi_k & \Omega_k \\
+           &          &        & \Psi_{k+1}
+\end{bmatrix}
+, \qquad
+T_{k,k+1}^H =
+\begin{bmatrix}
+  \Omega_1^H & \Psi_2^H   &          &            \\
+  \Phi_2^H   & \Omega_2^H & \ddots   &            \\
+             & \ddots     & \ddots   & \Psi_k^H   \\
+             &            & \Phi_k^H & \Omega_k^H \\
+             &            &          & \Phi_{k+1}^H
+\end{bmatrix}.
+```
+
+The function [`nonhermitian_lanczos`](@ref nonhermitian_lanczos(::Any, ::AbstractMatrix{FC}, ::AbstractMatrix{FC}, ::Int) where FC <: (Union{Complex{T}, T} where T <: AbstractFloat)) returns $V_{k+1}$, $\Psi_1$, $T_{k+1,k}$, $U_{k+1}$ $\Phi_1^H$ and $T_{k,k+1}^H$.
+
+```@docs
+nonhermitian_lanczos(::Any, ::AbstractMatrix{FC}, ::AbstractMatrix{FC}, ::Int) where FC <: (Union{Complex{T}, T} where T <: AbstractFloat)
+```
+
+## [Block Arnoldi](@id block-arnoldi)
+
+If the vector $b$ in the [Arnoldi](@ref arnoldi) process is replaced by a matrix $B$ with $p$ columns, we can derive the block Arnoldi process.
+
+![block_arnoldi](./graphics/block_arnoldi.png)
+
+After $k$ iterations of the block Arnoldi process, the situation may be summarized as
+```math
+\begin{align*}
+  A V_k &= V_{k+1} H_{k+1,k}, \\
+  V_k^H V_k &= I_{pk},
+\end{align*}
+```
+where $V_k$ is an orthonormal basis of the block Krylov subspace $\mathcal{K}_k^{\square}(A,B)$,
+```math
+H_{k+1,k} =
+\begin{bmatrix}
+  \Psi_{1,1}~ & \Psi_{1,2}~ & \ldots       & \Psi_{1,k}   \\
+  \Psi_{2,1}~ & \ddots~     & \ddots       & \vdots       \\
+              & \ddots~     & \ddots       & \Psi_{k-1,k} \\
+              &             & \Psi_{k,k-1} & \Psi_{k,k}   \\
+              &             &              & \Psi_{k+1,k}
+\end{bmatrix}.
+```
+
+The function [`arnoldi`](@ref arnoldi(::Any, ::AbstractMatrix{FC}, ::Int) where FC <: (Union{Complex{T}, T} where T <: AbstractFloat)) returns $V_{k+1}$, $\Gamma$, and $H_{k+1,k}$.
+
+```@docs
+arnoldi(::Any, ::AbstractMatrix{FC}, ::Int) where FC <: (Union{Complex{T}, T} where T <: AbstractFloat)
+```
+
+## [Block Golub-Kahan](@id block-golub-kahan)
+
+If the vector $b$ in the [Golub-Kahan](@ref golub-kahan) process is replaced by a matrix $B$ with $p$ columns, we can derive the block Golub-Kahan process.
+
+![block_golub_kahan](./graphics/block_golub_kahan.png)
+
+After $k$ iterations of the block Golub-Kahan process, the situation may be summarized as
+```math
+\begin{align*}
+  A V_k &= U_{k+1} B_k, \\
+  A^H U_{k+1} &= V_{k+1} L_{k+1}^H, \\
+  V_k^H V_k &= U_k^H U_k = I_{pk},
+\end{align*}
+```
+where $V_k$ and $U_k$ are bases of the block Krylov subspaces $\mathcal{K}_k^{\square}(A^HA,A^HB)$ and $\mathcal{K}_k^{\square}(AA^H,B)$, respectively,
+```math
+B_k =
+\begin{bmatrix}
+  \Omega_1 &          &        &            \\
+  \Psi_2   & \Omega_2 &        &            \\
+           & \ddots   & \ddots &            \\
+           &          & \Psi_k & \Omega_k   \\
+           &          &        & \Psi_{k+1} \\
+\end{bmatrix}
+, \qquad
+L_{k+1}^H =
+\begin{bmatrix}
+  \Omega_1^H & \Psi_2^H   &        &            &                \\
+             & \Omega_2^H & \ddots &            &                \\
+             &            & \ddots & \Psi_k^H   &                \\
+             &            &        & \Omega_k^H & \Psi_{k+1}^H   \\
+             &            &        &            & \Omega_{k+1}^H \\
+\end{bmatrix}.
+```
+
+The function [`golub_kahan`](@ref golub_kahan(::Any, ::AbstractMatrix{FC}, ::Int) where FC <: (Union{Complex{T}, T} where T <: AbstractFloat)) returns $V_{k+1}$, $U_{k+1}$, $\Psi_1$ and $L_{k+1}$.
+
+```@docs
+golub_kahan(::Any, ::AbstractMatrix{FC}, ::Int) where FC <: (Union{Complex{T}, T} where T <: AbstractFloat)
+```
+
+## [Block Saunders-Simon-Yip](@id block-saunders-simon-yip)
+
+If the vectors $b$ and $c$ in the [Saunders-Simon-Yip](@ref saunders-simon-yip) process are replaced by matrices $B$ and $C$ with both $p$ columns, we can derive the block Saunders-Simon-Yip process.
+
+![block_saunders_simon_yip](./graphics/block_saunders_simon_yip.png)
+
+After $k$ iterations of the block Saunders-Simon-Yip process, the situation may be summarized as
+```math
+\begin{align*}
+  A U_k &= V_{k+1} T_{k+1,k}, \\
+  A^H V_k &= U_{k+1} T_{k,k+1}^H, \\
+  V_k^H V_k &= U_k^H U_k = I_{pk},
+\end{align*}
+```
+where $\begin{bmatrix} V_k & 0 \\ 0 & U_k \end{bmatrix}$ is an orthonormal basis of the block Krylov subspace $\mathcal{K}^{\square}_k \left(\begin{bmatrix} 0 & A \\ A^H & 0 \end{bmatrix}, \begin{bmatrix} B & 0 \\ 0 & C \end{bmatrix}\right)$,
+```math
+T_{k+1,k} =
+\begin{bmatrix}
+  \Omega_1 & \Phi_2   &        &          \\
+  \Psi_2   & \Omega_2 & \ddots &          \\
+           & \ddots   & \ddots & \Phi_k   \\
+           &          & \Psi_k & \Omega_k \\
+           &          &        & \Psi_{k+1}
+\end{bmatrix}
+, \qquad
+T_{k,k+1}^H =
+\begin{bmatrix}
+  \Omega_1^H & \Psi_2^H   &          &            \\
+  \Phi_2^H   & \Omega_2^H & \ddots   &            \\
+             & \ddots     & \ddots   & \Psi_k^H   \\
+             &            & \Phi_k^H & \Omega_k^H \\
+             &            &          & \Phi_{k+1}^H
+\end{bmatrix}.
+```
+
+The function [`saunders_simon_yip`](@ref saunders_simon_yip(::Any, ::AbstractMatrix{FC}, ::AbstractMatrix{FC}, ::Int) where FC <: (Union{Complex{T}, T} where T <: AbstractFloat)) returns $V_{k+1}$, $\Psi_1$, $T_{k+1,k}$, $U_{k+1}$, $\Phi_1^H$ and $T_{k,k+1}^H$.
+
+```@docs
+saunders_simon_yip(::Any, ::AbstractMatrix{FC}, ::AbstractMatrix{FC}, ::Int) where FC <: (Union{Complex{T}, T} where T <: AbstractFloat)
+```
+
+## [Block Montoison-Orban](@id block-montoison-orban)
+
+If the vectors $b$ and $c$ in the [Montoison-Orban](@ref montoison-orban) process are replaced by matrices $D$ and $C$ with both $p$ columns, we can derive the block Montoison-Orban process.
+
+![block_montoison_orban](./graphics/block_montoison_orban.png)
+
+After $k$ iterations of the block Montoison-Orban process, the situation may be summarized as
+```math
+\begin{align*}
+  A U_k &= V_{k+1} H_{k+1,k}, \\
+  B V_k &= U_{k+1} F_{k+1,k}, \\
+  V_k^H V_k &= U_k^H U_k = I_{pk},
+\end{align*}
+```
+where $\begin{bmatrix} V_k & 0 \\ 0 & U_k \end{bmatrix}$ is an orthonormal basis of the block Krylov subspace $\mathcal{K}^{\square}_k \left(\begin{bmatrix} 0 & A \\ B & 0 \end{bmatrix}, \begin{bmatrix} D & 0 \\ 0 & C \end{bmatrix}\right)$,
+```math
+H_{k+1,k} =
+\begin{bmatrix}
+  \Psi_{1,1}~ & \Psi_{1,2}~ & \ldots       & \Psi_{1,k}   \\
+  \Psi_{2,1}~ & \ddots~     & \ddots       & \vdots       \\
+              & \ddots~     & \ddots       & \Psi_{k-1,k} \\
+              &             & \Psi_{k,k-1} & \Psi_{k,k}   \\
+              &             &              & \Psi_{k+1,k}
+\end{bmatrix}
+, \qquad
+F_{k+1,k} =
+\begin{bmatrix}
+  \Phi_{1,1}~ & \Phi_{1,2}~ & \ldots       & \Phi_{1,k}   \\
+  \Phi_{2,1}~ & \ddots~     & \ddots       & \vdots       \\
+              & \ddots~     & \ddots       & \Phi_{k-1,k} \\
+              &             & \Phi_{k,k-1} & \Phi_{k,k}   \\
+              &             &              & \Phi_{k+1,k}
+\end{bmatrix}.
+```
+
+The function [`montoison_orban`](@ref montoison_orban(::Any, ::Any, ::AbstractMatrix{FC}, ::AbstractMatrix{FC}, ::Int) where FC <: (Union{Complex{T}, T} where T <: AbstractFloat)) returns $V_{k+1}$, $\Gamma$, $H_{k+1,k}$, $U_{k+1}$, $\Lambda$, and $F_{k+1,k}$.
+
+```@docs
+montoison_orban(::Any, ::Any, ::AbstractMatrix{FC}, ::AbstractMatrix{FC}, ::Int) where FC <: (Union{Complex{T}, T} where T <: AbstractFloat)
+```
diff --git a/docs/src/graphics/block_arnoldi.png b/docs/src/graphics/block_arnoldi.png
new file mode 100644
index 000000000..5248046ea
Binary files /dev/null and b/docs/src/graphics/block_arnoldi.png differ
diff --git a/docs/src/graphics/block_golub_kahan.png b/docs/src/graphics/block_golub_kahan.png
new file mode 100644
index 000000000..e749edc54
Binary files /dev/null and b/docs/src/graphics/block_golub_kahan.png differ
diff --git a/docs/src/graphics/block_hermitian_lanczos.png b/docs/src/graphics/block_hermitian_lanczos.png
new file mode 100644
index 000000000..d16647a84
Binary files /dev/null and b/docs/src/graphics/block_hermitian_lanczos.png differ
diff --git a/docs/src/graphics/block_montoison_orban.png b/docs/src/graphics/block_montoison_orban.png
new file mode 100644
index 000000000..48d4d754c
Binary files /dev/null and b/docs/src/graphics/block_montoison_orban.png differ
diff --git a/docs/src/graphics/block_nonhermitian_lanczos.png b/docs/src/graphics/block_nonhermitian_lanczos.png
new file mode 100644
index 000000000..a18052224
Binary files /dev/null and b/docs/src/graphics/block_nonhermitian_lanczos.png differ
diff --git a/docs/src/graphics/block_saunders_simon_yip.png b/docs/src/graphics/block_saunders_simon_yip.png
new file mode 100644
index 000000000..96dcc4615
Binary files /dev/null and b/docs/src/graphics/block_saunders_simon_yip.png differ
diff --git a/src/Krylov.jl b/src/Krylov.jl
index 05069d14b..e2eb70952 100644
--- a/src/Krylov.jl
+++ b/src/Krylov.jl
@@ -5,7 +5,10 @@ using LinearAlgebra, SparseArrays, Printf
 include("krylov_utils.jl")
 include("krylov_stats.jl")
 include("krylov_solvers.jl")
+
+include("processes_utils.jl")
 include("krylov_processes.jl")
+include("block_krylov_processes.jl")
 
 include("cg.jl")
 include("cr.jl")
diff --git a/src/block_krylov_processes.jl b/src/block_krylov_processes.jl
new file mode 100644
index 000000000..3848de063
--- /dev/null
+++ b/src/block_krylov_processes.jl
@@ -0,0 +1,666 @@
+"""
+    V, Ψ, T = hermitian_lanczos(A, B, k; algo="householder")
+
+#### Input arguments
+
+* `A`: a linear operator that models an Hermitian matrix of dimension n;
+* `B`: a matrix of size n × p;
+* `k`: the number of iterations of the block Hermitian Lanczos process.
+
+#### Keyword arguments
+
+* `algo`: the algorithm to perform reduced QR factorizations ("gs", "mgs", "givens" or "householder").
+
+#### Output arguments
+
+* `V`: a dense n × p(k+1) matrix;
+* `Ψ`: a dense p × p upper triangular matrix such that V₁Ψ = B;
+* `T`: a sparse p(k+1) × pk block tridiagonal matrix with a bandwidth p.
+"""
+function hermitian_lanczos(A, B::AbstractMatrix{FC}, k::Int; algo::String="householder") where FC <: FloatOrComplex
+  m, n = size(A)
+  t, p = size(B)
+
+  nnzT = p*p + (k-1)*p*(2*p+1) + div(p*(p+1), 2)
+  colptr = zeros(Int, p*k+1)
+  rowval = zeros(Int, nnzT)
+  nzval = zeros(FC, nnzT)
+
+  colptr[1] = 1
+  for j = 1:k*p
+    pos = colptr[j]
+    for i = max(1, j-p):j+p
+      rowval[pos] = i
+      pos += 1
+    end
+    colptr[j+1] = pos
+  end
+
+  V = zeros(FC, n, (k+1)*p)
+  T = SparseMatrixCSC((k+1)*p, k*p, colptr, rowval, nzval)
+
+  α = -one(FC)
+  β = one(FC)
+  q = zeros(FC, n, p)
+  ψ₁ = zeros(FC, p, p)
+  Ωᵢ = Ψᵢ = Ψᵢ₊₁ = zeros(FC, p, p)
+
+  for i = 1:k
+    pos1 = (i-1)*p + 1
+    pos2 = i*p
+    pos3 = pos1 + p
+    pos4 = pos2 + p
+    vᵢ = view(V,:,pos1:pos2)
+    vᵢ₊₁ = view(V,:,pos3:pos4)
+
+    if i == 1
+      q .= B
+      reduced_qr!(q, ψ₁, algo)
+      vᵢ .= q
+    end
+
+    mul!(q, A, vᵢ)
+
+    if i ≥ 2
+      pos5 = pos1 - p
+      pos6 = pos2 - p
+      vᵢ₋₁ = view(V,:,pos5:pos6)
+      mul!(q, vᵢ₋₁, Ψᵢ', α, β)  # q = q - vᵢ₋₁ * Ψᵢᴴ
+    end
+
+    mul!(Ωᵢ, vᵢ', q)       # Ωᵢ = vᵢᴴ * q
+    mul!(q, vᵢ, Ωᵢ, α, β)  # q = q - vᵢ * Ωᵢᴴ
+
+    # Store the block Ωᵢ in Tₖ₊₁.ₖ
+    for ii = 1:p
+      indi = pos1+ii-1
+      for jj = 1:p
+        indj = pos1+jj-1
+        T[indi,indj] = Ωᵢ[ii,jj]
+      end
+    end
+
+    reduced_qr!(q, Ψᵢ₊₁, algo)
+    vᵢ₊₁ .= q
+
+    # Store the block Ψᵢ₊₁ in Tₖ₊₁.ₖ
+    for ii = 1:p
+      indi = pos3+ii-1
+      for jj = 1:p
+        indj = pos1+jj-1
+        (ii ≤ jj) && (T[indi,indj] = Ψᵢ₊₁[ii,jj])
+        (ii ≤ jj) && (i < k) && (T[indj,indi] = conj(Ψᵢ₊₁[ii,jj]))
+      end
+    end
+  end
+  return V, ψ₁, T
+end
+
+"""
+    V, Ψ, T, U, Φᴴ, Tᴴ = nonhermitian_lanczos(A, B, C, k)
+
+#### Input arguments
+
+* `A`: a linear operator that models a square matrix of dimension n;
+* `B`: a matrix of size n × p;
+* `C`: a matrix of size n × p;
+* `k`: the number of iterations of the block non-Hermitian Lanczos process.
+
+#### Output arguments
+
+* `V`: a dense n × p(k+1) matrix;
+* `Ψ`: a dense p × p upper triangular matrix such that V₁Ψ = B;
+* `T`: a sparse p(k+1) × pk block tridiagonal matrix with a bandwidth p;
+* `U`: a dense n × p(k+1) matrix;
+* `Φᴴ`: a dense p × p upper triangular matrix such that U₁Φᴴ = C;
+* `Tᴴ`: a sparse p(k+1) × pk block tridiagonal matrix with a bandwidth p.
+"""
+function nonhermitian_lanczos(A, B::AbstractMatrix{FC}, C::AbstractMatrix{FC}, k::Int) where FC <: FloatOrComplex
+  m, n = size(A)
+  t, p = size(B)
+  Aᴴ = A'
+  pivoting = VERSION < v"1.9" ? Val{false}() : NoPivot()
+
+  nnzT = p*p + (k-1)*p*(2*p+1) + div(p*(p+1), 2)
+  colptr = zeros(Int, p*k+1)
+  rowval = zeros(Int, nnzT)
+  nzval_T = zeros(FC, nnzT)
+  nzval_Tᴴ = zeros(FC, nnzT)
+
+  colptr[1] = 1
+  for j = 1:k*p
+    pos = colptr[j]
+    for i = max(1, j-p):j+p
+      rowval[pos] = i
+      pos += 1
+    end
+    colptr[j+1] = pos
+  end
+
+  V = zeros(FC, n, (k+1)*p)
+  U = zeros(FC, n, (k+1)*p)
+  T = SparseMatrixCSC((k+1)*p, k*p, colptr, rowval, nzval_T)
+  Tᴴ = SparseMatrixCSC((k+1)*p, k*p, colptr, rowval, nzval_Tᴴ)
+
+  α = -one(FC)
+  β = one(FC)
+  qᵥ = zeros(FC, n, p)
+  qᵤ = zeros(FC, n, p)
+  D = Ωᵢ = zeros(FC, p, p)
+  Ψ₁ = zeros(FC, p, p)
+  Φ₁ᴴ = zeros(FC, p, p)
+
+  local Φᵢ, Ψᵢ
+
+  for i = 1:k
+    pos1 = (i-1)*p + 1
+    pos2 = i*p
+    pos3 = pos1 + p
+    pos4 = pos2 + p
+    vᵢ = view(V,:,pos1:pos2)
+    vᵢ₊₁ = view(V,:,pos3:pos4)
+    uᵢ = view(U,:,pos1:pos2)
+    uᵢ₊₁ = view(U,:,pos3:pos4)
+
+    if i == 1
+      mul!(D, C', B)  # D = Cᴴ * B
+      F = lu(D, pivoting)
+      Φᵢ, Ψᵢ = F.L, F.U   # Φᵢ = F.P' * Φᵢ with pivoting
+      Ψ₁ .= Ψᵢ
+      Φ₁ᴴ .= Φᵢ'
+      # vᵢ .= (Ψᵢ' \ B')'
+      # uᵢ .= (Φᵢ \ C')'
+      ldiv!(vᵢ', UpperTriangular(Ψᵢ)', B')
+      ldiv!(uᵢ', LowerTriangular(Φᵢ), C')
+    end
+
+    mul!(qᵥ, A, vᵢ)
+    mul!(qᵤ, Aᴴ, uᵢ)
+
+    if i ≥ 2
+      pos5 = pos1 - p
+      pos6 = pos2 - p
+      vᵢ₋₁ = view(V,:,pos5:pos6)
+      uᵢ₋₁ = view(U,:,pos5:pos6)
+
+      mul!(qᵥ, vᵢ₋₁, Φᵢ, α, β)   # qᵥ = qᵥ - vᵢ₋₁ * Φᵢ
+      mul!(qᵤ, uᵢ₋₁, Ψᵢ', α, β)  # qᵤ = qᵤ - uᵢ₋₁ * Ψᵢᴴ
+    end
+
+    mul!(Ωᵢ, uᵢ', qᵥ)
+    mul!(qᵥ, vᵢ, Ωᵢ, α, β)   # qᵥ = qᵥ - vᵢ * Ωᵢ
+    mul!(qᵤ, uᵢ, Ωᵢ', α, β)  # qᵤ = qᵤ - uᵢ * Ωᵢᴴ
+
+    # Store the block Ωᵢ in Tₖ₊₁.ₖ
+    for ii = 1:p
+      indi = pos1+ii-1
+      for jj = 1:p
+        indj = pos1+jj-1
+        T[indi,indj] = Ωᵢ[ii,jj]
+        Tᴴ[indi,indj] = conj(Ωᵢ[jj,ii])
+      end
+    end
+
+    mul!(D, qᵤ', qᵥ)  # D = qᵤᴴ * qᵥ
+    F = lu(D, pivoting)
+    Φᵢ₊₁, Ψᵢ₊₁ = F.L, F.U  # Φᵢ₊₁ = F.P' * Φᵢ₊₁ with pivoting
+    # vᵢ₊₁ .= (Ψᵢ₊₁' \ qᵥ')'
+    # uᵢ₊₁ .= (Φᵢ₊₁ \ qᵤ')'
+    ldiv!(vᵢ₊₁', UpperTriangular(Ψᵢ₊₁)', qᵥ')
+    ldiv!(uᵢ₊₁', LowerTriangular(Φᵢ₊₁), qᵤ')
+    Φᵢ = Φᵢ₊₁
+    Ψᵢ = Ψᵢ₊₁
+
+    # Store the blocks Ψᵢ₊₁ and Φᵢ₊₁ in Tₖ₊₁.ₖ
+    for ii = 1:p
+      indi = pos3+ii-1
+      for jj = 1:p
+        indj = pos1+jj-1
+        (ii ≤ jj) && (T[indi,indj] = Ψᵢ₊₁[ii,jj])
+        (ii ≤ jj) && (Tᴴ[indi,indj] = conj(Φᵢ₊₁[jj,ii]))
+        (ii ≤ jj) && (i < k) && (Tᴴ[indj,indi] = conj(Ψᵢ₊₁[ii,jj]))
+        (ii ≤ jj) && (i < k) && (T[indj,indi] = Φᵢ₊₁[jj,ii])
+      end
+    end
+  end
+  return V, Ψ₁, T, U, Φ₁ᴴ, Tᴴ
+end
+
+"""
+    V, Γ, H = arnoldi(A, B, k; algo="householder", reorthogonalization=false)
+
+#### Input arguments
+
+* `A`: a linear operator that models a square matrix of dimension n;
+* `B`: a matrix of size n × p;
+* `k`: the number of iterations of the block Arnoldi process.
+
+#### Keyword arguments
+
+* `algo`: the algorithm to perform reduced QR factorizations ("gs", "mgs", "givens" or "householder").
+* `reorthogonalization`: reorthogonalize the new matrices of the block Krylov basis against all previous matrices.
+
+#### Output arguments
+
+* `V`: a dense n × p(k+1) matrix;
+* `Γ`: a dense p × p upper triangular matrix such that V₁Γ = B;
+* `H`: a dense p(k+1) × pk block upper Hessenberg matrix with a lower bandwidth p.
+"""
+function arnoldi(A, B::AbstractMatrix{FC}, k::Int; algo::String="householder", reorthogonalization::Bool=false) where FC <: FloatOrComplex
+  m, n = size(A)
+  t, p = size(B)
+
+  V = zeros(FC, n, (k+1)*p)
+  H = zeros(FC, (k+1)*p, k*p)
+
+  α = -one(FC)
+  β = one(FC)
+  q = zeros(FC, n, p)
+  Γ = zeros(FC, p, p)
+  Ψᵢⱼ = Ψtmp = zeros(FC, p, p)
+
+  for j = 1:k
+    pos1 = (j-1)*p + 1
+    pos2 = j*p
+    pos3 = pos1 + p
+    pos4 = pos2 + p
+    vⱼ = view(V,:,pos1:pos2)
+    vⱼ₊₁ = view(V,:,pos3:pos4)
+
+    if j == 1
+      q .= B
+      reduced_qr!(q, Γ, algo)
+      vⱼ .= q
+    end
+
+    mul!(q, A, vⱼ)
+
+    for i = 1:j
+      pos5 = (i-1)*p + 1
+      pos6 = i*p
+      vᵢ = view(V,:,pos5:pos6)
+      mul!(Ψᵢⱼ, vᵢ', q)       # Ψᵢⱼ = vᵢᴴ * q
+      mul!(q, vᵢ, Ψᵢⱼ, α, β)  # q = q - vᵢ * Ψᵢⱼ
+      H[pos5:pos6,pos1:pos2] .= Ψᵢⱼ
+    end
+
+    if reorthogonalization
+      for i = 1:j
+        pos5 = (i-1)*p + 1
+        pos6 = i*p
+        vᵢ = view(V,:,pos5:pos6)
+        mul!(Ψtmp, vᵢ', q)       # Ψtmp = vᵢᴴ * q
+        mul!(q, vᵢ, Ψtmp, α, β)  # q = q - vᵢ * Ψtmp
+        Hᵢⱼ = view(H,pos5:pos6,pos1:pos2)
+        Hᵢⱼ .= Hᵢⱼ .+ Ψtmp
+      end
+    end
+
+    Ψ = view(H, pos3:pos4,pos1:pos2)
+    reduced_qr!(q, Ψ, algo)
+    vⱼ₊₁ .= q
+  end
+  return V, Γ, H
+end
+
+"""
+    V, U, Ψ, L = golub_kahan(A, B, k; algo="householder")
+
+#### Input arguments
+
+* `A`: a linear operator that models a matrix of dimension m × n;
+* `B`: a matrix of size m × p;
+* `k`: the number of iterations of the block Golub-Kahan process.
+
+#### Keyword argument
+
+* `algo`: the algorithm to perform reduced QR factorizations ("gs", "mgs", "givens" or "householder").
+
+#### Output arguments
+
+* `V`: a dense n × p(k+1) matrix;
+* `U`: a dense m × p(k+1) matrix;
+* `Ψ`: a dense p × p upper triangular matrix such that U₁Ψ = B;
+* `L`: a sparse p(k+1) × p(k+1) block lower bidiagonal matrix with a lower bandwidth p.
+"""
+function golub_kahan(A, B::AbstractMatrix{FC}, k::Int; algo::String="householder") where FC <: FloatOrComplex
+  m, n = size(A)
+  t, p = size(B)
+  Aᴴ = A'
+
+  nnzL = p*k*(p+1) + div(p*(p+1), 2)
+  colptr = zeros(Int, p*(k+1)+1)
+  rowval = zeros(Int, nnzL)
+  nzval = zeros(FC, nnzL)
+
+  colptr[1] = 1
+  for j = 1:(k+1)*p
+    pos = colptr[j]
+    for i = j:min((k+1)*p,j+p)
+      rowval[pos] = i
+      pos += 1
+    end
+    colptr[j+1] = pos
+  end
+
+  V = zeros(FC, n, (k+1)*p)
+  U = zeros(FC, m, (k+1)*p)
+  L = SparseMatrixCSC((k+1)*p, (k+1)*p, colptr, rowval, nzval)
+
+  α = -one(FC)
+  β = one(FC)
+  qᵥ = zeros(FC, n, p)
+  qᵤ = zeros(FC, m, p)
+  Ψ₁ = zeros(FC, p, p)
+  Ψᵢ₊₁ = TΩᵢ = TΩᵢ₊₁ = zeros(FC, p, p)
+
+  for i = 1:k
+    pos1 = (i-1)*p + 1
+    pos2 = i*p
+    pos3 = pos1 + p
+    pos4 = pos2 + p
+    vᵢ = view(V,:,pos1:pos2)
+    vᵢ₊₁ = view(V,:,pos3:pos4)
+    uᵢ = view(U,:,pos1:pos2)
+    uᵢ₊₁ = view(U,:,pos3:pos4)
+
+    if i == 1
+      qᵤ .= B
+      reduced_qr!(qᵤ, Ψ₁, algo)
+      uᵢ .= qᵤ
+
+      mul!(qᵥ, Aᴴ, uᵢ)
+      reduced_qr!(qᵥ, TΩᵢ, algo)
+      vᵢ .= qᵥ
+
+      # Store the block Ω₁ in Lₖ₊₁.ₖ₊₁
+      for ii = 1:p
+        indi = pos1+ii-1
+        for jj = 1:p
+          indj = pos1+jj-1
+          (ii ≤ jj) && (L[indj,indi] = conj(TΩᵢ[ii,jj]))
+        end
+      end
+    end
+
+    mul!(qᵤ, A, vᵢ)
+    mul!(qᵤ, uᵢ, TΩᵢ', α, β)  # qᵤ = qᵤ - uᵢ * Ωᵢ
+
+    reduced_qr!(qᵤ, Ψᵢ₊₁, algo)
+    uᵢ₊₁ .= qᵤ
+
+    # Store the block Ψᵢ₊₁ in Lₖ₊₁.ₖ₊₁
+    for ii = 1:p
+      indi = pos3+ii-1
+      for jj = 1:p
+        indj = pos1+jj-1
+        (ii ≤ jj) && (L[indi,indj] = Ψᵢ₊₁[ii,jj])
+      end
+    end
+
+    mul!(qᵥ, Aᴴ, uᵢ₊₁)
+    mul!(qᵥ, vᵢ, Ψᵢ₊₁', α, β)  # qᵥ = qᵥ - vᵢ * Ψᵢ₊₁ᴴ
+
+    reduced_qr!(qᵥ, TΩᵢ₊₁, algo)
+    vᵢ₊₁ .= qᵥ
+
+    # Store the block Ωᵢ₊₁ in Lₖ₊₁.ₖ₊₁
+    for ii = 1:p
+      indi = pos3+ii-1
+      for jj = 1:p
+        indj = pos3+jj-1
+        (ii ≤ jj) && (L[indj,indi] = conj(TΩᵢ₊₁[ii,jj]))
+      end
+    end
+  end
+  return V, U, Ψ₁, L
+end
+
+"""
+    V, Ψ, T, U, Φᴴ, Tᴴ = saunders_simon_yip(A, B, C, k; algo="householder")
+
+#### Input arguments
+
+* `A`: a linear operator that models a matrix of dimension m × n;
+* `B`: a matrix of size m × p;
+* `C`: a matrix of size n × p;
+* `k`: the number of iterations of the block Saunders-Simon-Yip process.
+
+#### Keyword argument
+
+* `algo`: the algorithm to perform reduced QR factorizations ("gs", "mgs", "givens" or "householder").
+
+#### Output arguments
+
+* `V`: a dense m × p(k+1) matrix;
+* `Ψ`: a dense p × p upper triangular matrix such that V₁Ψ = B;
+* `T`: a sparse p(k+1) × pk block tridiagonal matrix with a bandwidth p;
+* `U`: a dense n × p(k+1) matrix;
+* `Φᴴ`: a dense p × p upper triangular matrix such that U₁Φᴴ = C;
+* `Tᴴ`: a sparse p(k+1) × pk block tridiagonal matrix with a bandwidth p.
+"""
+function saunders_simon_yip(A, B::AbstractMatrix{FC}, C::AbstractMatrix{FC}, k::Int; algo::String="householder") where FC <: FloatOrComplex
+  m, n = size(A)
+  t, p = size(B)
+  Aᴴ = A'
+
+  nnzT = p*p + (k-1)*p*(2*p+1) + div(p*(p+1), 2)
+  colptr = zeros(Int, p*k+1)
+  rowval = zeros(Int, nnzT)
+  nzval_T = zeros(FC, nnzT)
+  nzval_Tᴴ = zeros(FC, nnzT)
+
+  colptr[1] = 1
+  for j = 1:k*p
+    pos = colptr[j]
+    for i = max(1, j-p):j+p
+      rowval[pos] = i
+      pos += 1
+    end
+    colptr[j+1] = pos
+  end
+
+  V = zeros(FC, m, (k+1)*p)
+  U = zeros(FC, n, (k+1)*p)
+  T = SparseMatrixCSC((k+1)*p, k*p, colptr, rowval, nzval_T)
+  Tᴴ = SparseMatrixCSC((k+1)*p, k*p, colptr, rowval, nzval_Tᴴ)
+
+  α = -one(FC)
+  β = one(FC)
+  qᵥ = zeros(FC, m, p)
+  qᵤ = zeros(FC, n, p)
+  Ψ₁ = zeros(FC, p, p)
+  Φ₁ᴴ = zeros(FC, p, p)
+  Ωᵢ = Ψᵢ = Ψᵢ₊₁ = TΦᵢ = TΦᵢ₊₁ = zeros(FC, p, p)
+
+  for i = 1:k
+    pos1 = (i-1)*p + 1
+    pos2 = i*p
+    pos3 = pos1 + p
+    pos4 = pos2 + p
+    vᵢ = view(V,:,pos1:pos2)
+    vᵢ₊₁ = view(V,:,pos3:pos4)
+    uᵢ = view(U,:,pos1:pos2)
+    uᵢ₊₁ = view(U,:,pos3:pos4)
+
+    if i == 1
+      qᵥ .= B
+      reduced_qr!(qᵥ, Ψ₁, algo)
+      vᵢ .= qᵥ
+      qᵤ .= C
+      reduced_qr!(qᵤ, Φ₁ᴴ, algo)
+      uᵢ .= qᵤ
+    end
+
+    mul!(qᵥ, A, uᵢ)
+    mul!(qᵤ, Aᴴ, vᵢ)
+
+    if i ≥ 2
+      pos5 = pos1 - p
+      pos6 = pos2 - p
+      vᵢ₋₁ = view(V,:,pos5:pos6)
+      uᵢ₋₁ = view(U,:,pos5:pos6)
+
+      mul!(qᵥ, vᵢ₋₁, TΦᵢ', α, β)  # qᵥ = qᵥ - vᵢ₋₁ * Φᵢ
+      for ii = 1:p
+        indi = pos1+ii-1
+        for jj = 1:p
+          indj = pos5+jj-1
+          Ψᵢ[ii,jj] = T[indi,indj]
+        end
+      end
+      mul!(qᵤ, uᵢ₋₁, Ψᵢ', α, β)  # qᵤ = qᵤ - uᵢ₋₁ * Ψᵢᴴ
+    end
+
+    mul!(Ωᵢ, vᵢ', qᵥ)
+    mul!(qᵥ, vᵢ, Ωᵢ, α, β)   # qᵥ = qᵥ - vᵢ * Ωᵢ
+    mul!(qᵤ, uᵢ, Ωᵢ', α, β)  # qᵤ = qᵤ - uᵢ * Ωᵢᴴ
+
+    # Store the block Ωᵢ in Tₖ₊₁.ₖ
+    for ii = 1:p
+      indi = pos1+ii-1
+      for jj = 1:p
+        indj = pos1+jj-1
+        T[indi,indj] = Ωᵢ[ii,jj]
+        Tᴴ[indi,indj] = conj(Ωᵢ[jj,ii])
+      end
+    end
+
+    reduced_qr!(qᵥ, Ψᵢ₊₁, algo)
+    vᵢ₊₁ .= qᵥ
+
+    # Store the block Ψᵢ₊₁ in Tₖ₊₁.ₖ
+    for ii = 1:p
+      indi = pos3+ii-1
+      for jj = 1:p
+        indj = pos1+jj-1
+        (ii ≤ jj) && (T[indi,indj] = Ψᵢ₊₁[ii,jj])
+        (ii ≤ jj) && (i < k) && (Tᴴ[indj,indi] = conj(Ψᵢ₊₁[ii,jj]))
+      end
+    end
+
+    reduced_qr!(qᵤ, TΦᵢ₊₁, algo)
+    uᵢ₊₁ .= qᵤ
+
+    # Store the block Φᵢ₊₁ in Tₖ₊₁.ₖ
+    for ii = 1:p
+      indi = pos3+ii-1
+      for jj = 1:p
+        indj = pos1+jj-1
+        (ii ≤ jj) && (Tᴴ[indi,indj] = TΦᵢ₊₁[ii,jj])
+        (ii ≤ jj) && (i < k) && (T[indj,indi] = conj(TΦᵢ₊₁[ii,jj]))
+      end
+    end
+  end
+  return V, Ψ₁, T, U, Φ₁ᴴ, Tᴴ
+end
+
+"""
+    V, Γ, H, U, Λ, F = montoison_orban(A, B, D, C, k; algo="householder", reorthogonalization=false)
+
+#### Input arguments
+
+* `A`: a linear operator that models a matrix of dimension m × n;
+* `B`: a linear operator that models a matrix of dimension n × m;
+* `D`: a matrix of size m × p;
+* `C`: a matrix of size n × p;
+* `k`: the number of iterations of the block Montoison-Orban process.
+
+#### Keyword arguments
+
+* `algo`: the algorithm to perform reduced QR factorizations ("gs", "mgs", "givens" or "householder").
+* `reorthogonalization`: reorthogonalize the new matrices of the block Krylov basis against all previous matrices.
+
+#### Output arguments
+
+* `V`: a dense m × p(k+1) matrix;
+* `Γ`: a dense p × p upper triangular matrix such that V₁Γ = D;
+* `H`: a dense p(k+1) × pk block upper Hessenberg matrix with a lower bandwidth p;
+* `U`: a dense n × p(k+1) matrix;
+* `Λ`: a dense p × p upper triangular matrix such that U₁Λ = C;
+* `F`: a dense p(k+1) × pk block upper Hessenberg matrix with a lower bandwidth p.
+"""
+function montoison_orban(A, B, D::AbstractMatrix{FC}, C::AbstractMatrix{FC}, k::Int; algo::String="householder", reorthogonalization::Bool=false) where FC <: FloatOrComplex
+  m, n = size(A)
+  t, p = size(D)
+
+  V = zeros(FC, m, (k+1)*p)
+  U = zeros(FC, n, (k+1)*p)
+  H = zeros(FC, (k+1)*p, k*p)
+  F = zeros(FC, (k+1)*p, k*p)
+
+  α = -one(FC)
+  β = one(FC)
+  qᵥ = zeros(FC, m, p)
+  qᵤ = zeros(FC, n, p)
+  Γ = zeros(FC, p, p)
+  Λ = zeros(FC, p, p)
+  Ψᵢⱼ = Φᵢⱼ = Ψtmp = Φtmp = zeros(FC, p, p)
+
+  for j = 1:k
+    pos1 = (j-1)*p + 1
+    pos2 = j*p
+    pos3 = pos1 + p
+    pos4 = pos2 + p
+    vⱼ = view(V,:,pos1:pos2)
+    vⱼ₊₁ = view(V,:,pos3:pos4)
+    uⱼ = view(U,:,pos1:pos2)
+    uⱼ₊₁ = view(U,:,pos3:pos4)
+
+    if j == 1
+      qᵥ .= D
+      reduced_qr!(qᵥ, Γ, algo)
+      vⱼ .= qᵥ
+
+      qᵤ .= C
+      reduced_qr!(qᵤ, Λ, algo)
+      uⱼ .= qᵤ
+    end
+
+    mul!(qᵥ, A, uⱼ)
+    mul!(qᵤ, B, vⱼ)
+
+    for i = 1:j
+      pos5 = (i-1)*p + 1
+      pos6 = i*p
+      vᵢ = view(V,:,pos5:pos6)
+      uᵢ = view(U,:,pos5:pos6)
+
+      mul!(Ψᵢⱼ, vᵢ', qᵥ)       # Ψᵢⱼ = vᵢᴴ * qᵥ
+      mul!(qᵥ, vᵢ, Ψᵢⱼ, α, β)  # qᵥ = qᵥ - vᵢ * Ψᵢⱼ
+      H[pos5:pos6,pos1:pos2] .= Ψᵢⱼ
+
+      mul!(Φᵢⱼ, uᵢ', qᵤ)       # Φᵢⱼ = uᵢᴴ * qᵤ
+      mul!(qᵤ, uᵢ, Φᵢⱼ, α, β)  # qᵤ = qᵤ - uᵢ * Φᵢⱼ
+      F[pos5:pos6,pos1:pos2] .= Φᵢⱼ
+    end
+
+    if reorthogonalization
+      for i = 1:j
+        pos5 = (i-1)*p + 1
+        pos6 = i*p
+        vᵢ = view(V,:,pos5:pos6)
+        uᵢ = view(U,:,pos5:pos6)
+
+        mul!(Ψtmp, vᵢ', qᵥ)       # Ψtmp = vᵢᴴ * qᵥ
+        mul!(qᵥ, vᵢ, Ψtmp, α, β)  # qᵥ = qᵥ - vᵢ * Ψtmp
+        Hᵢⱼ = view(H,pos5:pos6,pos1:pos2)
+        Hᵢⱼ .= Hᵢⱼ .+ Ψtmp
+
+        mul!(Φtmp, uᵢ', qᵤ)       # Φtmp = uᵢᴴ * qᵤ
+        mul!(qᵤ, uᵢ, Φtmp, α, β)  # qᵤ = qᵤ - uᵢ * Φtmp
+        Fᵢⱼ = view(F,pos5:pos6,pos1:pos2)
+        Fᵢⱼ .= Fᵢⱼ .+ Φtmp
+      end
+    end
+
+    Ψ = view(H, pos3:pos4,pos1:pos2)
+    reduced_qr!(qᵥ, Ψ, algo)
+    vⱼ₊₁ .= qᵥ
+
+    Φ = view(F, pos3:pos4,pos1:pos2)
+    reduced_qr!(qᵤ, Φ, algo)
+    uⱼ₊₁ .= qᵤ
+  end
+  return V, Γ, H, U, Λ, F
+end
diff --git a/src/krylov_processes.jl b/src/krylov_processes.jl
index ebcf7f513..1541d0ab9 100644
--- a/src/krylov_processes.jl
+++ b/src/krylov_processes.jl
@@ -30,19 +30,20 @@ function hermitian_lanczos(A, b::AbstractVector{FC}, k::Int) where FC <: FloatOr
   nzval = zeros(R, 3k-1)
 
   colptr[1] = 1
-  rowval[1] = 1
-  rowval[2] = 2
   for i = 1:k
+    pos = colptr[i]
     colptr[i+1] = 3i
-    if i ≥ 2
-      pos = colptr[i]
+    if i == 1
+      rowval[pos] = i
+      rowval[pos+1] = i+1
+    else
       rowval[pos] = i-1
       rowval[pos+1] = i
       rowval[pos+2] = i+1
     end
   end
 
-  local β₁
+  β₁ = zero(R)
   V = M(undef, n, k+1)
   T = SparseMatrixCSC(k+1, k, colptr, rowval, nzval)
 
@@ -98,6 +99,7 @@ end
 function nonhermitian_lanczos(A, b::AbstractVector{FC}, c::AbstractVector{FC}, k::Int) where FC <: FloatOrComplex
   m, n = size(A)
   Aᴴ = A'
+  R = real(FC)
   S = ktypeof(b)
   M = vector_to_matrix(S)
 
@@ -107,19 +109,20 @@ function nonhermitian_lanczos(A, b::AbstractVector{FC}, c::AbstractVector{FC}, k
   nzval_Tᴴ = zeros(FC, 3k-1)
 
   colptr[1] = 1
-  rowval[1] = 1
-  rowval[2] = 2
   for i = 1:k
+    pos = colptr[i]
     colptr[i+1] = 3i
-    if i ≥ 2
-      pos = colptr[i]
+    if i == 1
+      rowval[pos] = i
+      rowval[pos+1] = i+1
+    else
       rowval[pos] = i-1
       rowval[pos+1] = i
       rowval[pos+2] = i+1
     end
   end
 
-  local β₁, γ₁ᴴ
+  β₁ = γ₁ᴴ = zero(R)
   V = M(undef, n, k+1)
   U = M(undef, n, k+1)
   T = SparseMatrixCSC(k+1, k, colptr, rowval, nzval_T)
@@ -178,7 +181,7 @@ end
 * `b`: a vector of length n;
 * `k`: the number of iterations of the Arnoldi process.
 
-#### Keyword arguments
+#### Keyword argument
 
 * `reorthogonalization`: reorthogonalize the new vectors of the Krylov basis against all previous vectors.
 
@@ -194,10 +197,11 @@ end
 """
 function arnoldi(A, b::AbstractVector{FC}, k::Int; reorthogonalization::Bool=false) where FC <: FloatOrComplex
   m, n = size(A)
+  R = real(FC)
   S = ktypeof(b)
   M = vector_to_matrix(S)
 
-  local β
+  β = zero(R)
   V = M(undef, n, k+1)
   H = zeros(FC, k+1, k)
 
@@ -261,16 +265,19 @@ function golub_kahan(A, b::AbstractVector{FC}, k::Int) where FC <: FloatOrComple
   nzval = zeros(R, 2k+1)
 
   colptr[1] = 1
-  for i = 1:k
+  for i = 1:k+1
     pos = colptr[i]
-    colptr[i+1] = pos+2
-    rowval[pos] = i
-    rowval[pos+1] = i+1
+    if i ≤ k
+      colptr[i+1] = pos + 2
+      rowval[pos] = i
+      rowval[pos+1] = i+1
+    else
+      colptr[i+1] = pos + 1
+      rowval[pos] = i
+    end
   end
-  rowval[2k+1] = k+1
-  colptr[k+2] = 2k+2
 
-  local β₁
+  β₁ = zero(R)
   V = M(undef, n, k+1)
   U = M(undef, m, k+1)
   L = SparseMatrixCSC(k+1, k+1, colptr, rowval, nzval)
@@ -332,6 +339,7 @@ end
 function saunders_simon_yip(A, b::AbstractVector{FC}, c::AbstractVector{FC}, k::Int) where FC <: FloatOrComplex
   m, n = size(A)
   Aᴴ = A'
+  R = real(FC)
   S = ktypeof(b)
   M = vector_to_matrix(S)
 
@@ -341,19 +349,20 @@ function saunders_simon_yip(A, b::AbstractVector{FC}, c::AbstractVector{FC}, k::
   nzval_Tᴴ = zeros(FC, 3k-1)
 
   colptr[1] = 1
-  rowval[1] = 1
-  rowval[2] = 2
   for i = 1:k
+    pos = colptr[i]
     colptr[i+1] = 3i
-    if i ≥ 2
-      pos = colptr[i]
+    if i == 1
+      rowval[pos] = i
+      rowval[pos+1] = i+1
+    else
       rowval[pos] = i-1
       rowval[pos+1] = i
       rowval[pos+2] = i+1
     end
   end
 
-  local β₁, γ₁ᴴ
+  β₁ = γ₁ᴴ = zero(R)
   V = M(undef, m, k+1)
   U = M(undef, n, k+1)
   T = SparseMatrixCSC(k+1, k, colptr, rowval, nzval_T)
@@ -412,7 +421,7 @@ end
 * `c`: a vector of length n;
 * `k`: the number of iterations of the Montoison-Orban process.
 
-#### Keyword arguments
+#### Keyword argument
 
 * `reorthogonalization`: reorthogonalize the new vectors of the Krylov basis against all previous vectors.
 
@@ -431,10 +440,11 @@ end
 """
 function montoison_orban(A, B, b::AbstractVector{FC}, c::AbstractVector{FC}, k::Int; reorthogonalization::Bool=false) where FC <: FloatOrComplex
   m, n = size(A)
+  R = real(FC)
   S = ktypeof(b)
   M = vector_to_matrix(S)
 
-  local β, γ
+  β = γ = zero(R)
   V = M(undef, m, k+1)
   U = M(undef, n, k+1)
   H = zeros(FC, k+1, k)
diff --git a/src/krylov_utils.jl b/src/krylov_utils.jl
index 7689c0f94..e06569c4a 100644
--- a/src/krylov_utils.jl
+++ b/src/krylov_utils.jl
@@ -322,6 +322,47 @@ kaxpby!(n :: Integer, s :: T, x :: AbstractVector{Complex{T}}, dx :: Integer, t
 kcopy!(n :: Integer, x :: Vector{T}, dx :: Integer, y :: Vector{T}, dy :: Integer) where T <: BLAS.BlasFloat = BLAS.blascopy!(n, x, dx, y, dy)
 kcopy!(n :: Integer, x :: AbstractVector{T}, dx :: Integer, y :: AbstractVector{T}, dy :: Integer) where T <: FloatOrComplex = copyto!(y, x)
 
+# geqrf -- Computes the QR factorization of a general m-by-n matrix.
+# orgqr -- Generates the real orthogonal matrix of the QR factorization formed by geqrf.
+# ungqr -- Generates the complex unitary matrix of the QR factorization formed by geqrf.
+# ormqr -- Multiplies a real matrix by the orthogonal matrix of the QR factorization formed by geqrf.
+# unmqr -- Multiplies a complex matrix by the unitary matrix of the QR factorization formed by geqrf.
+
+# getrf -- Computes the LU factorization of a general m-by-n matrix.
+# getrs -- Solves a system of linear equations with an LU-factored square matrix.
+# getri -- Computes the inverse of an LU-factored general matrix.
+
+# sytrf -- Computes the Bunch-Kaufman factorization of a symmetric matrix.
+# sytrs -- Solves a system of linear equations with an LDLᵀ-factored symmetric matrix.
+# sytri -- Computes the inverse of a symmetric matrix using LDLᵀ Bunch-Kaufman factorization.
+
+# hetrf -- Computes the Bunch-Kaufman factorization of a complex Hermitian matrix.
+# hetrs -- Solves a system of linear equations with an LDLᴴ-factored Hermitian matrix.
+# hetri -- Computes the inverse of a complex Hermitian matrix using LDLᴴ Bunch-Kaufman factorization.
+
+# potrf -- Computes the Cholesky factorization of a symmetric (Hermitian) positive-definite matrix.
+# potrs -- Solves a system of linear equations with a Cholesky-factored symmetric (Hermitian) positive-definite matrix.
+# potri -- Computes the inverse of a Cholesky-factored symmetric (Hermitian) positive-definite matrix.
+
+# trtrs -- Solves a system of linear equations with a triangular matrix.
+# trtri -- Computes the inverse of a triangular matrix.
+
+kgeqrf!(A :: Matrix{T}, tau :: Vector{T}) where T <: BLAS.BlasFloat = LAPACK.geqrf!(A, tau)
+korgqr!(A :: Matrix{T}, tau :: Vector{T}) where T <: BLAS.BlasFloat = LAPACK.orgqr!(A, tau)
+kormqr!(side :: Char, trans :: Char, A :: Matrix{T}, tau :: Vector{T}, C :: Matrix{T}) where T <: BLAS.BlasFloat = LAPACK.ormqr!(side, trans, A, tau, C)
+
+macro kgeqrf!(A, tau)
+  return esc(:(Krylov.kgeqrf!($A, $tau)))
+end
+
+macro korgqr!(A, tau)
+  return esc(:(Krylov.korgqr!($A, $tau)))
+end
+
+macro kormqr!(side, trans, A, tau, C)
+  return esc(:(Krylov.kormqr!($side, $trans, $A, $tau, $C)))
+end
+
 # the macros are just for readability, so we don't have to write the increments (always equal to 1)
 macro kdot(n, x, y)
   return esc(:(Krylov.kdot($n, $x, 1, $y, 1)))
diff --git a/src/processes_utils.jl b/src/processes_utils.jl
new file mode 100644
index 000000000..057cafb9d
--- /dev/null
+++ b/src/processes_utils.jl
@@ -0,0 +1,187 @@
+# """
+# Gram-Schmidt orthogonalization for a reduced QR decomposition:
+# Q, R = gs(A)
+#
+# Input :
+# A an n-by-k matrix, n ≥ k
+#
+# Output :
+# Q an n-by-k orthonormal matrix: QᴴQ = Iₖ
+# R an k-by-k upper triangular matrix: QR = A
+# """
+function gs(A::AbstractMatrix{FC}) where FC <: FloatOrComplex
+  n, k = size(A)
+  Q = copy(A)
+  R = zeros(FC, k, k)
+  v = zeros(FC, n)
+  gs!(Q, R, v)
+end
+
+function gs!(Q::AbstractMatrix{FC}, R::AbstractMatrix{FC}, v::AbstractVector{FC}) where FC <: FloatOrComplex
+  n, k = size(Q)
+  aⱼ = v
+  R .= zero(FC)
+  for j = 1:k
+    qⱼ = view(Q,:,j)
+    aⱼ .= qⱼ
+    for i = 1:j-1
+      qᵢ = view(Q,:,i)
+      R[i,j] = @kdot(n, qᵢ, aⱼ)    # rᵢⱼ = ⟨qᵢ , aⱼ⟩
+      @kaxpy!(n, -R[i,j], qᵢ, qⱼ)  # qⱼ = qⱼ - rᵢⱼqᵢ
+    end
+    R[j,j] = @knrm2(n, qⱼ)  # rⱼⱼ = ‖qⱼ‖
+    qⱼ ./= R[j,j]           # qⱼ = qⱼ / rⱼⱼ
+  end
+  return Q, R
+end
+
+# """
+# Modified Gram-Schmidt orthogonalization for a reduced QR decomposition:
+# Q, R = mgs(A)
+#
+# Input :
+# A an n-by-k matrix, n ≥ k
+#
+# # Q an n-by-k orthonormal matrix: QᴴQ = Iₖ
+# # R an k-by-k upper triangular matrix: QR = A
+# """
+function mgs(A::AbstractMatrix{FC}) where FC <: FloatOrComplex
+  n, k = size(A)
+  Q = copy(A)
+  R = zeros(FC, k, k)
+  mgs!(Q, R)
+end
+
+function mgs!(Q::AbstractMatrix{FC}, R::AbstractMatrix{FC}) where FC <: FloatOrComplex
+  n, k = size(Q)
+  R .= zero(FC)
+  for i = 1:k
+    qᵢ = view(Q,:,i)
+    R[i,i] = @knrm2(n, qᵢ)  # rᵢᵢ = ‖qᵢ‖
+    qᵢ ./= R[i,i]           # qᵢ = qᵢ / rᵢᵢ
+    for j = i+1:k
+      qⱼ = view(Q,:,j)
+      R[i,j] = @kdot(n, qᵢ, qⱼ)    # rᵢⱼ = ⟨qᵢ , qⱼ⟩
+      @kaxpy!(n, -R[i,j], qᵢ, qⱼ)  # qⱼ = qⱼ - rᵢⱼqᵢ
+    end
+  end
+  return Q, R
+end
+
+# Reduced QR factorization with Householder reflections:
+# Q, R = householder(A)
+#
+# Input :
+# A an n-by-k matrix, n ≥ k
+#
+# Output :
+# Q an n-by-k orthonormal matrix: QᴴQ = Iₖ
+# R an k-by-k upper triangular matrix: QR = A
+function householder(A::AbstractMatrix{FC}) where FC <: FloatOrComplex
+  n, k = size(A)
+  Q = copy(A)
+  τ = zeros(FC, k)
+  R = zeros(FC, k, k)
+  householder!(Q, R, τ)
+end
+
+function householder!(Q::AbstractMatrix{FC}, R::AbstractMatrix{FC}, τ::AbstractVector{FC}) where FC <: FloatOrComplex
+  n, k = size(Q)
+  R .= zero(FC)
+  @kgeqrf!(Q, τ)
+  for i = 1:k
+    for j = i:k
+      R[i,j] = Q[i,j]
+    end
+  end
+  @korgqr!(Q, τ)
+  return Q, R
+end
+
+# Reduced QR factorization with Givens reflections:
+# Q, R = givens(A)
+#
+# Input :
+# A an n-by-k matrix, n ≥ k
+#
+# # Q an n-by-k orthonormal matrix: QᴴQ = Iₖ
+# # R an k-by-k upper triangular matrix: QR = A
+# """
+function givens(A::AbstractMatrix{FC}) where FC <: FloatOrComplex
+  n, k = size(A)
+  nr = n*k - div(k*(k+1), 2)
+  T = real(FC)
+  Q = copy(A)
+  R = zeros(FC, k, k)
+  C = zeros(T, nr)
+  S = zeros(FC, nr)
+  givens!(Q, R, C, S)
+end
+
+function givens!(Q::AbstractMatrix{FC}, R::AbstractMatrix{FC}, C::AbstractVector{T}, S::AbstractVector{FC}) where {T <: AbstractFloat, FC <: FloatOrComplex{T}}
+  n, k = size(Q)
+  R .= zero(FC)
+  pos = 0
+  for j = 1:k
+    for i = n-1:-1:j
+      pos += 1
+      C[pos], S[pos], Q[i,j] = sym_givens(Q[i,j], Q[i+1,j])
+      if j < k
+        reflect!(view(Q, i, j+1:k), view(Q, i+1, j+1:k), C[pos], S[pos])
+      end
+    end
+  end
+  for j = 1:k
+    for i = 1:j
+      R[i,j] = Q[i,j]
+    end
+  end
+  Q .= zero(FC)
+  for i = 1:k
+    Q[i,i] = one(FC)
+  end
+  for j = k:-1:1
+    for i = j:n-1
+      reflect!(view(Q, i, j:k), view(Q, i+1, j:k), C[pos], S[pos])
+      pos -= 1
+    end
+  end
+  return Q, R
+end
+
+function reduced_qr!(Q::AbstractMatrix{FC}, R::AbstractMatrix{FC}, algo::String) where FC <: FloatOrComplex
+  n, k = size(Q)
+  T = real(FC)
+  if algo == "gs"
+    v = zeros(FC, n)
+    gs!(Q, R, v)
+  elseif algo == "mgs"
+    mgs!(Q, R)
+  elseif algo == "givens"
+    nr = n*k - div(k*(k+1), 2)
+    C = zeros(T, nr)
+    S = zeros(FC, nr)
+    givens!(Q, R, C, S)
+  elseif algo == "householder"
+    τ = zeros(FC, k)
+    householder!(Q, R, τ)
+  else
+    error("$algo is not a supported method to perform a reduced QR.")
+  end
+  return Q, R
+end
+
+function reduced_qr(A::AbstractMatrix{FC}, algo::String) where FC <: FloatOrComplex
+  if algo == "gs"
+    Q, R = gs(A)
+  elseif algo == "mgs"
+    Q, R = mgs(A)
+  elseif algo == "givens"
+    Q, R = givens(A)
+  elseif algo == "householder"
+    Q, R = householder(A)
+  else
+    error("$algo is not a supported method to perform a reduced QR.")
+  end
+  return Q, R
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index e2fdd6088..040d2edd1 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,9 +1,12 @@
 using Krylov, LinearAlgebra, SparseArrays, Printf, Random, Test
 import Krylov.KRYLOV_SOLVERS
 
+Random.seed!(666)
+
 include("test_utils.jl")
 include("test_aux.jl")
 include("test_stats.jl")
+include("test_block_processes.jl")
 include("test_processes.jl")
 
 include("test_minares.jl")
diff --git a/test/test_block_processes.jl b/test/test_block_processes.jl
new file mode 100644
index 000000000..2297dc238
--- /dev/null
+++ b/test/test_block_processes.jl
@@ -0,0 +1,234 @@
+@testset "Block processes" begin
+  m = 250
+  n = 500
+  k = 20
+  p = 4
+  s = 4
+  verbose = false
+
+  for FC in (Float64, ComplexF64)
+    nbits_FC = sizeof(FC)
+    nbits_I = sizeof(Int)
+
+    @testset "Data Type: $FC" begin
+
+      @testset "Block Hermitian Lanczos" begin
+        A = rand(FC, n, n)
+        A = A' * A
+        B = rand(FC, n, p)
+
+        @testset "algo = $algo" for algo in ("householder", "gs", "mgs", "givens")
+          V, Ψ₁, T = hermitian_lanczos(A, B, k; algo)
+
+          @test norm(V[:,1:p*s]' * V[:,1:p*s] - I) ≤ 1e-4
+          @test V[:,1:p] * Ψ₁ ≈ B
+          @test A * V[:,1:p*k] ≈ V * T
+        end
+
+        @testset "memory" begin
+          function storage_block_hermitian_lanczos_bytes(n, p, k)
+            nnzT = p*p + (k-1)*p*(2*p+1) + div(p*(p+1), 2)
+            memory = 0
+            memory += (p*k+1) * nbits_I       # Tₖ₊₁.ₖ -- colptr
+            memory += nnzT * nbits_I          # Tₖ₊₁.ₖ -- rowval
+            memory += nnzT * nbits_FC         # Tₖ₊₁.ₖ -- nzval
+            memory += p*p * nbits_FC          # Ψ₁
+            memory += p*p * nbits_FC          # Ωᵢ, Ψᵢ and Ψᵢ₊₁
+            memory += n*p * (k+1) * nbits_FC  # Vₖ₊₁
+            memory += p*n * nbits_FC          # q
+            return memory
+          end
+
+          expected_block_hermitian_lanczos_bytes = storage_block_hermitian_lanczos_bytes(n, p, k)
+          actual_block_hermitian_lanczos_bytes = @allocated hermitian_lanczos(A, B, k; algo="mgs")
+          verbose && println("Block Hermitian Lanczos | $FC")
+          verbose && println(expected_block_hermitian_lanczos_bytes, " ≤ ", actual_block_hermitian_lanczos_bytes, " ≤ ", 1.02 * expected_block_hermitian_lanczos_bytes, " ?")
+          verbose && println()
+          @test expected_block_hermitian_lanczos_bytes ≤ actual_block_hermitian_lanczos_bytes ≤ 1.02 * expected_block_hermitian_lanczos_bytes
+        end
+      end
+
+      @testset "Block Non-Hermitian Lanczos" begin
+        A = rand(FC, n, n)
+        B = rand(FC, n, p)
+        C = rand(FC, n, p)
+        V, Ψ₁, T, U, Φ₁ᴴ, Tᴴ = nonhermitian_lanczos(A, B, C, k)
+
+        @test norm(V[:,1:p*s]' * U[:,1:p*s] - I) ≤ 1e-4
+        @test norm(U[:,1:p*s]' * V[:,1:p*s] - I) ≤ 1e-4
+        @test V[:,1:p] * Ψ₁ ≈ B
+        @test U[:,1:p] * Φ₁ᴴ ≈ C
+        @test T[1:k*p,1:k*p] ≈ Tᴴ[1:k*p,1:k*p]'
+        @test A  * V[:,1:k*p] ≈ V * T
+        @test A' * U[:,1:k*p] ≈ U * Tᴴ
+
+        @testset "memory" begin
+          # storage_block_nonhermitian_lanczos_bytes(n, p, k) = ...
+          #
+          #   expected_block_nonhermitian_lanczos_bytes = storage_block_nonhermitian_lanczos_bytes(n, p, k)
+          #   actual_block_nonhermitian_lanczos_bytes = @allocated nonhermitian_lanczos(A, B, C, k)
+          #   verbose && println("Block non-Hermitian Lanczos")
+          #   verbose && println(expected_block_nonhermitian_lanczos_bytes, " ≤ ", actual_block_nonhermitian_lanczos_bytes, " ≤ ", 1.02 * expected_block_nonhermitian_lanczos_bytes, " ?")
+          #   verbose && println()
+          #   @test expected_block_nonhermitian_lanczos_bytes ≤ actual_block_nonhermitian_lanczos_bytes ≤ 1.02 * expected_block_nonhermitian_lanczos_bytes
+        end
+      end
+
+      @testset "Block Arnoldi" begin
+        A = rand(FC, n, n)
+        B = rand(FC, n, p)
+
+        @testset "algo = $algo" for algo in ("householder", "gs", "mgs", "givens")
+          @testset "reorthogonalization = $reorthogonalization" for reorthogonalization in (false, true)
+            V, Γ, H = arnoldi(A, B, k; algo, reorthogonalization)
+
+            @test norm(V[:,1:p*s]' * V[:,1:p*s] - I) ≤ 1e-4
+            @test V[:,1:p] * Γ ≈ B
+            @test A * V[:,1:p*k] ≈ V * H
+          end
+        end
+
+        @testset "memory -- reorthogonalization = $reorthogonalization" for reorthogonalization in (false, true)
+          function storage_block_arnoldi_bytes(n, p, k)
+            memory = 0
+            memory += p*n * (k+1) * nbits_FC    # Vₖ₊₁
+            memory += n*p * nbits_FC            # q
+            memory += p*k * p*(k+1) * nbits_FC  # Hₖ₊₁.ₖ
+            memory += p*p * nbits_FC            # Γ
+            memory += p*p * nbits_FC            # Ψᵢⱼ and Ψtmp
+            return memory
+          end
+
+          expected_block_arnoldi_bytes = storage_block_arnoldi_bytes(n, p, k)
+          actual_block_arnoldi_bytes = @allocated arnoldi(A, B, k; algo="mgs", reorthogonalization)
+          verbose && println("Block Arnoldi | $FC")
+          verbose && println(expected_block_arnoldi_bytes, " ≤ ", actual_block_arnoldi_bytes, " ≤ ", 1.02 * expected_block_arnoldi_bytes, " ?")
+          verbose && println()
+          @test expected_block_arnoldi_bytes ≤ actual_block_arnoldi_bytes ≤ 1.02 * expected_block_arnoldi_bytes
+        end
+      end
+
+      @testset "Block Golub-Kahan" begin
+        A = rand(FC, m, n)
+        B = rand(FC, m, p)
+
+        @testset "algo = $algo" for algo in ("householder", "gs", "mgs", "givens")
+          V, U, Ψ₁, L = golub_kahan(A, B, k; algo)
+          BL = L[1:(k+1)*p,1:k*p]
+
+          @test norm(V[:,1:p*s]' * V[:,1:p*s] - I) ≤ 1e-4
+          @test norm(U[:,1:p*s]' * U[:,1:p*s] - I) ≤ 1e-4
+          @test U[:,1:p] * Ψ₁ ≈ B
+          @test A  * V[:,1:k*p] ≈ U * BL
+          @test A' * U ≈ V * L'
+          @test A' * A  * V[:,1:k*p] ≈ V * L' * BL
+          @test A  * A' * U[:,1:k*p] ≈ U * BL * L[1:k*p,1:k*p]'
+        end
+
+        @testset "memory" begin
+          function storage_block_golub_kahan_bytes(m, n, p, k)
+            nnzL = p*k*(p+1) + div(p*(p+1), 2)
+            memory = 0
+            memory += (p*(k+1)+1) * nbits_I       # Lₖ₊₁ -- colptr
+            memory += nnzL * nbits_I              # Lₖ₊₁ -- rowval
+            memory += nnzL * nbits_FC             # Lₖ₊₁ -- nzval
+            memory += p*p * nbits_FC              # Ψ₁
+            memory += p*p * nbits_FC              # Ψᵢ₊₁, TΩᵢ and TΩᵢ₊₁
+            memory += p*(n+m) * (k+1) * nbits_FC  # Vₖ₊₁ and Uₖ₊₁
+            memory += p*(n+m) * nbits_FC          # qᵥ and qᵤ
+            return memory
+          end
+
+          expected_block_golub_kahan_bytes = storage_block_golub_kahan_bytes(m, n, p, k)
+          actual_block_golub_kahan_bytes = @allocated golub_kahan(A, B, k; algo="mgs")
+          verbose && println("Block Golub-Kahan | $FC")
+          verbose && println(expected_block_golub_kahan_bytes, " ≤ ", actual_block_golub_kahan_bytes, " ≤ ", 1.02 * expected_block_golub_kahan_bytes, " ?")
+          verbose && println()
+          @test expected_block_golub_kahan_bytes ≤ actual_block_golub_kahan_bytes ≤ 1.02 * expected_block_golub_kahan_bytes
+        end
+      end
+
+      @testset "Block Saunders-Simon-Yip" begin
+        A = rand(FC, m, n)
+        B = rand(FC, m, p)
+        C = rand(FC, n, p)
+
+        @testset "algo = $algo" for algo in ("householder", "gs", "mgs", "givens")
+          V, Ψ₁, T, U, Φ₁ᴴ, Tᴴ = saunders_simon_yip(A, B, C, k; algo)
+
+          @test norm(V[:,1:p*s]' * V[:,1:p*s] - I) ≤ 1e-4
+          @test norm(U[:,1:p*s]' * U[:,1:p*s] - I) ≤ 1e-4
+          @test V[:,1:p] * Ψ₁ ≈ B
+          @test U[:,1:p] * Φ₁ᴴ ≈ C
+          @test T[1:k*p,1:k*p] ≈ Tᴴ[1:k*p,1:k*p]'
+          @test A  * U[:,1:k*p] ≈ V * T
+          @test A' * V[:,1:k*p] ≈ U * Tᴴ
+          @test A' * A  * U[:,1:(k-1)*p] ≈ U * Tᴴ * T[1:k*p,1:(k-1)*p]
+          @test A  * A' * V[:,1:(k-1)*p] ≈ V * T * Tᴴ[1:k*p,1:(k-1)*p]
+        end
+
+        @testset "memory" begin
+          function storage_block_saunders_simon_yip_bytes(m, n, p, k)
+            nnzT = p*p + (k-1)*p*(2*p+1) + div(p*(p+1), 2)
+            memory = 0
+            memory += (p*k+1) * nbits_I           # Tₖ₊₁.ₖ and (Tₖ.ₖ₊₁)ᴴ -- colptr
+            memory += nnzT * nbits_I              # Tₖ₊₁.ₖ and (Tₖ.ₖ₊₁)ᴴ -- rowval
+            memory += 2 * nnzT * nbits_FC         # Tₖ₊₁.ₖ and (Tₖ.ₖ₊₁)ᴴ -- nzval
+            memory += 2 * p*p * nbits_FC          # Ψ₁ and Φ₁ᴴ
+            memory += p*p * nbits_FC              # Ωᵢ, Ψᵢ, Ψᵢ₊₁, TΦᵢ and TΦᵢ₊₁
+            memory += p*(n+m) * (k+1) * nbits_FC  # Vₖ₊₁ and Uₖ₊₁
+            memory += p*(n+m) * nbits_FC          # qᵥ and qᵤ
+          end
+
+          expected_block_saunders_simon_yip_bytes = storage_block_saunders_simon_yip_bytes(m, n, p, k)
+          actual_block_saunders_simon_yip_bytes = @allocated saunders_simon_yip(A, B, C, k; algo="mgs")
+          verbose && println("Block Saunders-Simon-Yip")
+          verbose && println(expected_block_saunders_simon_yip_bytes, " ≤ ", actual_block_saunders_simon_yip_bytes, " ≤ ", 1.02 * expected_block_saunders_simon_yip_bytes, " ?")
+          verbose && println()
+          @test expected_block_saunders_simon_yip_bytes ≤ actual_block_saunders_simon_yip_bytes ≤ 1.02 * expected_block_saunders_simon_yip_bytes
+        end
+      end
+
+      @testset "Block Montoison-Orban" begin
+        A = rand(FC, m, n)
+        B = rand(FC, n, m)
+        D = rand(FC, m, p)
+        C = rand(FC, n, p)
+
+        @testset "algo = $algo" for algo in ("householder", "gs", "mgs", "givens")
+          @testset "reorthogonalization = $reorthogonalization" for reorthogonalization in (false, true)
+            V, Γ, H, U, Λ, F = montoison_orban(A, B, D, C, k; algo, reorthogonalization)
+
+            @test norm(V[:,1:p*s]' * V[:,1:p*s] - I) ≤ 1e-4
+            @test norm(U[:,1:p*s]' * U[:,1:p*s] - I) ≤ 1e-4
+            @test V[:,1:p] * Γ ≈ D
+            @test U[:,1:p] * Λ ≈ C
+            @test A * U[:,1:k*p] ≈ V * H
+            @test B * V[:,1:k*p] ≈ U * F
+            @test B * A * U[:,1:(k-1)*p] ≈ U * F * H[1:k*p,1:(k-1)*p]
+            @test A * B * V[:,1:(k-1)*p] ≈ V * H * F[1:k*p,1:(k-1)*p]
+          end
+        end
+
+        @testset "memory -- reorthogonalization = $reorthogonalization" for reorthogonalization in (false, true)
+          function storage_block_montoison_orban_bytes(m, n, p, k)
+            memory = 0
+            memory += 2 * p*p * nbits_FC            # Γ and Λ
+            memory += p*p * nbits_FC                # Ψᵢⱼ, Φᵢⱼ, Ψtmp and Φtmp
+            memory += 2 * p*k * p*(k+1) * nbits_FC  # Hₖ₊₁.ₖ and Fₖ₊₁.ₖ
+            memory += p*(n+m) * (k+1) * nbits_FC    # Vₖ₊₁ and Uₖ₊₁
+            memory += p*(n+m)* nbits_FC             # qᵥ and qᵤ
+            return memory
+          end
+
+          expected_block_montoison_orban_bytes = storage_block_montoison_orban_bytes(m, n, p, k)
+          actual_block_montoison_orban_bytes = @allocated montoison_orban(A, B, D, C, k; algo="mgs", reorthogonalization)
+          verbose && println("Block Montoison-Orban | $FC")
+          verbose && println(expected_block_montoison_orban_bytes, " ≤ ", actual_block_montoison_orban_bytes, " ≤ ", 1.02 * expected_block_montoison_orban_bytes, " ?")
+          verbose && println()
+          @test expected_block_montoison_orban_bytes ≤ actual_block_montoison_orban_bytes ≤ 1.02 * expected_block_montoison_orban_bytes
+        end
+      end
+    end
+  end
+end
diff --git a/test/test_cr.jl b/test/test_cr.jl
index 5ae0774bd..45366ce45 100644
--- a/test/test_cr.jl
+++ b/test/test_cr.jl
@@ -23,7 +23,6 @@
 
         # Sparse Laplacian
         A, _ = sparse_laplacian(FC=FC)
-        Random.seed!(0)
         b = randn(size(A, 1))
         itmax = 0
         # case: ‖x*‖ > Δ
diff --git a/test/test_processes.jl b/test/test_processes.jl
index 237f70ce5..05c0f4162 100644
--- a/test/test_processes.jl
+++ b/test/test_processes.jl
@@ -20,7 +20,7 @@ end
   n = 500
   k = 20
   s = 5
-  
+
   for FC in (Float64, ComplexF64)
     R = real(FC)
     nbits_FC = sizeof(FC)
@@ -28,7 +28,7 @@ end
     nbits_I = sizeof(Int)
 
     @testset "Data Type: $FC" begin
-      
+
       @testset "Hermitian Lanczos" begin
         A = rand(FC, n, n)
         A = A' * A