From 9f25aec48a182e7e480b1d278a767819bea1dd5c Mon Sep 17 00:00:00 2001
From: Alexis Montoison <alexis.montoison@polymtl.ca>
Date: Sun, 19 May 2024 23:24:07 -0400
Subject: [PATCH] Support preconditioners in BiLQ and QMR

---
 docs/src/preconditioners.md |  5 ++-
 src/bilq.jl                 | 73 +++++++++++++++++++++++++++----------
 src/krylov_solvers.jl       | 12 +++++-
 src/qmr.jl                  | 70 ++++++++++++++++++++++++++---------
 src/usymlq.jl               |  3 +-
 src/usymqr.jl               |  3 +-
 test/test_bicgstab.jl       |  4 +-
 test/test_bilq.jl           | 25 ++++++++++++-
 test/test_cgs.jl            |  4 +-
 test/test_qmr.jl            | 24 ++++++++++++
 10 files changed, 175 insertions(+), 48 deletions(-)

diff --git a/docs/src/preconditioners.md b/docs/src/preconditioners.md
index 22e24ce90..38da24cc5 100644
--- a/docs/src/preconditioners.md
+++ b/docs/src/preconditioners.md
@@ -30,12 +30,13 @@ Krylov.jl supports both approaches thanks to the argument `ldiv` of the Krylov s
 ## How to use preconditioners in Krylov.jl?
 
 !!! info
-    - A preconditioner only need support the operation `mul!(y, P⁻¹, x)` when `ldiv=false` or `ldiv!(y, P, x)` when `ldiv=true` to be used in Krylov.jl.
+    - A preconditioner only needs to support the operation `mul!(y, P⁻¹, x)` when `ldiv=false` or `ldiv!(y, P, x)` when `ldiv=true` to be used in Krylov.jl.
+    - Additional support for `adjoint` with preconditioners is required in the methods [`BILQ`](@ref bilq) and [`QMR`](@ref qmr).
     - The default value of a preconditioner in Krylov.jl is the identity operator `I`.
 
 ### Square non-Hermitian linear systems
 
-Methods concerned: [`CGS`](@ref cgs), [`BiCGSTAB`](@ref bicgstab), [`DQGMRES`](@ref dqgmres), [`GMRES`](@ref gmres), [`BLOCK-GMRES`](@ref block_gmres), [`FGMRES`](@ref fgmres), [`DIOM`](@ref diom) and [`FOM`](@ref fom).
+Methods concerned: [`CGS`](@ref cgs), [`BILQ`](@ref bilq), [`QMR`](@ref qmr), [`BiCGSTAB`](@ref bicgstab), [`DQGMRES`](@ref dqgmres), [`GMRES`](@ref gmres), [`BLOCK-GMRES`](@ref block_gmres), [`FGMRES`](@ref fgmres), [`DIOM`](@ref diom) and [`FOM`](@ref fom).
 
 A Krylov method dedicated to non-Hermitian linear systems allows the three variants of preconditioning.
 
diff --git a/src/bilq.jl b/src/bilq.jl
index 2e8823e93..285c2d36a 100644
--- a/src/bilq.jl
+++ b/src/bilq.jl
@@ -15,8 +15,9 @@ export bilq, bilq!
 """
     (x, stats) = bilq(A, b::AbstractVector{FC};
                       c::AbstractVector{FC}=b, transfer_to_bicg::Bool=true,
-                      atol::T=√eps(T), rtol::T=√eps(T), itmax::Int=0,
-                      timemax::Float64=Inf, verbose::Int=0, history::Bool=false,
+                      M=I, N=I, ldiv::Bool=false, atol::T=√eps(T),
+                      rtol::T=√eps(T), itmax::Int=0, timemax::Float64=Inf,
+                      verbose::Int=0, history::Bool=false,
                       callback=solver->false, iostream::IO=kstdout)
 
 `T` is an `AbstractFloat` such as `Float32`, `Float64` or `BigFloat`.
@@ -30,6 +31,7 @@ Solve the square linear system Ax = b of size n using BiLQ.
 BiLQ is based on the Lanczos biorthogonalization process and requires two initial vectors `b` and `c`.
 The relation `bᴴc ≠ 0` must be satisfied and by default `c = b`.
 When `A` is Hermitian and `b = c`, BiLQ is equivalent to SYMMLQ.
+BiLQ requires support for `adjoint(M)` and `adjoint(N)` if preconditioners are provided.
 
 #### Input arguments
 
@@ -44,6 +46,9 @@ When `A` is Hermitian and `b = c`, BiLQ is equivalent to SYMMLQ.
 
 * `c`: the second initial vector of length `n` required by the Lanczos biorthogonalization process;
 * `transfer_to_bicg`: transfer from the BiLQ point to the BiCG point, when it exists. The transfer is based on the residual norm;
+* `M`: linear operator that models a nonsingular matrix of size `n` used for left preconditioning;
+* `N`: linear operator that models a nonsingular matrix of size `n` used for right preconditioning;
+* `ldiv`: define whether the preconditioners use `ldiv!` or `mul!`;
 * `atol`: absolute stopping tolerance based on the residual norm;
 * `rtol`: relative stopping tolerance based on the residual norm;
 * `itmax`: the maximum number of iterations. If `itmax=0`, the default number of iterations is set to `2n`;
@@ -82,6 +87,9 @@ def_optargs_bilq = (:(x0::AbstractVector),)
 
 def_kwargs_bilq = (:(; c::AbstractVector{FC} = b    ),
                    :(; transfer_to_bicg::Bool = true),
+                   :(; M = I                        ),
+                   :(; N = I                        ),
+                   :(; ldiv::Bool = false           ),
                    :(; atol::T = √eps(T)            ),
                    :(; rtol::T = √eps(T)            ),
                    :(; itmax::Int = 0               ),
@@ -95,7 +103,7 @@ def_kwargs_bilq = mapreduce(extract_parameters, vcat, def_kwargs_bilq)
 
 args_bilq = (:A, :b)
 optargs_bilq = (:x0,)
-kwargs_bilq = (:c, :transfer_to_bicg, :atol, :rtol, :itmax, :timemax, :verbose, :history, :callback, :iostream)
+kwargs_bilq = (:c, :transfer_to_bicg, :M, :N, :ldiv, :atol, :rtol, :itmax, :timemax, :verbose, :history, :callback, :iostream)
 
 @eval begin
   function bilq($(def_args_bilq...), $(def_optargs_bilq...); $(def_kwargs_bilq...)) where {T <: AbstractFloat, FC <: FloatOrComplex{T}}
@@ -131,26 +139,42 @@ kwargs_bilq = (:c, :transfer_to_bicg, :atol, :rtol, :itmax, :timemax, :verbose,
     length(b) == m || error("Inconsistent problem size")
     (verbose > 0) && @printf(iostream, "BILQ: system of size %d\n", n)
 
+    # Check M = Iₙ and N = Iₙ
+    MisI = (M === I)
+    NisI = (N === I)
+
     # Check type consistency
     eltype(A) == FC || @warn "eltype(A) ≠ $FC. This could lead to errors or additional allocations in operator-vector products."
     ktypeof(b) <: S || error("ktypeof(b) is not a subtype of $S")
     ktypeof(c) <: S || error("ktypeof(c) is not a subtype of $S")
 
-    # Compute the adjoint of A
+    # Compute the adjoint of A, M and N
     Aᴴ = A'
+    Mᴴ = M'
+    Nᴴ = N'
 
     # Set up workspace.
+    allocate_if(!MisI, solver, :t, S, n)
+    allocate_if(!NisI, solver, :s, S, n)
     uₖ₋₁, uₖ, q, vₖ₋₁, vₖ = solver.uₖ₋₁, solver.uₖ, solver.q, solver.vₖ₋₁, solver.vₖ
     p, Δx, x, d̅, stats = solver.p, solver.Δx, solver.x, solver.d̅, solver.stats
     warm_start = solver.warm_start
     rNorms = stats.residuals
     reset!(stats)
     r₀ = warm_start ? q : b
+    Mᴴuₖ = MisI ? uₖ : solver.t
+    t = MisI ? q : solver.t
+    Nvₖ = NisI ? vₖ : solver.s
+    s = NisI ? p : solver.s
 
     if warm_start
       mul!(r₀, A, Δx)
       @kaxpby!(n, one(FC), b, -one(FC), r₀)
     end
+    if !MisI
+      mul!(p, M, r₀)
+      r₀ = p
+    end
 
     # Initial solution x₀ and residual norm ‖r₀‖.
     x .= zero(FC)
@@ -170,10 +194,6 @@ kwargs_bilq = (:c, :transfer_to_bicg, :atol, :rtol, :itmax, :timemax, :verbose,
     iter = 0
     itmax == 0 && (itmax = 2*n)
 
-    ε = atol + rtol * bNorm
-    (verbose > 0) && @printf(iostream, "%5s  %7s  %5s\n", "k", "‖rₖ‖", "timer")
-    kdisplay(iter, verbose) && @printf(iostream, "%5d  %7.1e  %.2fs\n", iter, bNorm, ktimer(start_time))
-
     # Initialize the Lanczos biorthogonalization process.
     cᴴb = @kdot(n, c, r₀)  # ⟨c,r₀⟩
     if cᴴb == 0
@@ -186,6 +206,10 @@ kwargs_bilq = (:c, :transfer_to_bicg, :atol, :rtol, :itmax, :timemax, :verbose,
       return solver
     end
 
+    ε = atol + rtol * bNorm
+    (verbose > 0) && @printf(iostream, "%5s  %8s  %7s  %5s\n", "k", "αₖ", "‖rₖ‖", "timer")
+    kdisplay(iter, verbose) && @printf(iostream, "%5d  %8.1e  %7.1e  %.2fs\n", iter, cᴴb, bNorm, ktimer(start_time))
+
     βₖ = √(abs(cᴴb))            # β₁γ₁ = cᴴ(b - Ax₀)
     γₖ = cᴴb / βₖ               # β₁γ₁ = cᴴ(b - Ax₀)
     vₖ₋₁ .= zero(FC)            # v₀ = 0
@@ -214,23 +238,30 @@ kwargs_bilq = (:c, :transfer_to_bicg, :atol, :rtol, :itmax, :timemax, :verbose,
       iter = iter + 1
 
       # Continue the Lanczos biorthogonalization process.
-      # AVₖ  = VₖTₖ    + βₖ₊₁vₖ₊₁(eₖ)ᵀ = Vₖ₊₁Tₖ₊₁.ₖ
-      # AᴴUₖ = Uₖ(Tₖ)ᴴ + γ̄ₖ₊₁uₖ₊₁(eₖ)ᵀ = Uₖ₊₁(Tₖ.ₖ₊₁)ᴴ
+      # MANVₖ    = VₖTₖ    + βₖ₊₁vₖ₊₁(eₖ)ᵀ = Vₖ₊₁Tₖ₊₁.ₖ
+      # NᴴAᴴMᴴUₖ = Uₖ(Tₖ)ᴴ + γ̄ₖ₊₁uₖ₊₁(eₖ)ᵀ = Uₖ₊₁(Tₖ.ₖ₊₁)ᴴ
 
-      mul!(q, A , vₖ)  # Forms vₖ₊₁ : q ← Avₖ
-      mul!(p, Aᴴ, uₖ)  # Forms uₖ₊₁ : p ← Aᴴuₖ
+      # Forms vₖ₊₁ : q ← MANvₖ
+      NisI || mulorldiv!(Nvₖ, N, vₖ, ldiv)
+      mul!(t, A, Nvₖ)
+      MisI || mulorldiv!(q, M, t, ldiv)
+
+      # Forms uₖ₊₁ : p ← NᴴAᴴMᴴuₖ
+      MisI || mulorldiv!(Mᴴuₖ, Mᴴ, uₖ, ldiv)
+      mul!(s, Aᴴ, Mᴴuₖ)
+      NisI || mulorldiv!(p, Nᴴ, s, ldiv)
 
       @kaxpy!(n, -γₖ, vₖ₋₁, q)  # q ← q - γₖ * vₖ₋₁
       @kaxpy!(n, -βₖ, uₖ₋₁, p)  # p ← p - β̄ₖ * uₖ₋₁
 
-      αₖ = @kdot(n, uₖ, q)      # αₖ = ⟨uₖ,q⟩
+      αₖ = @kdot(n, uₖ, q)  # αₖ = ⟨uₖ,q⟩
 
-      @kaxpy!(n, -     αₖ , vₖ, q)    # q ← q - αₖ * vₖ
-      @kaxpy!(n, -conj(αₖ), uₖ, p)    # p ← p - ᾱₖ * uₖ
+      @kaxpy!(n, -     αₖ , vₖ, q)  # q ← q - αₖ * vₖ
+      @kaxpy!(n, -conj(αₖ), uₖ, p)  # p ← p - ᾱₖ * uₖ
 
-      pᴴq = @kdot(n, p, q)      # pᴴq  = ⟨p,q⟩
-      βₖ₊₁ = √(abs(pᴴq))        # βₖ₊₁ = √(|pᴴq|)
-      γₖ₊₁ = pᴴq / βₖ₊₁         # γₖ₊₁ = pᴴq / βₖ₊₁
+      pᴴq = @kdot(n, p, q)  # pᴴq  = ⟨p,q⟩
+      βₖ₊₁ = √(abs(pᴴq))    # βₖ₊₁ = √(|pᴴq|)
+      γₖ₊₁ = pᴴq / βₖ₊₁     # γₖ₊₁ = pᴴq / βₖ₊₁
 
       # Update the LQ factorization of Tₖ = L̅ₖQₖ.
       # [ α₁ γ₂ 0  •  •  •  0 ]   [ δ₁   0    •   •   •    •    0   ]
@@ -353,7 +384,7 @@ kwargs_bilq = (:c, :transfer_to_bicg, :atol, :rtol, :itmax, :timemax, :verbose,
       breakdown = !solved_lq && !solved_cg && (pᴴq == 0)
       timer = time_ns() - start_time
       overtimed = timer > timemax_ns
-      kdisplay(iter, verbose) && @printf(iostream, "%5d  %7.1e  %.2fs\n", iter, rNorm_lq, ktimer(start_time))
+      kdisplay(iter, verbose) && @printf(iostream, "%5d  %8.1e  %7.1e  %.2fs\n", iter, αₖ, rNorm_lq, ktimer(start_time))
     end
     (verbose > 0) && @printf(iostream, "\n")
 
@@ -372,6 +403,10 @@ kwargs_bilq = (:c, :transfer_to_bicg, :atol, :rtol, :itmax, :timemax, :verbose,
     overtimed           && (status = "time limit exceeded")
 
     # Update x
+    if !NisI
+      copyto!(solver.s, x)
+      mulorldiv!(x, N, solver.s, ldiv)
+    end
     warm_start && @kaxpy!(n, one(FC), Δx, x)
     solver.warm_start = false
 
diff --git a/src/krylov_solvers.jl b/src/krylov_solvers.jl
index 13c98f823..d9f9fd9b1 100644
--- a/src/krylov_solvers.jl
+++ b/src/krylov_solvers.jl
@@ -1002,6 +1002,8 @@ mutable struct BilqSolver{T,FC,S} <: KrylovSolver{T,FC,S}
   Δx         :: S
   x          :: S
   d̅          :: S
+  t          :: S
+  s          :: S
   warm_start :: Bool
   stats      :: SimpleStats{T}
 end
@@ -1018,8 +1020,10 @@ function BilqSolver(m, n, S)
   Δx   = S(undef, 0)
   x    = S(undef, n)
   d̅    = S(undef, n)
+  t    = S(undef, 0)
+  s    = S(undef, 0)
   stats = SimpleStats(0, false, false, T[], T[], T[], 0.0, "unknown")
-  solver = BilqSolver{T,FC,S}(m, n, uₖ₋₁, uₖ, q, vₖ₋₁, vₖ, p, Δx, x, d̅, false, stats)
+  solver = BilqSolver{T,FC,S}(m, n, uₖ₋₁, uₖ, q, vₖ₋₁, vₖ, p, Δx, x, d̅, t, s, false, stats)
   return solver
 end
 
@@ -1052,6 +1056,8 @@ mutable struct QmrSolver{T,FC,S} <: KrylovSolver{T,FC,S}
   x          :: S
   wₖ₋₂       :: S
   wₖ₋₁       :: S
+  t          :: S
+  s          :: S
   warm_start :: Bool
   stats      :: SimpleStats{T}
 end
@@ -1069,8 +1075,10 @@ function QmrSolver(m, n, S)
   x    = S(undef, n)
   wₖ₋₂ = S(undef, n)
   wₖ₋₁ = S(undef, n)
+  t    = S(undef, 0)
+  s    = S(undef, 0)
   stats = SimpleStats(0, false, false, T[], T[], T[], 0.0, "unknown")
-  solver = QmrSolver{T,FC,S}(m, n, uₖ₋₁, uₖ, q, vₖ₋₁, vₖ, p, Δx, x, wₖ₋₂, wₖ₋₁, false, stats)
+  solver = QmrSolver{T,FC,S}(m, n, uₖ₋₁, uₖ, q, vₖ₋₁, vₖ, p, Δx, x, wₖ₋₂, wₖ₋₁, t, s, false, stats)
   return solver
 end
 
diff --git a/src/qmr.jl b/src/qmr.jl
index 995392f0c..5f053571d 100644
--- a/src/qmr.jl
+++ b/src/qmr.jl
@@ -22,7 +22,7 @@ export qmr, qmr!
 
 """
     (x, stats) = qmr(A, b::AbstractVector{FC};
-                     c::AbstractVector{FC}=b, atol::T=√eps(T),
+                     c::AbstractVector{FC}=b, M=I, N=I, ldiv::Bool=false, atol::T=√eps(T),
                      rtol::T=√eps(T), itmax::Int=0, timemax::Float64=Inf, verbose::Int=0,
                      history::Bool=false, callback=solver->false, iostream::IO=kstdout)
 
@@ -38,6 +38,7 @@ Solve the square linear system Ax = b of size n using QMR.
 QMR is based on the Lanczos biorthogonalization process and requires two initial vectors `b` and `c`.
 The relation `bᴴc ≠ 0` must be satisfied and by default `c = b`.
 When `A` is Hermitian and `b = c`, QMR is equivalent to MINRES.
+QMR requires support for `adjoint(M)` and `adjoint(N)` if preconditioners are provided.
 
 #### Input arguments
 
@@ -51,6 +52,9 @@ When `A` is Hermitian and `b = c`, QMR is equivalent to MINRES.
 #### Keyword arguments
 
 * `c`: the second initial vector of length `n` required by the Lanczos biorthogonalization process;
+* `M`: linear operator that models a nonsingular matrix of size `n` used for left preconditioning;
+* `N`: linear operator that models a nonsingular matrix of size `n` used for right preconditioning;
+* `ldiv`: define whether the preconditioners use `ldiv!` or `mul!`;
 * `atol`: absolute stopping tolerance based on the residual norm;
 * `rtol`: relative stopping tolerance based on the residual norm;
 * `itmax`: the maximum number of iterations. If `itmax=0`, the default number of iterations is set to `2n`;
@@ -89,6 +93,9 @@ def_args_qmr = (:(A                    ),
 def_optargs_qmr = (:(x0::AbstractVector),)
 
 def_kwargs_qmr = (:(; c::AbstractVector{FC} = b ),
+                  :(; M = I                     ),
+                  :(; N = I                     ),
+                  :(; ldiv::Bool = false        ),
                   :(; atol::T = √eps(T)         ),
                   :(; rtol::T = √eps(T)         ),
                   :(; itmax::Int = 0            ),
@@ -102,7 +109,7 @@ def_kwargs_qmr = mapreduce(extract_parameters, vcat, def_kwargs_qmr)
 
 args_qmr = (:A, :b)
 optargs_qmr = (:x0,)
-kwargs_qmr = (:c, :atol, :rtol, :itmax, :timemax, :verbose, :history, :callback, :iostream)
+kwargs_qmr = (:c, :M, :N, :ldiv, :atol, :rtol, :itmax, :timemax, :verbose, :history, :callback, :iostream)
 
 @eval begin
   function qmr($(def_args_qmr...), $(def_optargs_qmr...); $(def_kwargs_qmr...)) where {T <: AbstractFloat, FC <: FloatOrComplex{T}}
@@ -138,26 +145,42 @@ kwargs_qmr = (:c, :atol, :rtol, :itmax, :timemax, :verbose, :history, :callback,
     length(b) == m || error("Inconsistent problem size")
     (verbose > 0) && @printf(iostream, "QMR: system of size %d\n", n)
 
+    # Check M = Iₙ and N = Iₙ
+    MisI = (M === I)
+    NisI = (N === I)
+
     # Check type consistency
     eltype(A) == FC || @warn "eltype(A) ≠ $FC. This could lead to errors or additional allocations in operator-vector products."
     ktypeof(b) <: S || error("ktypeof(b) is not a subtype of $S")
     ktypeof(c) <: S || error("ktypeof(c) is not a subtype of $S")
 
-    # Compute the adjoint of A
+    # Compute the adjoint of A, M and N
     Aᴴ = A'
+    Mᴴ = M'
+    Nᴴ = N'
 
     # Set up workspace.
+    allocate_if(!MisI, solver, :t, S, n)
+    allocate_if(!NisI, solver, :s, S, n)
     uₖ₋₁, uₖ, q, vₖ₋₁, vₖ, p = solver.uₖ₋₁, solver.uₖ, solver.q, solver.vₖ₋₁, solver.vₖ, solver.p
     Δx, x, wₖ₋₂, wₖ₋₁, stats = solver.Δx, solver.x, solver.wₖ₋₂, solver.wₖ₋₁, solver.stats
     warm_start = solver.warm_start
     rNorms = stats.residuals
     reset!(stats)
     r₀ = warm_start ? q : b
+    Mᴴuₖ = MisI ? uₖ : solver.t
+    t = MisI ? q : solver.t
+    Nvₖ = NisI ? vₖ : solver.s
+    s = NisI ? p : solver.s
 
     if warm_start
       mul!(r₀, A, Δx)
       @kaxpby!(n, one(FC), b, -one(FC), r₀)
     end
+    if !MisI
+      mul!(p, M, r₀)
+      r₀ = p
+    end
 
     # Initial solution x₀ and residual norm ‖r₀‖.
     x .= zero(FC)
@@ -177,10 +200,6 @@ kwargs_qmr = (:c, :atol, :rtol, :itmax, :timemax, :verbose, :history, :callback,
     iter = 0
     itmax == 0 && (itmax = 2*n)
 
-    ε = atol + rtol * rNorm
-    (verbose > 0) && @printf(iostream, "%5s  %7s  %5s\n", "k", "‖rₖ‖", "timer")
-    kdisplay(iter, verbose) && @printf(iostream, "%5d  %7.1e  %.2fs\n", iter, rNorm, ktimer(start_time))
-
     # Initialize the Lanczos biorthogonalization process.
     cᴴb = @kdot(n, c, r₀)  # ⟨c,r₀⟩
     if cᴴb == 0
@@ -193,6 +212,10 @@ kwargs_qmr = (:c, :atol, :rtol, :itmax, :timemax, :verbose, :history, :callback,
       return solver
     end
 
+    ε = atol + rtol * rNorm
+    (verbose > 0) && @printf(iostream, "%5s  %8s  %7s  %5s\n", "k", "αₖ", "‖rₖ‖", "timer")
+    kdisplay(iter, verbose) && @printf(iostream, "%5d  %8.1e  %7.1e  %.2fs\n", iter, cᴴb, rNorm, ktimer(start_time))
+
     βₖ = √(abs(cᴴb))             # β₁γ₁ = cᴴ(b - Ax₀)
     γₖ = cᴴb / βₖ                # β₁γ₁ = cᴴ(b - Ax₀)
     vₖ₋₁ .= zero(FC)             # v₀ = 0
@@ -219,23 +242,30 @@ kwargs_qmr = (:c, :atol, :rtol, :itmax, :timemax, :verbose, :history, :callback,
       iter = iter + 1
 
       # Continue the Lanczos biorthogonalization process.
-      # AVₖ  = VₖTₖ    + βₖ₊₁vₖ₊₁(eₖ)ᵀ = Vₖ₊₁Tₖ₊₁.ₖ
-      # AᴴUₖ = Uₖ(Tₖ)ᴴ + γ̄ₖ₊₁uₖ₊₁(eₖ)ᵀ = Uₖ₊₁(Tₖ.ₖ₊₁)ᴴ
+      # MANVₖ    = VₖTₖ    + βₖ₊₁vₖ₊₁(eₖ)ᵀ = Vₖ₊₁Tₖ₊₁.ₖ
+      # NᴴAᴴMᴴUₖ = Uₖ(Tₖ)ᴴ + γ̄ₖ₊₁uₖ₊₁(eₖ)ᵀ = Uₖ₊₁(Tₖ.ₖ₊₁)ᴴ
 
-      mul!(q, A , vₖ)  # Forms vₖ₊₁ : q ← Avₖ
-      mul!(p, Aᴴ, uₖ)  # Forms uₖ₊₁ : p ← Aᴴuₖ
+      # Forms vₖ₊₁ : q ← MANvₖ
+      NisI || mulorldiv!(Nvₖ, N, vₖ, ldiv)
+      mul!(t, A, Nvₖ)
+      MisI || mulorldiv!(q, M, t, ldiv)
+
+      # Forms uₖ₊₁ : p ← NᴴAᴴMᴴuₖ
+      MisI || mulorldiv!(Mᴴuₖ, Mᴴ, uₖ, ldiv)
+      mul!(s, Aᴴ, Mᴴuₖ)
+      NisI || mulorldiv!(p, Nᴴ, s, ldiv)
 
       @kaxpy!(n, -γₖ, vₖ₋₁, q)  # q ← q - γₖ * vₖ₋₁
       @kaxpy!(n, -βₖ, uₖ₋₁, p)  # p ← p - β̄ₖ * uₖ₋₁
 
-      αₖ = @kdot(n, uₖ, q)      # αₖ = ⟨uₖ,q⟩
+      αₖ = @kdot(n, uₖ, q)  # αₖ = ⟨uₖ,q⟩
 
-      @kaxpy!(n, -     αₖ , vₖ, q)    # q ← q - αₖ * vₖ
-      @kaxpy!(n, -conj(αₖ), uₖ, p)    # p ← p - ᾱₖ * uₖ
+      @kaxpy!(n, -     αₖ , vₖ, q)  # q ← q - αₖ * vₖ
+      @kaxpy!(n, -conj(αₖ), uₖ, p)  # p ← p - ᾱₖ * uₖ
 
-      pᴴq = @kdot(n, p, q)      # pᴴq  = ⟨p,q⟩
-      βₖ₊₁ = √(abs(pᴴq))        # βₖ₊₁ = √(|pᴴq|)
-      γₖ₊₁ = pᴴq / βₖ₊₁         # γₖ₊₁ = pᴴq / βₖ₊₁
+      pᴴq = @kdot(n, p, q)  # pᴴq  = ⟨p,q⟩
+      βₖ₊₁ = √(abs(pᴴq))    # βₖ₊₁ = √(|pᴴq|)
+      γₖ₊₁ = pᴴq / βₖ₊₁     # γₖ₊₁ = pᴴq / βₖ₊₁
 
       # Update the QR factorization of Tₖ₊₁.ₖ = Qₖ [ Rₖ ].
       #                                            [ Oᵀ ]
@@ -357,7 +387,7 @@ kwargs_qmr = (:c, :atol, :rtol, :itmax, :timemax, :verbose, :history, :callback,
       breakdown = !solved && (pᴴq == 0)
       timer = time_ns() - start_time
       overtimed = timer > timemax_ns
-      kdisplay(iter, verbose) && @printf(iostream, "%5d  %7.1e  %.2fs\n", iter, rNorm, ktimer(start_time))
+      kdisplay(iter, verbose) && @printf(iostream, "%5d  %8.1e  %7.1e  %.2fs\n", iter, αₖ, rNorm, ktimer(start_time))
     end
     (verbose > 0) && @printf(iostream, "\n")
 
@@ -369,6 +399,10 @@ kwargs_qmr = (:c, :atol, :rtol, :itmax, :timemax, :verbose, :history, :callback,
     overtimed           && (status = "time limit exceeded")
 
     # Update x
+    if !NisI
+      copyto!(solver.s, x)
+      mulorldiv!(x, N, solver.s, ldiv)
+    end
     warm_start && @kaxpy!(n, one(FC), Δx, x)
     solver.warm_start = false
 
diff --git a/src/usymlq.jl b/src/usymlq.jl
index b80f0a622..20fe5809d 100644
--- a/src/usymlq.jl
+++ b/src/usymlq.jl
@@ -38,7 +38,8 @@ USYMLQ determines the least-norm solution of the consistent linear system Ax = b
 USYMLQ is based on the orthogonal tridiagonalization process and requires two initial nonzero vectors `b` and `c`.
 The vector `c` is only used to initialize the process and a default value can be `b` or `Aᴴb` depending on the shape of `A`.
 The error norm ‖x - x*‖ monotonously decreases in USYMLQ.
-It's considered as a generalization of SYMMLQ.
+When `A` is Hermitian and `b = c`, USYMLQ is equivalent to SYMMLQ.
+USYMLQ is considered as a generalization of SYMMLQ.
 
 It can also be applied to under-determined and over-determined problems.
 In all cases, problems must be consistent.
diff --git a/src/usymqr.jl b/src/usymqr.jl
index 0aae23335..8ba7203bc 100644
--- a/src/usymqr.jl
+++ b/src/usymqr.jl
@@ -38,7 +38,8 @@ USYMQR solves Ax = b if it is consistent.
 USYMQR is based on the orthogonal tridiagonalization process and requires two initial nonzero vectors `b` and `c`.
 The vector `c` is only used to initialize the process and a default value can be `b` or `Aᴴb` depending on the shape of `A`.
 The residual norm ‖b - Ax‖ monotonously decreases in USYMQR.
-It's considered as a generalization of MINRES.
+When `A` is Hermitian and `b = c`, QMR is equivalent to MINRES.
+USYMQR is considered as a generalization of MINRES.
 
 It can also be applied to under-determined and over-determined problems.
 USYMQR finds the minimum-norm solution if problems are inconsistent.
diff --git a/test/test_bicgstab.jl b/test/test_bicgstab.jl
index 6817acf3d..bde109fdd 100644
--- a/test/test_bicgstab.jl
+++ b/test/test_bicgstab.jl
@@ -62,7 +62,7 @@
       A, b, M = square_preconditioned(FC=FC)
       (x, stats) = bicgstab(A, b, M=M)
       r = b - A * x
-      resid = norm(r) / norm(b)
+      resid = norm(M * r) / norm(M * b)
       @test(resid ≤ bicgstab_tol)
       @test(stats.solved)
 
@@ -78,7 +78,7 @@
       A, b, M, N = two_preconditioners(500, 32)
       (x, stats) = bicgstab(A, b, M=M, N=N)
       r = b - A * x
-      resid = norm(r) / norm(b)
+      resid = norm(M * r) / norm(M * b)
       @test(resid ≤ bicgstab_tol)
       @test(stats.solved)
 
diff --git a/test/test_bilq.jl b/test/test_bilq.jl
index 40b9872db..672097187 100644
--- a/test/test_bilq.jl
+++ b/test/test_bilq.jl
@@ -66,11 +66,34 @@
       @test(resid ≤ bilq_tol)
       @test(stats.solved)
 
+      # Left preconditioning
+      A, b, M = square_preconditioned(FC=FC)
+      (x, stats) = bilq(A, b, M=M)
+      r = b - A * x
+      resid = norm(M * r) / norm(M * b)
+      @test(resid ≤ bilq_tol)
+      @test(stats.solved)
+
+      # Right preconditioning
+      A, b, N = square_preconditioned(FC=FC)
+      (x, stats) = bilq(A, b, N=N)
+      r = b - A * x
+      resid = norm(r) / norm(b)
+      @test(resid ≤ bilq_tol)
+      @test(stats.solved)
+
+      # Split preconditioning
+      A, b, M, N = two_preconditioners(FC=FC)
+      (x, stats) = bilq(A, b, M=M, N=N)
+      r = b - A * x
+      resid = norm(M * r) / norm(M * b)
+      @test(resid ≤ bilq_tol)
+      @test(stats.solved)
+
       # Test bᴴc == 0
       A, b, c = bc_breakdown(FC=FC)
       (x, stats) = bilq(A, b, c=c)
       @test stats.status == "Breakdown bᴴc = 0"
-
       
       # test callback function
       A, b = polar_poisson(FC=FC)
diff --git a/test/test_cgs.jl b/test/test_cgs.jl
index 832cd76c3..e4aaca4ef 100644
--- a/test/test_cgs.jl
+++ b/test/test_cgs.jl
@@ -62,7 +62,7 @@
       A, b, N = square_preconditioned(FC=FC)
       (x, stats) = cgs(A, b, N=N)
       r = b - A * x
-      resid = norm(r) / norm(b)
+      resid = norm(M * r) / norm(M * b)
       @test(resid ≤ cgs_tol)
       @test(stats.solved)
 
@@ -70,7 +70,7 @@
       A, b, M, N = two_preconditioners(FC=FC)
       (x, stats) = cgs(A, b, M=M, N=N)
       r = b - A * x
-      resid = norm(r) / norm(b)
+      resid = norm(M * r) / norm(M * b)
       @test(resid ≤ cgs_tol)
       @test(stats.solved)
 
diff --git a/test/test_qmr.jl b/test/test_qmr.jl
index 4a6b8c1c9..91fc77dd0 100644
--- a/test/test_qmr.jl
+++ b/test/test_qmr.jl
@@ -58,6 +58,30 @@
       @test(resid ≤ qmr_tol)
       @test(stats.solved)
 
+      # Left preconditioning
+      A, b, M = square_preconditioned(FC=FC)
+      (x, stats) = qmr(A, b, M=M)
+      r = b - A * x
+      resid = norm(M * r) / norm(M * b)
+      @test(resid ≤ qmr_tol)
+      @test(stats.solved)
+
+      # Right preconditioning
+      A, b, N = square_preconditioned(FC=FC)
+      (x, stats) = qmr(A, b, N=N)
+      r = b - A * x
+      resid = norm(r) / norm(b)
+      @test(resid ≤ qmr_tol)
+      @test(stats.solved)
+
+      # Split preconditioning
+      A, b, M, N = two_preconditioners(FC=FC)
+      (x, stats) = qmr(A, b, M=M, N=N)
+      r = b - A * x
+      resid = norm(M * r) / norm(M * b)
+      @test(resid ≤ qmr_tol)
+      @test(stats.solved)
+
       # Test bᴴc == 0
       A, b, c = bc_breakdown(FC=FC)
       (x, stats) = qmr(A, b, c=c)