diff --git a/docs/src/gpu.md b/docs/src/gpu.md
index 5664c1d8f..cf5c80003 100644
--- a/docs/src/gpu.md
+++ b/docs/src/gpu.md
@@ -52,7 +52,7 @@ if CUDA.functional()
 end
 ```
 
-If you use a Krylov method that only requires `A * v` products (see @factorization-free), the most efficient format is `CuSparseMatrixCSR`.
+If you use a Krylov method that only requires `A * v` products (see [here](@ref factorization-free)), the most efficient format is `CuSparseMatrixCSR`.
 Optimized operator-vector products that exploit GPU features can also be used by means of linear operators.
 
 Preconditioners, especially incomplete Cholesky or incomplete LU factorizations that involve triangular solves,
diff --git a/test/gpu/nvidia.jl b/test/gpu/nvidia.jl
index 1e202be34..f8f3af1f9 100644
--- a/test/gpu/nvidia.jl
+++ b/test/gpu/nvidia.jl
@@ -21,9 +21,18 @@ include("gpu.jl")
   A_coo_gpu = CuSparseMatrixCOO(A_cpu)
   b_gpu = CuVector(b_cpu)
 
-  x_csc, stats_csc = lslq(A_csc_gpu, b_gpu, verbose=1)
-  x_csr, stats_csr = lsqr(A_csr_gpu, b_gpu, verbose=1)
-  x_coo, stats_coo = lsmr(A_coo_gpu, b_gpu, verbose=1)
+  x_csc, stats_csc = lslq(A_csc_gpu, b_gpu)
+  Aᴴr_csc = A_csc_gpu' * (b_gpu - A_csc_gpu * x_csc)
+  println("case 1")
+  println(norm(Aᴴr_csc))
+  x_csr, stats_csr = lsqr(A_csr_gpu, b_gpu)
+  Aᴴr_csr = A_csr_gpu' * (b_gpu - A_csr_gpu * x_csr)
+  println("case 2")
+  println(norm(Aᴴr_csr))
+  x_coo, stats_coo = lsmr(A_coo_gpu, b_gpu)
+  Aᴴr_coo = A_coo_gpu' * (b_gpu - A_coo_gpu * x_coo)
+  println("case 3")
+  println(norm(Aᴴr_coo))
 
   @testset "ic0" begin
     A_cpu, b_cpu = sparse_laplacian()
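
For reference, here is a minimal standalone sketch of the workflow the updated test exercises: build a least-squares problem on the CPU, move it to the GPU in the `CuSparseMatrixCSR` format recommended by the documentation change, solve with LSQR, and inspect the optimality residual `Aᴴr`. It assumes CUDA.jl (with its `CUDA.CUSPARSE` submodule) and Krylov.jl are installed; the problem size, density, and data are illustrative, not taken from the test suite.

```julia
using CUDA, CUDA.CUSPARSE
using Krylov, LinearAlgebra, SparseArrays

if CUDA.functional()
  # Build a small rectangular least-squares problem on the CPU.
  # Dimensions and density are illustrative only.
  m, n = 200, 100
  A_cpu = sprand(m, n, 0.1)
  b_cpu = rand(m)

  # CSR is the most efficient CUSPARSE format when the solver only
  # needs A * v (and Aᴴ * u) products.
  A_gpu = CuSparseMatrixCSR(A_cpu)
  b_gpu = CuVector(b_cpu)

  # Solve min ‖b - Ax‖₂ with LSQR; at a least-squares solution Aᴴr ≈ 0,
  # so the printed norm should be small at convergence.
  x, stats = lsqr(A_gpu, b_gpu)
  r = b_gpu - A_gpu * x
  println(norm(A_gpu' * r))
end
```

The `CuSparseMatrixCSC` and `CuSparseMatrixCOO` variants follow the same pattern, as the updated test above shows.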