diff --git a/docs/make.jl b/docs/make.jl index 132b8bbd2..0595a9398 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -12,6 +12,7 @@ makedocs( pages = ["Home" => "index.md", "API" => "api.md", "Krylov processes" => "processes.md", + "Block Krylov processes" => "block_processes.md", "Krylov methods" => ["Hermitian positive definite linear systems" => "solvers/spd.md", "Hermitian indefinite linear systems" => "solvers/sid.md", "Non-Hermitian square linear systems" => "solvers/unsymmetric.md", diff --git a/docs/src/block_processes.md b/docs/src/block_processes.md new file mode 100644 index 000000000..d7977c7f3 --- /dev/null +++ b/docs/src/block_processes.md @@ -0,0 +1,215 @@ +# [Block Krylov processes](@id block-krylov-processes) + +## [Block Hermitian Lanczos](@id block-hermitian-lanczos) + +If the vector $b$ in the [Hermitian Lanczos](@ref hermitian-lanczos) process is replaced by a matrix $B$ with $p$ columns, we can derive the block Hermitian Lanczos process. + +![block_hermitian_lanczos](./graphics/block_hermitian_lanczos.png) + +After $k$ iterations of the block Hermitian Lanczos process, the situation may be summarized as +```math +\begin{align*} + A V_k &= V_{k+1} T_{k+1,k}, \\ + V_k^H V_k &= I_{pk}, +\end{align*} +``` +where $V_k$ is an orthonormal basis of the block Krylov subspace $\mathcal{K}_k^{\square}(A,B)$, +```math +T_{k+1,k} = +\begin{bmatrix} + \Omega_1 & \Psi_2^H & & \\ + \Psi_2 & \Omega_2 & \ddots & \\ + & \ddots & \ddots & \Psi_k^H \\ + & & \Psi_k & \Omega_k \\ + & & & \Psi_{k+1} +\end{bmatrix}. +``` + +```@docs +hermitian_lanczos(::Any, ::AbstractMatrix{FC}, ::Int) where FC <: (Union{Complex{T}, T} where T <: AbstractFloat) +``` + +## [Block Non-Hermitian Lanczos](@id block-nonhermitian-lanczos) + +If the vectors $b$ and $c$ in the [non-Hermitian Lanczos](@ref nonhermitian-lanczos) process are replaced by matrices $B$ and $C$ with both $p$ columns, we can derive the block non-Hermitian Lanczos process. + +![block_nonhermitian_lanczos](./graphics/block_nonhermitian_lanczos.png) + +After $k$ iterations of the block non-Hermitian Lanczos process, the situation may be summarized as +```math +\begin{align*} + A V_k &= V_{k+1} T_{k+1,k}, \\ + A^H U_k &= U_{k+1} T_{k,k+1}^H, \\ + V_k^H U_k &= U_k^H V_k = I_{pk}, +\end{align*} +``` +where $V_k$ and $U_k$ are bases of the block Krylov subspaces $\mathcal{K}^{\square}_k(A,B)$ and $\mathcal{K}^{\square}_k (A^H,C)$, respectively, +```math +T_{k+1,k} = +\begin{bmatrix} + \Omega_1 & \Phi_2 & & \\ + \Psi_2 & \Omega_2 & \ddots & \\ + & \ddots & \ddots & \Phi_k \\ + & & \Psi_k & \Omega_k \\ + & & & \Psi_{k+1} +\end{bmatrix} +, \qquad +T_{k,k+1}^H = +\begin{bmatrix} + \Omega_1^H & \Psi_2^H & & \\ + \Phi_2^H & \Omega_2^H & \ddots & \\ + & \ddots & \ddots & \Psi_k^H \\ + & & \Phi_k^H & \Omega_k^H \\ + & & & \Phi_{k+1}^H +\end{bmatrix}. +``` + +```@docs +nonhermitian_lanczos(::Any, ::AbstractMatrix{FC}, ::AbstractMatrix{FC}, ::Int) where FC <: (Union{Complex{T}, T} where T <: AbstractFloat) +``` + +## [Block Arnoldi](@id block-arnoldi) + +If the vector $b$ in the [Arnoldi](@ref arnoldi) process is replaced by a matrix $B$ with $p$ columns, we can derive the block Arnoldi process. + +![block_arnoldi](./graphics/block_arnoldi.png) + +After $k$ iterations of the block Arnoldi process, the situation may be summarized as +```math +\begin{align*} + A V_k &= V_{k+1} H_{k+1,k}, \\ + V_k^H V_k &= I_{pk}, +\end{align*} +``` +where $V_k$ is an orthonormal basis of the block Krylov subspace $\mathcal{K}_k^{\square}(A,B)$, +```math +H_{k+1,k} = +\begin{bmatrix} + \Psi_{1,1}~ & \Psi_{1,2}~ & \ldots & \Psi_{1,k} \\ + \Psi_{2,1}~ & \ddots~ & \ddots & \vdots \\ + & \ddots~ & \ddots & \Psi_{k-1,k} \\ + & & \Psi_{k,k-1} & \Psi_{k,k} \\ + & & & \Psi_{k+1,k} +\end{bmatrix}. +``` + +```@docs +arnoldi(::Any, ::AbstractMatrix{FC}, ::Int) where FC <: (Union{Complex{T}, T} where T <: AbstractFloat) +``` + +## [Block Golub-Kahan](@id block-golub-kahan) + +If the vector $b$ in the [Golub-Kahan](@ref golub-kahan) process is replaced by a matrix $B$ with $p$ columns, we can derive the block Golub-Kahan process. + +![block_golub_kahan](./graphics/block_golub_kahan.png) + +After $k$ iterations of the block Golub-Kahan process, the situation may be summarized as +```math +\begin{align*} + A V_k &= U_{k+1} B_k, \\ + A^H U_{k+1} &= V_{k+1} L_{k+1}^H, \\ + V_k^H V_k &= U_k^H U_k = I_{pk}, +\end{align*} +``` +where $V_k$ and $U_k$ are bases of the block Krylov subspaces $\mathcal{K}_k^{\square}(A^HA,A^HB)$ and $\mathcal{K}_k^{\square}(AA^H,B)$, respectively, +```math +B_k = +\begin{bmatrix} + \Omega_1 & & & \\ + \Psi_2 & \Omega_2 & & \\ + & \ddots & \ddots & \\ + & & \Psi_k & \Omega_k \\ + & & & \Psi_{k+1} \\ +\end{bmatrix} +, \qquad +L_{k+1}^H = +\begin{bmatrix} + \Omega_1^H & \Psi_2^H & & & \\ + & \Omega_2^H & \ddots & & \\ + & & \ddots & \Psi_k^H & \\ + & & & \Omega_k^H & \Psi_{k+1}^H \\ + & & & & \Omega_{k+1}^H \\ +\end{bmatrix}. +``` + +```@docs +golub_kahan(::Any, ::AbstractMatrix{FC}, ::Int) where FC <: (Union{Complex{T}, T} where T <: AbstractFloat) +``` + +## [Block Saunders-Simon-Yip](@id block-saunders-simon-yip) + +If the vectors $b$ and $c$ in the [Saunders-Simon-Yip](@ref saunders-simon-yip) process are replaced by matrices $B$ and $C$ with both $p$ columns, we can derive the block Saunders-Simon-Yip process. + +![block_saunders_simon_yip](./graphics/block_saunders_simon_yip.png) + +After $k$ iterations of the block Saunders-Simon-Yip process, the situation may be summarized as +```math +\begin{align*} + A U_k &= V_{k+1} T_{k+1,k}, \\ + A^H V_k &= U_{k+1} T_{k,k+1}^H, \\ + V_k^H V_k &= U_k^H U_k = I_{pk}, +\end{align*} +``` +where $\begin{bmatrix} V_k & 0 \\ 0 & U_k \end{bmatrix}$ is an orthonormal basis of the block Krylov subspace $\mathcal{K}^{\square}_k \left(\begin{bmatrix} 0 & A \\ A^H & 0 \end{bmatrix}, \begin{bmatrix} B & 0 \\ 0 & C \end{bmatrix}\right)$, +```math +T_{k+1,k} = +\begin{bmatrix} + \Omega_1 & \Phi_2 & & \\ + \Psi_2 & \Omega_2 & \ddots & \\ + & \ddots & \ddots & \Phi_k \\ + & & \Psi_k & \Omega_k \\ + & & & \Psi_{k+1} +\end{bmatrix} +, \qquad +T_{k,k+1}^H = +\begin{bmatrix} + \Omega_1^H & \Psi_2^H & & \\ + \Phi_2^H & \Omega_2^H & \ddots & \\ + & \ddots & \ddots & \Psi_k^H \\ + & & \Phi_k^H & \Omega_k^H \\ + & & & \Phi_{k+1}^H +\end{bmatrix}. +``` + +```@docs +saunders_simon_yip(::Any, ::AbstractMatrix{FC}, ::AbstractMatrix{FC}, ::Int) where FC <: (Union{Complex{T}, T} where T <: AbstractFloat) +``` + +## [Block Montoison-Orban](@id block-montoison-orban) + +If the vectors $b$ and $c$ in the [Montoison-Orban](@ref montoison-orban) process are replaced by matrices $D$ and $C$ with both $p$ columns, we can derive the block Montoison-Orban process. + +![block_montoison_orban](./graphics/block_montoison_orban.png) + +After $k$ iterations of the block Montoison-Orban process, the situation may be summarized as +```math +\begin{align*} + A U_k &= V_{k+1} H_{k+1,k}, \\ + B V_k &= U_{k+1} F_{k+1,k}, \\ + V_k^H V_k &= U_k^H U_k = I_{pk}, +\end{align*} +``` +where $\begin{bmatrix} V_k & 0 \\ 0 & U_k \end{bmatrix}$ is an orthonormal basis of the block Krylov subspace $\mathcal{K}^{\square}_k \left(\begin{bmatrix} 0 & A \\ B & 0 \end{bmatrix}, \begin{bmatrix} D & 0 \\ 0 & C \end{bmatrix}\right)$, +```math +H_{k+1,k} = +\begin{bmatrix} + \Psi_{1,1}~ & \Psi_{1,2}~ & \ldots & \Psi_{1,k} \\ + \Psi_{2,1}~ & \ddots~ & \ddots & \vdots \\ + & \ddots~ & \ddots & \Psi_{k-1,k} \\ + & & \Psi_{k,k-1} & \Psi_{k,k} \\ + & & & \Psi_{k+1,k} +\end{bmatrix} +, \qquad +F_{k+1,k} = +\begin{bmatrix} + \Phi_{1,1}~ & \Phi_{1,2}~ & \ldots & \Phi_{1,k} \\ + \Phi_{2,1}~ & \ddots~ & \ddots & \vdots \\ + & \ddots~ & \ddots & \Phi_{k-1,k} \\ + & & \Phi_{k,k-1} & \Phi_{k,k} \\ + & & & \Phi_{k+1,k} +\end{bmatrix}. +``` + +```@docs +montoison_orban(::Any, ::Any, ::AbstractMatrix{FC}, ::AbstractMatrix{FC}, ::Int) where FC <: (Union{Complex{T}, T} where T <: AbstractFloat) +``` diff --git a/docs/src/graphics/block_arnoldi.png b/docs/src/graphics/block_arnoldi.png new file mode 100644 index 000000000..5248046ea Binary files /dev/null and b/docs/src/graphics/block_arnoldi.png differ diff --git a/docs/src/graphics/block_golub_kahan.png b/docs/src/graphics/block_golub_kahan.png new file mode 100644 index 000000000..e749edc54 Binary files /dev/null and b/docs/src/graphics/block_golub_kahan.png differ diff --git a/docs/src/graphics/block_hermitian_lanczos.png b/docs/src/graphics/block_hermitian_lanczos.png new file mode 100644 index 000000000..d16647a84 Binary files /dev/null and b/docs/src/graphics/block_hermitian_lanczos.png differ diff --git a/docs/src/graphics/block_montoison_orban.png b/docs/src/graphics/block_montoison_orban.png new file mode 100644 index 000000000..48d4d754c Binary files /dev/null and b/docs/src/graphics/block_montoison_orban.png differ diff --git a/docs/src/graphics/block_nonhermitian_lanczos.png b/docs/src/graphics/block_nonhermitian_lanczos.png new file mode 100644 index 000000000..a18052224 Binary files /dev/null and b/docs/src/graphics/block_nonhermitian_lanczos.png differ diff --git a/docs/src/graphics/block_saunders_simon_yip.png b/docs/src/graphics/block_saunders_simon_yip.png new file mode 100644 index 000000000..96dcc4615 Binary files /dev/null and b/docs/src/graphics/block_saunders_simon_yip.png differ diff --git a/docs/src/processes.md b/docs/src/processes.md index 2da69d04e..eb72d6fd1 100644 --- a/docs/src/processes.md +++ b/docs/src/processes.md @@ -60,18 +60,18 @@ For matrices $C \in \mathbb{C}^{n \times n} \enspace$ and $\enspace T \in \mathb \left\{\sum_{i=0}^{k-1} C^i T \, \Omega_i \, \middle \vert \, \Omega_i \in \mathbb{C}^{p \times p},~0 \le i \le k-1 \right\}. ``` -## Hermitian Lanczos +## [Hermitian Lanczos](@id hermitian-lanczos) ![hermitian_lanczos](./graphics/hermitian_lanczos.png) After $k$ iterations of the Hermitian Lanczos process, the situation may be summarized as ```math \begin{align*} - A V_k &= V_k T_k + \beta_{k+1,k} v_{k+1} e_k^T = V_{k+1} T_{k+1,k}, \\ + A V_k &= V_k T_k + \beta_{k+1,k} v_{k+1} e_k^T = V_{k+1} T_{k+1,k}, \\ V_k^H V_k &= I_k, \end{align*} ``` -where $V_k$ is an orthonormal basis of the Krylov subspace $\mathcal{K}_k (A,b)$, +where $V_k$ is an orthonormal basis of the Krylov subspace $\mathcal{K}_k(A,b)$, ```math T_k = \begin{bmatrix} @@ -94,17 +94,17 @@ The function [`hermitian_lanczos`](@ref hermitian_lanczos) returns $V_{k+1}$ and Related methods: [`SYMMLQ`](@ref symmlq), [`CG`](@ref cg), [`CR`](@ref cr), [`CAR`](@ref car), [`MINRES`](@ref minres), [`MINRES-QLP`](@ref minres_qlp), [`MINARES`](@ref minares), [`CGLS`](@ref cgls), [`CRLS`](@ref crls), [`CGNE`](@ref cgne), [`CRMR`](@ref crmr), [`CG-LANCZOS`](@ref cg_lanczos) and [`CG-LANCZOS-SHIFT`](@ref cg_lanczos_shift). ```@docs -hermitian_lanczos +hermitian_lanczos(::Any, ::AbstractVector{FC}, ::Int) where FC <: (Union{Complex{T}, T} where T <: AbstractFloat) ``` -## Non-Hermitian Lanczos +## [Non-Hermitian Lanczos](@id nonhermitian-lanczos) ![nonhermitian_lanczos](./graphics/nonhermitian_lanczos.png) After $k$ iterations of the non-Hermitian Lanczos process (also named the Lanczos biorthogonalization process), the situation may be summarized as ```math \begin{align*} - A V_k &= V_k T_k + \beta_{k+1} v_{k+1} e_k^T = V_{k+1} T_{k+1,k}, \\ + A V_k &= V_k T_k + \beta_{k+1} v_{k+1} e_k^T = V_{k+1} T_{k+1,k}, \\ A^H U_k &= U_k T_k^H + \bar{\gamma}_{k+1} u_{k+1} e_k^T = U_{k+1} T_{k,k+1}^H, \\ V_k^H U_k &= U_k^H V_k = I_k, \end{align*} @@ -140,7 +140,7 @@ Related methods: [`BiLQ`](@ref bilq), [`QMR`](@ref qmr), [`BiLQR`](@ref bilqr), With these scaling factors, the non-Hermitian Lanczos process coincides with the Hermitian Lanczos process when $A = A^H$ and $b = c$. ```@docs -nonhermitian_lanczos +nonhermitian_lanczos(::Any, ::AbstractVector{FC}, ::AbstractVector{FC}, ::Int) where FC <: (Union{Complex{T}, T} where T <: AbstractFloat) ``` The non-Hermitian Lanczos process can be also implemented without $A^H$ (transpose-free variant). @@ -169,7 +169,7 @@ Because $\alpha_k = u_k^H A v_k$ and $(\bar{\gamma}_{k+1} u_{k+1})^H (\beta_{k+1 \end{align*} ``` -## Arnoldi +## [Arnoldi](@id arnoldi) ![arnoldi](./graphics/arnoldi.png) @@ -205,17 +205,17 @@ Related methods: [`DIOM`](@ref diom), [`FOM`](@ref fom), [`DQGMRES`](@ref dqgmre The Arnoldi process coincides with the Hermitian Lanczos process when $A$ is Hermitian. ```@docs -arnoldi +arnoldi(::Any, ::AbstractVector{FC}, ::Int) where FC <: (Union{Complex{T}, T} where T <: AbstractFloat) ``` -## Golub-Kahan +## [Golub-Kahan](@id golub-kahan) ![golub_kahan](./graphics/golub_kahan.png) After $k$ iterations of the Golub-Kahan bidiagonalization process, the situation may be summarized as ```math \begin{align*} - A V_k &= U_{k+1} B_k, \\ + A V_k &= U_{k+1} B_k, \\ A^H U_{k+1} &= V_k B_k^H + \bar{\alpha}_{k+1} v_{k+1} e_{k+1}^T = V_{k+1} L_{k+1}^H, \\ V_k^H V_k &= U_k^H U_k = I_k, \end{align*} @@ -255,17 +255,17 @@ Related methods: [`LNLQ`](@ref lnlq), [`CRAIG`](@ref craig), [`CRAIGMR`](@ref cr It is also related to the Hermitian Lanczos process applied to $\begin{bmatrix} 0 & A \\ A^H & 0 \end{bmatrix}$ with initial vector $\begin{bmatrix} b \\ 0 \end{bmatrix}$. ```@docs -golub_kahan +golub_kahan(::Any, ::AbstractVector{FC}, ::Int) where FC <: (Union{Complex{T}, T} where T <: AbstractFloat) ``` -## Saunders-Simon-Yip +## [Saunders-Simon-Yip](@id saunders-simon-yip) ![saunders_simon_yip](./graphics/saunders_simon_yip.png) After $k$ iterations of the Saunders-Simon-Yip process (also named the orthogonal tridiagonalization process), the situation may be summarized as ```math \begin{align*} - A U_k &= V_k T_k + \beta_{k+1} v_{k+1} e_k^T = V_{k+1} T_{k+1,k}, \\ + A U_k &= V_k T_k + \beta_{k+1} v_{k+1} e_k^T = V_{k+1} T_{k+1,k}, \\ A^H V_k &= U_k T_k^H + \bar{\gamma}_{k+1} u_{k+1} e_k^T = U_{k+1} T_{k,k+1}^H, \\ V_k^H V_k &= U_k^H U_k = I_k, \end{align*} @@ -296,14 +296,14 @@ The function [`saunders_simon_yip`](@ref saunders_simon_yip) returns $V_{k+1}$, Related methods: [`USYMLQ`](@ref usymlq), [`USYMQR`](@ref usymqr), [`TriLQR`](@ref trilqr), [`TriCG`](@ref tricg) and [`TriMR`](@ref trimr). -```@docs -saunders_simon_yip -``` - !!! note The Saunders-Simon-Yip is equivalent to the block-Lanczos process applied to $\begin{bmatrix} 0 & A \\ A^H & 0 \end{bmatrix}$ with initial matrix $\begin{bmatrix} b & 0 \\ 0 & c \end{bmatrix}$. -## Montoison-Orban +```@docs +saunders_simon_yip(::Any, ::AbstractVector{FC}, ::AbstractVector{FC}, ::Int) where FC <: (Union{Complex{T}, T} where T <: AbstractFloat) +``` + +## [Montoison-Orban](@id montoison-orban) ![montoison_orban](./graphics/montoison_orban.png) @@ -356,5 +356,5 @@ Related methods: [`GPMR`](@ref gpmr). It also coincides with the Saunders-Simon-Yip process when $B = A^H$. ```@docs -montoison_orban +montoison_orban(::Any, ::Any, ::AbstractVector{FC}, ::AbstractVector{FC}, ::Int) where FC <: (Union{Complex{T}, T} where T <: AbstractFloat) ``` diff --git a/src/Krylov.jl b/src/Krylov.jl index 05069d14b..56ca58f38 100644 --- a/src/Krylov.jl +++ b/src/Krylov.jl @@ -5,7 +5,9 @@ using LinearAlgebra, SparseArrays, Printf include("krylov_utils.jl") include("krylov_stats.jl") include("krylov_solvers.jl") + include("krylov_processes.jl") +include("block_krylov_processes.jl") include("cg.jl") include("cr.jl") diff --git a/src/block_krylov_processes.jl b/src/block_krylov_processes.jl new file mode 100644 index 000000000..0aaefe179 --- /dev/null +++ b/src/block_krylov_processes.jl @@ -0,0 +1,611 @@ +# """ +# Gram-Schmidt orthogonalization for a reduced QR decomposition: +# Q, R = gs(A) +# +# Input : +# A an n-by-k matrix, n ≥ k +# +# Output : +# Q an n-by-k orthonormal matrix: QᴴQ = Iₖ +# R an k-by-k upper triangular matrix: QR = A +# """ +function gs(A::AbstractMatrix{FC}) where FC <: FloatOrComplex + n, k = size(A) + Q = copy(A) + R = zeros(FC, k, k) + v = zeros(FC, n) + gs!(Q, R, v) +end + +function gs!(Q::AbstractMatrix{FC}, R::AbstractMatrix{FC}, v::AbstractVector{FC}) where FC <: FloatOrComplex + n, k = size(Q) + aⱼ = v + for j = 1:k + qⱼ = view(Q,:,j) + aⱼ .= qⱼ + for i = 1:j-1 + qᵢ = view(Q,:,i) + R[i,j] = @kdot(n, qᵢ, aⱼ) # rᵢⱼ = ⟨qᵢ , aⱼ⟩ + @kaxpy!(n, -R[i,j], qᵢ, qⱼ) # qⱼ = qⱼ - rᵢⱼqᵢ + end + R[j,j] = @knrm2(n, qⱼ) # rⱼⱼ = ‖qⱼ‖ + qⱼ ./= R[j,j] # qⱼ = qⱼ / rⱼⱼ + end + return Q, R +end + +# """ +# Modified Gram-Schmidt orthogonalization for a reduced QR decomposition: +# Q, R = mgs(A) +# +# Input : +# A an n-by-k matrix, n ≥ k +# +# # Q an n-by-k orthonormal matrix: QᴴQ = Iₖ +# # R an k-by-k upper triangular matrix: QR = A +# """ +function mgs(A::AbstractMatrix{FC}) where FC <: FloatOrComplex + n, k = size(A) + Q = copy(A) + R = zeros(FC, k, k) + mgs!(Q, R) +end + +function mgs!(Q::AbstractMatrix{FC}, R::AbstractMatrix{FC}) where FC <: FloatOrComplex + n, k = size(Q) + for i = 1:k + qᵢ = view(Q,:,i) + R[i,i] = @knrm2(n, qᵢ) # rᵢᵢ = ‖qᵢ‖ + qᵢ ./= R[i,i] # qᵢ = qᵢ / rᵢᵢ + for j = i+1:k + qⱼ = view(Q,:,j) + R[i,j] = @kdot(n, qᵢ, qⱼ) # rᵢⱼ = ⟨qᵢ , qⱼ⟩ + @kaxpy!(n, -R[i,j], qᵢ, qⱼ) # qⱼ = qⱼ - rᵢⱼqᵢ + end + end + return Q, R +end + +# Reduced QR factorization with Householder reflections: +# Q, R = householder(A) +# +# Input : +# A an n-by-k matrix, n ≥ k +# +# Output : +# Q an n-by-k orthonormal matrix: QᴴQ = Iₖ +# R an k-by-k upper triangular matrix: QR = A +# function householder(A::AbstractMatrix{FC}) where FC <: FloatOrComplex +# n, k = size(A) +# T = real(FC) +# R = copy(A) +# H = Matrix{FC}(I, n, k) +# τ = zeros(T, k) +# for j = 1:k +# v = view(R,j+1:end,j) +# σ = norm(v) +# if σ == zero(T) && R[j,j] ≥ 0 +# τ[j] = 0 +# elseif σ == 0 && R[j,j] < 0 +# τ[j] = -2 +# else +# μ = sqrt(R[j,j]^2 + σ) +# if R[j,j] ≤ 0 +# γ = R[j,j] - μ +# else +# γ = -σ / (R[j,j] + μ) +# end +# γ² = γ * γ +# τ[j] = 2 * γ² / (σ + γ²) +# hⱼ = view(H,j+1:end,j) +# hⱼ .= v ./ γ +# R[j:n, j] ./= τⱼ +# view(R, j:n, j+1:k) = view(R, j:n, j+1:k) - (τ[j] * hⱼ) * (hⱼ' * view(R, j:n, j+1:k)) +# end + +# c = v[1] ≥ zero(FC) ? one(FC) : -one(FC) +# v[1] += c * norm(v) +# τ[j] = 2 / dot(v, v) +# hⱼ = I - τ[j] * v * v' +# R[j:end,j:end] = hⱼ * R[j:end,j:end] +# end + +# Q = Matrix{FC}(I, k, n) +# for j:-1:k +# vⱼ = view(R, j:k, j) +# τⱼ = τ[j] +# hⱼ = I - τⱼ * vⱼ * vⱼ' +# Q[j:k,j:k] = hⱼ * Q[j:k,j:k] +# end +# return Q, R +# end + +# Reduced QR factorization with Givens reflections: +# Q, R = givens(A) +# +# Input : +# A an n-by-k matrix, n ≥ k +# +# # Q an n-by-k orthonormal matrix: QᴴQ = Iₖ +# # R an k-by-k upper triangular matrix: QR = A +# """ +function givens(A::AbstractMatrix{FC}) where FC <: FloatOrComplex + n, k = size(A) + nr = n*k - div(k*(k+1), 2) + T = real(FC) + Q = copy(A) + R = zeros(FC, k, k) + C = zeros(T, nr) + S = zeros(FC, nr) + givens!(Q, R, C, S) +end + +function givens!(Q::AbstractMatrix{FC}, R::AbstractMatrix{FC}, C::AbstractVector{T}, S::AbstractVector{FC}) where {T <: AbstractFloat, FC <: FloatOrComplex{T}} + n, k = size(Q) + pos = 0 + for j = 1:k + for i = n-1:-1:j + pos += 1 + C[pos], S[pos], Q[i,j] = sym_givens(Q[i,j], Q[i+1,j]) + if j < k + reflect!(view(Q, i, j+1:k), view(Q, i+1, j+1:k), C[pos], S[pos]) + end + end + end + for j = 1:k + for i = 1:j + R[i,j] = Q[i,j] + end + end + Q .= zero(FC) + for i = 1:k + Q[i,i] = one(FC) + end + for j = k:-1:1 + for i = j:n-1 + reflect!(view(Q, i, j:k), view(Q, i+1, j:k), C[pos], S[pos]) + pos -= 1 + end + end + return Q, R +end + +function reduced_qr(A::AbstractMatrix{FC}, algo::String) where FC <: FloatOrComplex + if algo == "gs" + Q, R = gs(A) + elseif algo == "mgs" + Q, R = mgs(A) + elseif algo == "givens" + Q, R = givens(A) + else + error("$algo is not a supported method to perform a reduced QR.") + end + return Q, R +end + +""" + V, T = hermitian_lanczos(A, B, k) + +#### Input arguments + +* `A`: a linear operator that models an Hermitian matrix of dimension n; +* `B`: a matrix of size n × p; +* `k`: the number of iterations of the block Hermitian Lanczos process. + +#### Output arguments + +* `V`: a dense n × p(k+1) matrix; +* `T`: a sparse p(k+1) × pk block tridiagonal matrix with a bandwidth p. +""" +function hermitian_lanczos(A, B::AbstractMatrix{FC}, k::Int; algo::String="mgs") where FC <: FloatOrComplex + m, n = size(A) + t, p = size(B) + + V = zeros(FC, n, (k+1)*p) + T = spzeros(FC, (k+1)*p, k*p) + + α = -one(FC) + β = one(FC) + q = zeros(FC, n, p) + Ωᵢ = zeros(FC, p, p) + + for i = 1:k + pos1 = (i-1)*p + 1 + pos2 = i*p + vᵢ = view(V,:,pos1:pos2) + vᵢ₊₁ = view(V,:,pos1+p:pos2+p) + if i == 1 + Q, Ψᵢ = reduced_qr(B, algo) + vᵢ .= Q + end + mul!(q, A, vᵢ) + if i ≥ 2 + vᵢ₋₁ = view(V,:,pos1-p:pos2-p) + Ψᵢ = T[pos1:pos2,pos1-p:pos2-p] + T[pos1-p:pos2-p,pos1:pos2] .= Ψᵢ' + mul!(q, vᵢ₋₁, Ψᵢ', α, β) # q = q - vᵢ₋₁ * Ψᵢᴴ + end + mul!(Ωᵢ, vᵢ', q) # Ωᵢ = vᵢᴴ * q + mul!(q, vᵢ, Ωᵢ, α, β) # q = q - vᵢ * Ωᵢᴴ + T[pos1:pos2,pos1:pos2] .= Ωᵢ + Q, Ψᵢ₊₁ = reduced_qr(q, algo) + vᵢ₊₁ .= Q + T[pos1+p:pos2+p,pos1:pos2] .= Ψᵢ₊₁ + end + return V, T +end + +""" + V, T, U, Tᴴ = nonhermitian_lanczos(A, B, C, k) + +#### Input arguments + +* `A`: a linear operator that models a square matrix of dimension n; +* `B`: a matrix of size n × p; +* `C`: a matrix of size n × p; +* `k`: the number of iterations of the block non-Hermitian Lanczos process. + +#### Output arguments + +* `V`: a dense n × p(k+1) matrix; +* `T`: a sparse p(k+1) × pk tridiagonal matrix with a bandwidth p; +* `U`: a dense n × p(k+1) matrix; +* `Tᴴ`: a sparse p(k+1) × pk tridiagonal matrix with a bandwidth p. +""" +function nonhermitian_lanczos(A, B::AbstractMatrix{FC}, C::AbstractMatrix{FC}, k::Int) where FC <: FloatOrComplex + m, n = size(A) + t, p = size(B) + Aᴴ = A' + pivoting = VERSION < v"1.9" ? Val{false}() : NoPivot() + + V = zeros(FC, n, (k+1)*p) + U = zeros(FC, n, (k+1)*p) + T = zeros(FC, (k+1)*p, k*p) + Tᴴ = zeros(FC, (k+1)*p, k*p) + + α = -one(FC) + β = one(FC) + qv = zeros(FC, n, p) + qu = zeros(FC, n, p) + Ωᵢ = zeros(FC, p, p) + D = zeros(FC, p, p) + + for i = 1:k + pos1 = (i-1)*p + 1 + pos2 = i*p + pos3 = pos1 + p + pos4 = pos2 + p + vᵢ = view(V,:,pos1:pos2) + vᵢ₊₁ = view(V,:,pos3:pos4) + uᵢ = view(U,:,pos1:pos2) + uᵢ₊₁ = view(U,:,pos3:pos4) + if i == 1 + mul!(D, C', B) # D = Cᴴ * B + F = lu(D, pivoting) + Φᵢ, Ψᵢ = F.L, F.U + # Φᵢ = F.P' * Φᵢ + vᵢ .= (Ψᵢ' \ B')' + uᵢ .= (Φᵢ \ C')' + end + mul!(qv, A, vᵢ) + mul!(qu, Aᴴ, uᵢ) + if i ≥ 2 + pos5 = pos1 - p + pos6 = pos2 - p + vᵢ₋₁ = view(V,:,pos5:pos6) + uᵢ₋₁ = view(U,:,pos5:pos6) + Φᵢ = Tᴴ[pos1:pos2,pos5:pos6]' + T[pos5:pos6,pos1:pos2] = Φᵢ + mul!(qv, vᵢ₋₁, Φᵢ, α, β) # qv = qv - vᵢ₋₁ * Φᵢ + TΨᵢ = T[pos1:pos2,pos5:pos6]' + Tᴴ[pos5:pos6,pos1:pos2] = TΨᵢ + mul!(qu, uᵢ₋₁, TΨᵢ, α, β) # qu = qu - uᵢ₋₁ * Ψᵢᴴ + end + mul!(Ωᵢ, uᵢ', qv) + T[pos1:pos2,pos1:pos2] .= Ωᵢ + Tᴴ[pos1:pos2,pos1:pos2] .= Ωᵢ' + mul!(qv, vᵢ, Ωᵢ, α, β) # qv = qv - vᵢ * Ωᵢ + mul!(qu, uᵢ, Ωᵢ', α, β) # qu = qu - uᵢ * Ωᵢᴴ + mul!(D, qu', qv) # D = quᴴ * qv + F = lu(D, pivoting) + Φᵢ₊₁, Ψᵢ₊₁ = F.L, F.U + # Φᵢ₊₁ = F.P' * Φᵢ₊₁ + vᵢ₊₁ .= (Ψᵢ₊₁' \ qv')' + T[pos3:pos4,pos1:pos2] .= Ψᵢ₊₁ + uᵢ₊₁ .= (Φᵢ₊₁ \ qu')' + Tᴴ[pos3:pos4,pos1:pos2] .= Φᵢ₊₁' + end + return V, T, U, Tᴴ +end + +""" + V, H = arnoldi(A, B, k; reorthogonalization=false) + +#### Input arguments + +* `A`: a linear operator that models a square matrix of dimension n; +* `B`: a matrix of size n × p; +* `k`: the number of iterations of the block Arnoldi process. + +#### Keyword arguments + +* `reorthogonalization`: reorthogonalize the new matrices of the block Krylov basis against all previous matrices. + +#### Output arguments + +* `V`: a dense n × p(k+1) matrix; +* `H`: a dense p(k+1) × pk block upper Hessenberg matrix with a lower bandwidth p. +""" +function arnoldi(A, B::AbstractMatrix{FC}, k::Int; algo::String="mgs", reorthogonalization::Bool=false) where FC <: FloatOrComplex + m, n = size(A) + t, p = size(B) + + V = zeros(FC, n, (k+1)*p) + H = zeros(FC, (k+1)*p, k*p) + + α = -one(FC) + β = one(FC) + q = zeros(FC, n, p) + Ψᵢⱼ = Ψtmp = zeros(FC, p, p) + + for j = 1:k + pos1 = (j-1)*p + 1 + pos2 = j*p + vⱼ = view(V,:,pos1:pos2) + vⱼ₊₁ = view(V,:,pos1+p:pos2+p) + if j == 1 + Q, Γ = reduced_qr(B, algo) + vⱼ .= Q + end + mul!(q, A, vⱼ) + for i = 1:j + pos3 = (i-1)*p + 1 + pos4 = i*p + vᵢ = view(V,:,pos3:pos4) + mul!(Ψᵢⱼ, vᵢ', q) # Ψᵢⱼ = vᵢᴴ * q + mul!(q, vᵢ, Ψᵢⱼ, α, β) # q = q - vᵢ * Ψᵢⱼ + H[pos3:pos4,pos1:pos2] .= Ψᵢⱼ + end + if reorthogonalization + for i = 1:j + pos3 = (i-1)*p + 1 + pos4 = i*p + vᵢ = view(V,:,pos3:pos4) + mul!(Ψtmp, vᵢ', q) # Ψtmp = vᵢᴴ * q + mul!(q, vᵢ, Ψtmp, α, β) # q = q - vᵢ * Ψtmp + H[pos3:pos4,pos1:pos2] .+= Ψtmp + end + end + Q, Ψ = reduced_qr(q, algo) + H[pos1+p:pos2+p,pos1:pos2] .= Ψ + vⱼ₊₁ .= Q + end + return V, H +end + + +""" + V, U, L = golub_kahan(A, B, k) + +#### Input arguments + +* `A`: a linear operator that models a matrix of dimension m × n; +* `B`: a matrix of size m × p; +* `k`: the number of iterations of the block Golub-Kahan process. + +#### Output arguments + +* `V`: a dense n × p(k+1) matrix; +* `U`: a dense m × p(k+1) matrix; +* `L`: a sparse p(k+1) × p(k+1) block lower bidiagonal matrix with a lower bandwidth p. +""" +function golub_kahan(A, B::AbstractMatrix{FC}, k::Int; algo::String="mgs") where FC <: FloatOrComplex + m, n = size(A) + t, p = size(B) + Aᴴ = A' + + V = zeros(FC, n, (k+1)*p) + U = zeros(FC, m, (k+1)*p) + L = spzeros(FC, (k+1)*p, (k+1)*p) + + α = -one(FC) + β = one(FC) + qv = zeros(FC, n, p) + qu = zeros(FC, m, p) + + for i = 1:k + pos1 = (i-1)*p + 1 + pos2 = i*p + pos3 = pos1 + p + pos4 = pos2 + p + vᵢ = view(V,:,pos1:pos2) + vᵢ₊₁ = view(V,:,pos3:pos4) + uᵢ = view(U,:,pos1:pos2) + uᵢ₊₁ = view(U,:,pos3:pos4) + if i == 1 + Qu, Ψᵢ = reduced_qr(B, algo) + uᵢ .= Qu + mul!(qv, Aᴴ, uᵢ) + Qv, TΩᵢ = reduced_qr(qv, algo) + vᵢ .= Qv + L[pos1:pos2,pos1:pos2] .= TΩᵢ' + end + Ωᵢ = L[pos1:pos2,pos1:pos2] + mul!(qu, A, vᵢ) + mul!(qu, uᵢ, Ωᵢ, α, β) # qu = qu - uᵢ * Ωᵢ + Qu, Ψᵢ₊₁ = reduced_qr(qu, algo) + uᵢ₊₁ .= Qu + L[pos3:pos4,pos1:pos2] .= Ψᵢ₊₁ + mul!(qv, Aᴴ, uᵢ₊₁) + mul!(qv, vᵢ, Ψᵢ₊₁', α, β) # qv = qv - vᵢ * Ψᵢ₊₁ᴴ + Qv, TΩᵢ₊₁ = reduced_qr(qv, algo) + vᵢ₊₁ .= Qv + L[pos3:pos4,pos3:pos4] .= TΩᵢ₊₁' + end + return V, U, L +end + +""" + V, T, U, Tᴴ = saunders_simon_yip(A, B, C, k) + +#### Input arguments + +* `A`: a linear operator that models a matrix of dimension m × n; +* `B`: a matrix of size m × p; +* `C`: a matrix of size n × p; +* `k`: the number of iterations of the block Saunders-Simon-Yip process. + +#### Output arguments + +* `V`: a dense m × p(k+1) matrix; +* `T`: a sparse p(k+1) × pk tridiagonal matrix with a bandwidth p; +* `U`: a dense n × p(k+1) matrix; +* `Tᴴ`: a sparse p(k+1) × pk tridiagonal matrix with a bandwidth p. +""" +function saunders_simon_yip(A, B::AbstractMatrix{FC}, C::AbstractMatrix{FC}, k::Int; algo::String="mgs") where FC <: FloatOrComplex + m, n = size(A) + t, p = size(B) + Aᴴ = A' + + V = zeros(FC, m, (k+1)*p) + U = zeros(FC, n, (k+1)*p) + T = zeros(FC, (k+1)*p, k*p) + Tᴴ = zeros(FC, (k+1)*p, k*p) + + α = -one(FC) + β = one(FC) + qv = zeros(FC, m, p) + qu = zeros(FC, n, p) + Ωᵢ = zeros(FC, p, p) + + for i = 1:k + pos1 = (i-1)*p + 1 + pos2 = i*p + pos3 = pos1 + p + pos4 = pos2 + p + vᵢ = view(V,:,pos1:pos2) + vᵢ₊₁ = view(V,:,pos3:pos4) + uᵢ = view(U,:,pos1:pos2) + uᵢ₊₁ = view(U,:,pos3:pos4) + if i == 1 + Qv, Ψᵢ = reduced_qr(B, algo) + vᵢ .= Qv + Qu, TΦᵢ = reduced_qr(C, algo) + uᵢ .= Qu + end + mul!(qv, A, uᵢ) + mul!(qu, Aᴴ, vᵢ) + if i ≥ 2 + pos5 = pos1 - p + pos6 = pos2 - p + vᵢ₋₁ = view(V,:,pos5:pos6) + uᵢ₋₁ = view(U,:,pos5:pos6) + Φᵢ = Tᴴ[pos1:pos2,pos5:pos6]' + T[pos5:pos6,pos1:pos2] = Φᵢ + mul!(qv, vᵢ₋₁, Φᵢ, α, β) # qv = qv - vᵢ₋₁ * Φᵢ + TΨᵢ = T[pos1:pos2,pos5:pos6]' + Tᴴ[pos5:pos6,pos1:pos2] = TΨᵢ + mul!(qu, uᵢ₋₁, TΨᵢ, α, β) # qu = qu - uᵢ₋₁ * Ψᵢᴴ + end + mul!(Ωᵢ, vᵢ', qv) + T[pos1:pos2,pos1:pos2] .= Ωᵢ + Tᴴ[pos1:pos2,pos1:pos2] .= Ωᵢ' + mul!(qv, vᵢ, Ωᵢ, α, β) # qv = qv - vᵢ * Ωᵢ + mul!(qu, uᵢ, Ωᵢ', α, β) # qu = qu - uᵢ * Ωᵢᴴ + Qv, Ψᵢ₊₁ = reduced_qr(qv, algo) + vᵢ₊₁ .= Qv + T[pos3:pos4,pos1:pos2] .= Ψᵢ₊₁ + Qu, TΦᵢ₊₁ = reduced_qr(qu, algo) + uᵢ₊₁ .= Qu + Tᴴ[pos3:pos4,pos1:pos2] .= TΦᵢ₊₁ + end + return V, T, U, Tᴴ +end + +""" + V, H, U, F = montoison_orban(A, B, D, C, k; reorthogonalization=false) + +#### Input arguments + +* `A`: a linear operator that models a matrix of dimension m × n; +* `B`: a linear operator that models a matrix of dimension n × m; +* `D`: a matrix of size m × p; +* `C`: a matrix of size n × p; +* `k`: the number of iterations of the block Montoison-Orban process. + +#### Keyword arguments + +* `reorthogonalization`: reorthogonalize the new matrices of the block Krylov basis against all previous matrices. + +#### Output arguments + +* `V`: a dense m × p(k+1) matrix; +* `H`: a dense p(k+1) × pk block upper Hessenberg matrix with a lower bandwidth p; +* `U`: a dense n × p(k+1) matrix; +* `F`: a dense p(k+1) × pk block upper Hessenberg matrix with a lower bandwidth p. +""" +function montoison_orban(A, B, D::AbstractMatrix{FC}, C::AbstractMatrix{FC}, k::Int; algo::String="mgs", reorthogonalization::Bool=false) where FC <: FloatOrComplex + m, n = size(A) + t, p = size(D) + + V = zeros(FC, m, (k+1)*p) + U = zeros(FC, n, (k+1)*p) + H = zeros(FC, (k+1)*p, k*p) + F = zeros(FC, (k+1)*p, k*p) + + α = -one(FC) + β = one(FC) + qv = zeros(FC, m, p) + qu = zeros(FC, n, p) + Ψᵢⱼ = Φᵢⱼ = Ψtmp = Φtmp = zeros(FC, p, p) + + for j = 1:k + pos1 = (j-1)*p + 1 + pos2 = j*p + vⱼ = view(V,:,pos1:pos2) + vⱼ₊₁ = view(V,:,pos1+p:pos2+p) + uⱼ = view(U,:,pos1:pos2) + uⱼ₊₁ = view(U,:,pos1+p:pos2+p) + if j == 1 + Qv, Γ = reduced_qr(D, algo) + vⱼ .= Qv + Qu, Λ = reduced_qr(C, algo) + uⱼ .= Qu + end + mul!(qv, A, uⱼ) + mul!(qu, B, vⱼ) + for i = 1:j + pos3 = (i-1)*p + 1 + pos4 = i*p + vᵢ = view(V,:,pos3:pos4) + uᵢ = view(U,:,pos3:pos4) + mul!(Ψᵢⱼ, vᵢ', qv) # Ψᵢⱼ = vᵢᴴ * qv + mul!(qv, vᵢ, Ψᵢⱼ, α, β) # qv = qv - vᵢ * Ψᵢⱼ + H[pos3:pos4,pos1:pos2] .= Ψᵢⱼ + mul!(Φᵢⱼ, uᵢ', qu) # Φᵢⱼ = uᵢᴴ * qu + mul!(qu, uᵢ, Φᵢⱼ, α, β) # qu = qu - uᵢ * Φᵢⱼ + F[pos3:pos4,pos1:pos2] .= Φᵢⱼ + end + if reorthogonalization + for i = 1:j + pos3 = (i-1)*p + 1 + pos4 = i*p + vᵢ = view(V,:,pos3:pos4) + uᵢ = view(U,:,pos3:pos4) + mul!(Ψtmp, vᵢ', qv) # Ψtmp = vᵢᴴ * qv + mul!(qv, vᵢ, Ψtmp, α, β) # qv = qv - vᵢ * Ψtmp + H[pos3:pos4,pos1:pos2] .+= Ψtmp + mul!(Φtmp, uᵢ', qu) # Φtmp = uᵢᴴ * qu + mul!(qu, uᵢ, Φtmp, α, β) # qu = qu - uᵢ * Φtmp + F[pos3:pos4,pos1:pos2] .+= Φtmp + end + end + Qv, Ψ = reduced_qr(qv, algo) + H[pos1+p:pos2+p,pos1:pos2] .= Ψ + vⱼ₊₁ .= Qv + Qu, Φ = reduced_qr(qu, algo) + F[pos1+p:pos2+p,pos1:pos2] .= Φ + uⱼ₊₁ .= Qu + end + return V, H, U, F +end diff --git a/test/test_processes.jl b/test/test_processes.jl index 49b2bcd07..4c9646114 100644 --- a/test/test_processes.jl +++ b/test/test_processes.jl @@ -19,7 +19,9 @@ end m = 250 n = 500 k = 20 - + p = 4 + s = 4 + for FC in (Float64, ComplexF64) R = real(FC) nbits_FC = sizeof(FC) @@ -41,6 +43,25 @@ end @test expected_hermitian_lanczos_bytes ≤ actual_hermitian_lanczos_bytes ≤ 1.02 * expected_hermitian_lanczos_bytes end + @testset "Block Hermitian Lanczos" begin + A = rand(FC, n, n) + A = A' * A + B = rand(FC, n, p) + + @testset "algo = $algo" for algo in ("gs", "mgs", "givens") + V, T = hermitian_lanczos(A, B, k; algo) + + @test norm(V[:,1:p*s]' * V[:,1:p*s] - I) ≤ 1e-4 + @test A * V[:,1:p*k] ≈ V * T + + # storage_block_hermitian_lanczos_bytes(n, k) = ... + + # expected_block_hermitian_lanczos_bytes = storage_block_hermitian_lanczos_bytes(n, p, k) + # actual_block_hermitian_lanczos_bytes = @allocated hermitian_lanczos(A, B, k; algo) + # @test expected_block_hermitian_lanczos_bytes ≤ actual_block_hermitian_lanczos_bytes ≤ 1.02 * expected_block_hermitian_lanczos_bytes + end + end + @testset "Non-Hermitian Lanczos" begin A, b = nonsymmetric_definite(n, FC=FC) c = -b @@ -57,6 +78,25 @@ end @test expected_nonhermitian_lanczos_bytes ≤ actual_nonhermitian_lanczos_bytes ≤ 1.02 * expected_nonhermitian_lanczos_bytes end + @testset "Block Non-Hermitian Lanczos" begin + A = rand(FC, n, n) + B = rand(FC, n, p) + C = rand(FC, n, p) + V, T, U, Tᴴ = nonhermitian_lanczos(A, B, C, k) + + @test norm(V[:,1:p*s]' * U[:,1:p*s] - I) ≤ 1e-4 + @test norm(U[:,1:p*s]' * V[:,1:p*s] - I) ≤ 1e-4 + @test T[1:k*p,1:k*p] ≈ Tᴴ[1:k*p,1:k*p]' + @test A * V[:,1:k*p] ≈ V * T + @test A' * U[:,1:k*p] ≈ U * Tᴴ + + # storage_block_nonhermitian_lanczos_bytes(n, p, k) = ... + + # expected_block_nonhermitian_lanczos_bytes = storage_block_nonhermitian_lanczos_bytes(n, p, k) + # actual_block_nonhermitian_lanczos_bytes = @allocated nonhermitian_lanczos(A, B, C, k) + # @test expected_block_nonhermitian_lanczos_bytes ≤ actual_block_nonhermitian_lanczos_bytes ≤ 1.02 * expected_block_nonhermitian_lanczos_bytes + end + @testset "Arnoldi" begin for reorthogonalization in (false, true) A, b = nonsymmetric_indefinite(n, FC=FC) @@ -74,6 +114,29 @@ end end end + @testset "Block Arnoldi" begin + A = rand(FC, n, n) + B = rand(FC, n, p) + + @testset "algo = $algo" for algo in ("gs", "mgs", "givens") + @testset "reorthogonalization = $reorthogonalization" for reorthogonalization in (false, true) + V, H = arnoldi(A, B, k; algo, reorthogonalization) + + @test norm(V[:,1:p*s]' * V[:,1:p*s] - I) ≤ 1e-4 + @test norm(V' * V - I) ≤ 1e-4 + @test A * V[:,1:p*k] ≈ V * H + + # function storage_block_arnoldi_bytes(n, p, k) + # return ... + # end + + # expected_block_arnoldi_bytes = storage_block_arnoldi_bytes(n, p, k) + # actual_block_arnoldi_bytes = @allocated arnoldi(A, B, k; algo, reorthogonalization) + # @test expected_block_arnoldi_bytes ≤ actual_block_arnoldi_bytes ≤ 1.02 * expected_block_arnoldi_bytes + end + end + end + @testset "Golub-Kahan" begin A, b = under_consistent(m, n, FC=FC) V, U, L = golub_kahan(A, b, k) @@ -91,6 +154,29 @@ end @test expected_golub_kahan_bytes ≤ actual_golub_kahan_bytes ≤ 1.02 * expected_golub_kahan_bytes end + @testset "Block Golub-Kahan" begin + A = rand(FC, m, n) + B = rand(FC, m, p) + + @testset "algo = $algo" for algo in ("gs", "mgs", "givens") + V, U, L = golub_kahan(A, B, k; algo) + BL = L[1:(k+1)*p,1:k*p] + + @test norm(V[:,1:p*s]' * V[:,1:p*s] - I) ≤ 1e-4 + @test norm(U[:,1:p*s]' * U[:,1:p*s] - I) ≤ 1e-4 + @test A * V[:,1:k*p] ≈ U * BL + @test A' * U ≈ V * L' + @test A' * A * V[:,1:k*p] ≈ V * L' * BL + @test A * A' * U[:,1:k*p] ≈ U * BL * L[1:k*p,1:k*p]' + + # storage_block_golub_kahan_bytes(m, n, p, k) = ... + + # expected_block_golub_kahan_bytes = storage_block_golub_kahan_bytes(m, n, p, k) + # actual_block_golub_kahan_bytes = @allocated golub_kahan(A, B, k; algo) + # @test expected_block_golub_kahan_bytes ≤ actual_block_golub_kahan_bytes ≤ 1.02 * expected_block_golub_kahan_bytes + end + end + @testset "Saunders-Simon-Yip" begin A, b = under_consistent(m, n, FC=FC) _, c = over_consistent(n, m, FC=FC) @@ -117,6 +203,30 @@ end @test expected_saunders_simon_yip_bytes ≤ actual_saunders_simon_yip_bytes ≤ 1.02 * expected_saunders_simon_yip_bytes end + @testset "Block Saunders-Simon-Yip" begin + A = rand(FC, m, n) + B = rand(FC, m, p) + C = rand(FC, n, p) + + @testset "algo = $algo" for algo in ("gs", "mgs", "givens") + V, T, U, Tᴴ = saunders_simon_yip(A, B, C, k; algo) + + @test norm(V[:,1:p*s]' * V[:,1:p*s] - I) ≤ 1e-4 + @test norm(U[:,1:p*s]' * U[:,1:p*s] - I) ≤ 1e-4 + @test T[1:k*p,1:k*p] ≈ Tᴴ[1:k*p,1:k*p]' + @test A * U[:,1:k*p] ≈ V * T + @test A' * V[:,1:k*p] ≈ U * Tᴴ + @test A' * A * U[:,1:(k-1)*p] ≈ U * Tᴴ * T[1:k*p,1:(k-1)*p] + @test A * A' * V[:,1:(k-1)*p] ≈ V * T * Tᴴ[1:k*p,1:(k-1)*p] + + # storage_block_saunders_simon_yip_bytes(m, n, p, k) = ... + + # expected_block_saunders_simon_yip_bytes = storage_block_saunders_simon_yip_bytes(m, n, p, k) + # actual_block_saunders_simon_yip_bytes = @allocated saunders_simon_yip(A, B, C, k; algo) + # @test expected_block_saunders_simon_yip_bytes ≤ actual_block_saunders_simon_yip_bytes ≤ 1.02 * expected_block_saunders_simon_yip_bytes + end + end + @testset "Montoison-Orban" begin for reorthogonalization in (false, true) A, b = under_consistent(m, n, FC=FC) @@ -145,6 +255,34 @@ end @test expected_montoison_orban_bytes ≤ actual_montoison_orban_bytes ≤ 1.02 * expected_montoison_orban_bytes end end + + @testset "Block Montoison-Orban" begin + A = rand(FC, m, n) + B = rand(FC, n, m) + D = rand(FC, m, p) + C = rand(FC, n, p) + + @testset "algo = $algo" for algo in ("gs", "mgs", "givens") + @testset "reorthogonalization = $reorthogonalization" for reorthogonalization in (false, true) + V, H, U, F = montoison_orban(A, B, D, C, k; algo, reorthogonalization) + + @test norm(V[:,1:p*s]' * V[:,1:p*s] - I) ≤ 1e-4 + @test norm(U[:,1:p*s]' * U[:,1:p*s] - I) ≤ 1e-4 + @test A * U[:,1:k*p] ≈ V * H + @test B * V[:,1:k*p] ≈ U * F + @test B * A * U[:,1:(k-1)*p] ≈ U * F * H[1:k*p,1:(k-1)*p] + @test A * B * V[:,1:(k-1)*p] ≈ V * H * F[1:k*p,1:(k-1)*p] + + # function storage_block_montoison_orban_bytes(m, n, p, k) + # return ... + # end + + # expected_block_montoison_orban_bytes = storage_block_montoison_orban_bytes(m, n, p, k) + # actual_block_montoison_orban_bytes = @allocated montoison_orban(A, B, D, C, k; algo, reorthogonalization) + # @test expected_block_montoison_orban_bytes ≤ actual_block_montoison_orban_bytes ≤ 1.02 * expected_block_montoison_orban_bytes + end + end + end end end end