diff --git a/docs/make.jl b/docs/make.jl
index 132b8bbd2..0595a9398 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -12,6 +12,7 @@ makedocs(
   pages = ["Home" => "index.md",
            "API" => "api.md",
            "Krylov processes" => "processes.md",
+           "Block Krylov processes" => "block_processes.md",
            "Krylov methods" => ["Hermitian positive definite linear systems" => "solvers/spd.md",
                                 "Hermitian indefinite linear systems" => "solvers/sid.md",
                                 "Non-Hermitian square linear systems" => "solvers/unsymmetric.md",
diff --git a/docs/src/block_processes.md b/docs/src/block_processes.md
new file mode 100644
index 000000000..d7977c7f3
--- /dev/null
+++ b/docs/src/block_processes.md
@@ -0,0 +1,215 @@
+# [Block Krylov processes](@id block-krylov-processes)
+
+## [Block Hermitian Lanczos](@id block-hermitian-lanczos)
+
+If the vector $b$ in the [Hermitian Lanczos](@ref hermitian-lanczos) process is replaced by a matrix $B$ with $p$ columns, we can derive the block Hermitian Lanczos process.
+
+![block_hermitian_lanczos](./graphics/block_hermitian_lanczos.png)
+
+After $k$ iterations of the block Hermitian Lanczos process, the situation may be summarized as
+```math
+\begin{align*}
+  A V_k &= V_{k+1} T_{k+1,k}, \\
+  V_k^H V_k &= I_{pk},
+\end{align*}
+```
+where $V_k$ is an orthonormal basis of the block Krylov subspace $\mathcal{K}_k^{\square}(A,B)$,
+```math
+T_{k+1,k} =
+\begin{bmatrix}
+  \Omega_1 & \Psi_2^H &        &          \\
+  \Psi_2   & \Omega_2 & \ddots &          \\
+           & \ddots   & \ddots & \Psi_k^H \\
+           &          & \Psi_k & \Omega_k \\
+           &          &        & \Psi_{k+1}
+\end{bmatrix}.
+```
+
+```@docs
+hermitian_lanczos(::Any, ::AbstractMatrix{FC}, ::Int) where FC <: (Union{Complex{T}, T} where T <: AbstractFloat)
+```
+
+## [Block Non-Hermitian Lanczos](@id block-nonhermitian-lanczos)
+
+If the vectors $b$ and $c$ in the [non-Hermitian Lanczos](@ref nonhermitian-lanczos) process are replaced by matrices $B$ and $C$ with both $p$ columns, we can derive the block non-Hermitian Lanczos process.
+
+![block_nonhermitian_lanczos](./graphics/block_nonhermitian_lanczos.png)
+
+After $k$ iterations of the block non-Hermitian Lanczos process, the situation may be summarized as
+```math
+\begin{align*}
+  A V_k &= V_{k+1} T_{k+1,k}, \\
+  A^H U_k &= U_{k+1} T_{k,k+1}^H, \\
+  V_k^H U_k &= U_k^H V_k = I_{pk},
+\end{align*}
+```
+where $V_k$ and $U_k$ are bases of the block Krylov subspaces $\mathcal{K}^{\square}_k(A,B)$ and $\mathcal{K}^{\square}_k (A^H,C)$, respectively,
+```math
+T_{k+1,k} =
+\begin{bmatrix}
+  \Omega_1 & \Phi_2   &        &          \\
+  \Psi_2   & \Omega_2 & \ddots &          \\
+           & \ddots   & \ddots & \Phi_k   \\
+           &          & \Psi_k & \Omega_k \\
+           &          &        & \Psi_{k+1}
+\end{bmatrix}
+, \qquad
+T_{k,k+1}^H =
+\begin{bmatrix}
+  \Omega_1^H & \Psi_2^H   &          &            \\
+  \Phi_2^H   & \Omega_2^H & \ddots   &            \\
+             & \ddots     & \ddots   & \Psi_k^H   \\
+             &            & \Phi_k^H & \Omega_k^H \\
+             &            &          & \Phi_{k+1}^H
+\end{bmatrix}.
+```
+
+```@docs
+nonhermitian_lanczos(::Any, ::AbstractMatrix{FC}, ::AbstractMatrix{FC}, ::Int) where FC <: (Union{Complex{T}, T} where T <: AbstractFloat)
+```
+
+## [Block Arnoldi](@id block-arnoldi)
+
+If the vector $b$ in the [Arnoldi](@ref arnoldi) process is replaced by a matrix $B$ with $p$ columns, we can derive the block Arnoldi process.
+
+![block_arnoldi](./graphics/block_arnoldi.png)
+
+After $k$ iterations of the block Arnoldi process, the situation may be summarized as
+```math
+\begin{align*}
+  A V_k &= V_{k+1} H_{k+1,k}, \\
+  V_k^H V_k &= I_{pk},
+\end{align*}
+```
+where $V_k$ is an orthonormal basis of the block Krylov subspace $\mathcal{K}_k^{\square}(A,B)$,
+```math
+H_{k+1,k} =
+\begin{bmatrix}
+  \Psi_{1,1}~ & \Psi_{1,2}~ & \ldots       & \Psi_{1,k}   \\
+  \Psi_{2,1}~ & \ddots~     & \ddots       & \vdots       \\
+              & \ddots~     & \ddots       & \Psi_{k-1,k} \\
+              &             & \Psi_{k,k-1} & \Psi_{k,k}   \\
+              &             &              & \Psi_{k+1,k}
+\end{bmatrix}.
+```
+
+```@docs
+arnoldi(::Any, ::AbstractMatrix{FC}, ::Int) where FC <: (Union{Complex{T}, T} where T <: AbstractFloat)
+```
+
+## [Block Golub-Kahan](@id block-golub-kahan)
+
+If the vector $b$ in the [Golub-Kahan](@ref golub-kahan) process is replaced by a matrix $B$ with $p$ columns, we can derive the block Golub-Kahan process.
+
+![block_golub_kahan](./graphics/block_golub_kahan.png)
+
+After $k$ iterations of the block Golub-Kahan process, the situation may be summarized as
+```math
+\begin{align*}
+  A V_k &= U_{k+1} B_k, \\
+  A^H U_{k+1} &= V_{k+1} L_{k+1}^H, \\
+  V_k^H V_k &= U_k^H U_k = I_{pk},
+\end{align*}
+```
+where $V_k$ and $U_k$ are bases of the block Krylov subspaces $\mathcal{K}_k^{\square}(A^HA,A^HB)$ and $\mathcal{K}_k^{\square}(AA^H,B)$, respectively,
+```math
+B_k =
+\begin{bmatrix}
+  \Omega_1 &          &        &            \\
+  \Psi_2   & \Omega_2 &        &            \\
+           & \ddots   & \ddots &            \\
+           &          & \Psi_k & \Omega_k   \\
+           &          &        & \Psi_{k+1} \\
+\end{bmatrix}
+, \qquad
+L_{k+1}^H =
+\begin{bmatrix}
+  \Omega_1^H & \Psi_2^H   &        &            &                \\
+             & \Omega_2^H & \ddots &            &                \\
+             &            & \ddots & \Psi_k^H   &                \\
+             &            &        & \Omega_k^H & \Psi_{k+1}^H   \\
+             &            &        &            & \Omega_{k+1}^H \\
+\end{bmatrix}.
+```
+
+```@docs
+golub_kahan(::Any, ::AbstractMatrix{FC}, ::Int) where FC <: (Union{Complex{T}, T} where T <: AbstractFloat)
+```
+
+## [Block Saunders-Simon-Yip](@id block-saunders-simon-yip)
+
+If the vectors $b$ and $c$ in the [Saunders-Simon-Yip](@ref saunders-simon-yip) process are replaced by matrices $B$ and $C$ with both $p$ columns, we can derive the block Saunders-Simon-Yip process.
+
+![block_saunders_simon_yip](./graphics/block_saunders_simon_yip.png)
+
+After $k$ iterations of the block Saunders-Simon-Yip process, the situation may be summarized as
+```math
+\begin{align*}
+  A U_k &= V_{k+1} T_{k+1,k}, \\
+  A^H V_k &= U_{k+1} T_{k,k+1}^H, \\
+  V_k^H V_k &= U_k^H U_k = I_{pk},
+\end{align*}
+```
+where $\begin{bmatrix} V_k & 0 \\ 0 & U_k \end{bmatrix}$ is an orthonormal basis of the block Krylov subspace $\mathcal{K}^{\square}_k \left(\begin{bmatrix} 0 & A \\ A^H & 0 \end{bmatrix}, \begin{bmatrix} B & 0 \\ 0 & C \end{bmatrix}\right)$,
+```math
+T_{k+1,k} =
+\begin{bmatrix}
+  \Omega_1 & \Phi_2   &        &          \\
+  \Psi_2   & \Omega_2 & \ddots &          \\
+           & \ddots   & \ddots & \Phi_k   \\
+           &          & \Psi_k & \Omega_k \\
+           &          &        & \Psi_{k+1}
+\end{bmatrix}
+, \qquad
+T_{k,k+1}^H =
+\begin{bmatrix}
+  \Omega_1^H & \Psi_2^H   &          &            \\
+  \Phi_2^H   & \Omega_2^H & \ddots   &            \\
+             & \ddots     & \ddots   & \Psi_k^H   \\
+             &            & \Phi_k^H & \Omega_k^H \\
+             &            &          & \Phi_{k+1}^H
+\end{bmatrix}.
+```
+
+```@docs
+saunders_simon_yip(::Any, ::AbstractMatrix{FC}, ::AbstractMatrix{FC}, ::Int) where FC <: (Union{Complex{T}, T} where T <: AbstractFloat)
+```
+
+## [Block Montoison-Orban](@id block-montoison-orban)
+
+If the vectors $b$ and $c$ in the [Montoison-Orban](@ref montoison-orban) process are replaced by matrices $D$ and $C$ with both $p$ columns, we can derive the block Montoison-Orban process.
+
+![block_montoison_orban](./graphics/block_montoison_orban.png)
+
+After $k$ iterations of the block Montoison-Orban process, the situation may be summarized as
+```math
+\begin{align*}
+  A U_k &= V_{k+1} H_{k+1,k}, \\
+  B V_k &= U_{k+1} F_{k+1,k}, \\
+  V_k^H V_k &= U_k^H U_k = I_{pk},
+\end{align*}
+```
+where $\begin{bmatrix} V_k & 0 \\ 0 & U_k \end{bmatrix}$ is an orthonormal basis of the block Krylov subspace $\mathcal{K}^{\square}_k \left(\begin{bmatrix} 0 & A \\ B & 0 \end{bmatrix}, \begin{bmatrix} D & 0 \\ 0 & C \end{bmatrix}\right)$,
+```math
+H_{k+1,k} =
+\begin{bmatrix}
+  \Psi_{1,1}~ & \Psi_{1,2}~ & \ldots       & \Psi_{1,k}   \\
+  \Psi_{2,1}~ & \ddots~     & \ddots       & \vdots       \\
+              & \ddots~     & \ddots       & \Psi_{k-1,k} \\
+              &             & \Psi_{k,k-1} & \Psi_{k,k}   \\
+              &             &              & \Psi_{k+1,k}
+\end{bmatrix}
+, \qquad
+F_{k+1,k} =
+\begin{bmatrix}
+  \Phi_{1,1}~ & \Phi_{1,2}~ & \ldots       & \Phi_{1,k}   \\
+  \Phi_{2,1}~ & \ddots~     & \ddots       & \vdots       \\
+              & \ddots~     & \ddots       & \Phi_{k-1,k} \\
+              &             & \Phi_{k,k-1} & \Phi_{k,k}   \\
+              &             &              & \Phi_{k+1,k}
+\end{bmatrix}.
+```
+
+```@docs
+montoison_orban(::Any, ::Any, ::AbstractMatrix{FC}, ::AbstractMatrix{FC}, ::Int) where FC <: (Union{Complex{T}, T} where T <: AbstractFloat)
+```
diff --git a/docs/src/graphics/block_arnoldi.png b/docs/src/graphics/block_arnoldi.png
new file mode 100644
index 000000000..5248046ea
Binary files /dev/null and b/docs/src/graphics/block_arnoldi.png differ
diff --git a/docs/src/graphics/block_golub_kahan.png b/docs/src/graphics/block_golub_kahan.png
new file mode 100644
index 000000000..e749edc54
Binary files /dev/null and b/docs/src/graphics/block_golub_kahan.png differ
diff --git a/docs/src/graphics/block_hermitian_lanczos.png b/docs/src/graphics/block_hermitian_lanczos.png
new file mode 100644
index 000000000..d16647a84
Binary files /dev/null and b/docs/src/graphics/block_hermitian_lanczos.png differ
diff --git a/docs/src/graphics/block_montoison_orban.png b/docs/src/graphics/block_montoison_orban.png
new file mode 100644
index 000000000..48d4d754c
Binary files /dev/null and b/docs/src/graphics/block_montoison_orban.png differ
diff --git a/docs/src/graphics/block_nonhermitian_lanczos.png b/docs/src/graphics/block_nonhermitian_lanczos.png
new file mode 100644
index 000000000..a18052224
Binary files /dev/null and b/docs/src/graphics/block_nonhermitian_lanczos.png differ
diff --git a/docs/src/graphics/block_saunders_simon_yip.png b/docs/src/graphics/block_saunders_simon_yip.png
new file mode 100644
index 000000000..96dcc4615
Binary files /dev/null and b/docs/src/graphics/block_saunders_simon_yip.png differ
diff --git a/docs/src/processes.md b/docs/src/processes.md
index 2da69d04e..eb72d6fd1 100644
--- a/docs/src/processes.md
+++ b/docs/src/processes.md
@@ -60,18 +60,18 @@ For matrices $C \in \mathbb{C}^{n \times n} \enspace$ and $\enspace T \in \mathb
 \left\{\sum_{i=0}^{k-1} C^i T \, \Omega_i \, \middle \vert \, \Omega_i \in \mathbb{C}^{p \times p},~0 \le i \le k-1 \right\}.
 ```
 
-## Hermitian Lanczos
+## [Hermitian Lanczos](@id hermitian-lanczos)
 
 ![hermitian_lanczos](./graphics/hermitian_lanczos.png)
 
 After $k$ iterations of the Hermitian Lanczos process, the situation may be summarized as
 ```math
 \begin{align*}
-  A V_k &= V_k T_k + \beta_{k+1,k} v_{k+1} e_k^T = V_{k+1}  T_{k+1,k}, \\
+  A V_k &= V_k T_k + \beta_{k+1,k} v_{k+1} e_k^T = V_{k+1} T_{k+1,k}, \\
   V_k^H V_k &= I_k,
 \end{align*}
 ```
-where $V_k$ is an orthonormal basis of the Krylov subspace $\mathcal{K}_k (A,b)$,
+where $V_k$ is an orthonormal basis of the Krylov subspace $\mathcal{K}_k(A,b)$,
 ```math
 T_k =
 \begin{bmatrix}
@@ -94,17 +94,17 @@ The function [`hermitian_lanczos`](@ref hermitian_lanczos) returns $V_{k+1}$ and
 Related methods: [`SYMMLQ`](@ref symmlq), [`CG`](@ref cg), [`CR`](@ref cr), [`CAR`](@ref car), [`MINRES`](@ref minres), [`MINRES-QLP`](@ref minres_qlp), [`MINARES`](@ref minares), [`CGLS`](@ref cgls), [`CRLS`](@ref crls), [`CGNE`](@ref cgne), [`CRMR`](@ref crmr), [`CG-LANCZOS`](@ref cg_lanczos) and [`CG-LANCZOS-SHIFT`](@ref cg_lanczos_shift).
 
 ```@docs
-hermitian_lanczos
+hermitian_lanczos(::Any, ::AbstractVector{FC}, ::Int) where FC <: (Union{Complex{T}, T} where T <: AbstractFloat)
 ```
 
-## Non-Hermitian Lanczos
+## [Non-Hermitian Lanczos](@id nonhermitian-lanczos)
 
 ![nonhermitian_lanczos](./graphics/nonhermitian_lanczos.png)
 
 After $k$ iterations of the non-Hermitian Lanczos process (also named the Lanczos biorthogonalization process), the situation may be summarized as
 ```math
 \begin{align*}
-  A V_k &= V_k T_k   +        \beta_{k+1} v_{k+1} e_k^T = V_{k+1} T_{k+1,k},   \\
+  A V_k &= V_k T_k + \beta_{k+1} v_{k+1} e_k^T = V_{k+1} T_{k+1,k}, \\
   A^H U_k &= U_k T_k^H + \bar{\gamma}_{k+1} u_{k+1} e_k^T = U_{k+1} T_{k,k+1}^H, \\
   V_k^H U_k &= U_k^H V_k = I_k,
 \end{align*}
@@ -140,7 +140,7 @@ Related methods: [`BiLQ`](@ref bilq), [`QMR`](@ref qmr), [`BiLQR`](@ref bilqr),
     With these scaling factors, the non-Hermitian Lanczos process coincides with the Hermitian Lanczos process when $A = A^H$ and $b = c$.
 
 ```@docs
-nonhermitian_lanczos
+nonhermitian_lanczos(::Any, ::AbstractVector{FC}, ::AbstractVector{FC}, ::Int) where FC <: (Union{Complex{T}, T} where T <: AbstractFloat)
 ```
 
 The non-Hermitian Lanczos process can be also implemented without $A^H$ (transpose-free variant).
@@ -169,7 +169,7 @@ Because $\alpha_k = u_k^H A v_k$ and $(\bar{\gamma}_{k+1} u_{k+1})^H (\beta_{k+1
 \end{align*}
 ```
 
-## Arnoldi
+## [Arnoldi](@id arnoldi)
 
 ![arnoldi](./graphics/arnoldi.png)
 
@@ -205,17 +205,17 @@ Related methods: [`DIOM`](@ref diom), [`FOM`](@ref fom), [`DQGMRES`](@ref dqgmre
     The Arnoldi process coincides with the Hermitian Lanczos process when $A$ is Hermitian.
 
 ```@docs
-arnoldi
+arnoldi(::Any, ::AbstractVector{FC}, ::Int) where FC <: (Union{Complex{T}, T} where T <: AbstractFloat)
 ```
 
-## Golub-Kahan
+## [Golub-Kahan](@id golub-kahan)
 
 ![golub_kahan](./graphics/golub_kahan.png)
 
 After $k$ iterations of the Golub-Kahan bidiagonalization process, the situation may be summarized as
 ```math
 \begin{align*}
-  A V_k &= U_{k+1} B_k,   \\
+  A V_k &= U_{k+1} B_k, \\
   A^H U_{k+1} &= V_k B_k^H + \bar{\alpha}_{k+1} v_{k+1} e_{k+1}^T = V_{k+1} L_{k+1}^H, \\
   V_k^H V_k &= U_k^H U_k = I_k,
 \end{align*}
@@ -255,17 +255,17 @@ Related methods: [`LNLQ`](@ref lnlq), [`CRAIG`](@ref craig), [`CRAIGMR`](@ref cr
     It is also related to the Hermitian Lanczos process applied to $\begin{bmatrix} 0 & A \\ A^H & 0 \end{bmatrix}$ with initial vector $\begin{bmatrix} b \\ 0 \end{bmatrix}$.
 
 ```@docs
-golub_kahan
+golub_kahan(::Any, ::AbstractVector{FC}, ::Int) where FC <: (Union{Complex{T}, T} where T <: AbstractFloat)
 ```
 
-## Saunders-Simon-Yip
+## [Saunders-Simon-Yip](@id saunders-simon-yip)
 
 ![saunders_simon_yip](./graphics/saunders_simon_yip.png)
 
 After $k$ iterations of the Saunders-Simon-Yip process (also named the orthogonal tridiagonalization process), the situation may be summarized as
 ```math
 \begin{align*}
-  A U_k &= V_k T_k   + \beta_{k+1}  v_{k+1} e_k^T = V_{k+1} T_{k+1,k},   \\
+  A U_k &= V_k T_k + \beta_{k+1} v_{k+1} e_k^T = V_{k+1} T_{k+1,k}, \\
   A^H V_k &= U_k T_k^H + \bar{\gamma}_{k+1} u_{k+1} e_k^T = U_{k+1} T_{k,k+1}^H, \\
   V_k^H V_k &= U_k^H U_k = I_k,
 \end{align*}
@@ -296,14 +296,14 @@ The function [`saunders_simon_yip`](@ref saunders_simon_yip) returns $V_{k+1}$,
 
 Related methods: [`USYMLQ`](@ref usymlq), [`USYMQR`](@ref usymqr), [`TriLQR`](@ref trilqr), [`TriCG`](@ref tricg) and [`TriMR`](@ref trimr).
 
-```@docs
-saunders_simon_yip
-```
-
 !!! note
     The Saunders-Simon-Yip is equivalent to the block-Lanczos process applied to $\begin{bmatrix} 0 & A \\ A^H & 0 \end{bmatrix}$ with initial matrix $\begin{bmatrix} b & 0 \\ 0 & c \end{bmatrix}$.
 
-## Montoison-Orban
+```@docs
+saunders_simon_yip(::Any, ::AbstractVector{FC}, ::AbstractVector{FC}, ::Int) where FC <: (Union{Complex{T}, T} where T <: AbstractFloat)
+```
+
+## [Montoison-Orban](@id montoison-orban)
 
 ![montoison_orban](./graphics/montoison_orban.png)
 
@@ -356,5 +356,5 @@ Related methods: [`GPMR`](@ref gpmr).
     It also coincides with the Saunders-Simon-Yip process when $B = A^H$.
 
 ```@docs
-montoison_orban
+montoison_orban(::Any, ::Any, ::AbstractVector{FC}, ::AbstractVector{FC}, ::Int) where FC <: (Union{Complex{T}, T} where T <: AbstractFloat)
 ```
diff --git a/src/Krylov.jl b/src/Krylov.jl
index 05069d14b..56ca58f38 100644
--- a/src/Krylov.jl
+++ b/src/Krylov.jl
@@ -5,7 +5,9 @@ using LinearAlgebra, SparseArrays, Printf
 include("krylov_utils.jl")
 include("krylov_stats.jl")
 include("krylov_solvers.jl")
+
 include("krylov_processes.jl")
+include("block_krylov_processes.jl")
 
 include("cg.jl")
 include("cr.jl")
diff --git a/src/block_krylov_processes.jl b/src/block_krylov_processes.jl
new file mode 100644
index 000000000..0aaefe179
--- /dev/null
+++ b/src/block_krylov_processes.jl
@@ -0,0 +1,611 @@
+# """
+# Gram-Schmidt orthogonalization for a reduced QR decomposition:
+# Q, R = gs(A)
+#
+# Input :
+# A an n-by-k matrix, n ≥ k
+#
+# Output :
+# Q an n-by-k orthonormal matrix: QᴴQ = Iₖ
+# R an k-by-k upper triangular matrix: QR = A
+# """
+function gs(A::AbstractMatrix{FC}) where FC <: FloatOrComplex
+  n, k = size(A)
+  Q = copy(A)
+  R = zeros(FC, k, k)
+  v = zeros(FC, n)
+  gs!(Q, R, v)
+end
+
+function gs!(Q::AbstractMatrix{FC}, R::AbstractMatrix{FC}, v::AbstractVector{FC}) where FC <: FloatOrComplex
+  n, k = size(Q)
+  aⱼ = v
+  for j = 1:k
+    qⱼ = view(Q,:,j)
+    aⱼ .= qⱼ
+    for i = 1:j-1
+      qᵢ = view(Q,:,i)
+      R[i,j] = @kdot(n, qᵢ, aⱼ)    # rᵢⱼ = ⟨qᵢ , aⱼ⟩
+      @kaxpy!(n, -R[i,j], qᵢ, qⱼ)  # qⱼ = qⱼ - rᵢⱼqᵢ
+    end
+    R[j,j] = @knrm2(n, qⱼ)  # rⱼⱼ = ‖qⱼ‖
+    qⱼ ./= R[j,j]           # qⱼ = qⱼ / rⱼⱼ
+  end
+  return Q, R
+end
+
+# """
+# Modified Gram-Schmidt orthogonalization for a reduced QR decomposition:
+# Q, R = mgs(A)
+#
+# Input :
+# A an n-by-k matrix, n ≥ k
+#
+# # Q an n-by-k orthonormal matrix: QᴴQ = Iₖ
+# # R an k-by-k upper triangular matrix: QR = A
+# """
+function mgs(A::AbstractMatrix{FC}) where FC <: FloatOrComplex
+  n, k = size(A)
+  Q = copy(A)
+  R = zeros(FC, k, k)
+  mgs!(Q, R)
+end
+
+function mgs!(Q::AbstractMatrix{FC}, R::AbstractMatrix{FC}) where FC <: FloatOrComplex
+  n, k = size(Q)
+  for i = 1:k
+    qᵢ = view(Q,:,i)
+    R[i,i] = @knrm2(n, qᵢ)  # rᵢᵢ = ‖qᵢ‖
+    qᵢ ./= R[i,i]           # qᵢ = qᵢ / rᵢᵢ
+    for j = i+1:k
+      qⱼ = view(Q,:,j)
+      R[i,j] = @kdot(n, qᵢ, qⱼ)    # rᵢⱼ = ⟨qᵢ , qⱼ⟩
+      @kaxpy!(n, -R[i,j], qᵢ, qⱼ)  # qⱼ = qⱼ - rᵢⱼqᵢ
+    end
+  end
+  return Q, R
+end
+
+# Reduced QR factorization with Householder reflections:
+# Q, R = householder(A)
+#
+# Input :
+# A an n-by-k matrix, n ≥ k
+#
+# Output :
+# Q an n-by-k orthonormal matrix: QᴴQ = Iₖ
+# R an k-by-k upper triangular matrix: QR = A
+# function householder(A::AbstractMatrix{FC}) where FC <: FloatOrComplex
+#   n, k = size(A)
+#   T = real(FC)
+#   R = copy(A)
+#   H = Matrix{FC}(I, n, k)
+#   τ = zeros(T, k)
+#   for j = 1:k
+#     v = view(R,j+1:end,j)
+#     σ = norm(v)
+#     if σ == zero(T) && R[j,j] ≥ 0
+#       τ[j] = 0
+#     elseif σ == 0 && R[j,j] < 0
+#       τ[j] = -2
+#     else
+#       μ = sqrt(R[j,j]^2 + σ)
+#       if R[j,j] ≤ 0
+#         γ = R[j,j] - μ
+#       else
+#         γ = -σ / (R[j,j] + μ)
+#       end
+#       γ² = γ * γ
+#       τ[j] = 2 * γ² / (σ + γ²)
+#       hⱼ = view(H,j+1:end,j)
+#       hⱼ .= v ./ γ
+#       R[j:n, j] ./= τⱼ
+#       view(R, j:n, j+1:k) = view(R, j:n, j+1:k) - (τ[j] * hⱼ) * (hⱼ' * view(R, j:n, j+1:k))
+#     end
+
+#     c = v[1] ≥ zero(FC) ? one(FC) : -one(FC)
+#     v[1] += c * norm(v)
+#     τ[j] = 2 / dot(v, v)
+#     hⱼ = I - τ[j] * v * v'
+#     R[j:end,j:end] = hⱼ * R[j:end,j:end]
+#   end
+
+#   Q = Matrix{FC}(I, k, n)
+#   for j:-1:k
+#     vⱼ = view(R, j:k, j)
+#     τⱼ = τ[j]
+#     hⱼ = I - τⱼ * vⱼ * vⱼ'
+#     Q[j:k,j:k] = hⱼ * Q[j:k,j:k]
+#   end
+#   return Q, R
+# end
+
+# Reduced QR factorization with Givens reflections:
+# Q, R = givens(A)
+#
+# Input :
+# A an n-by-k matrix, n ≥ k
+#
+# # Q an n-by-k orthonormal matrix: QᴴQ = Iₖ
+# # R an k-by-k upper triangular matrix: QR = A
+# """
+function givens(A::AbstractMatrix{FC}) where FC <: FloatOrComplex
+  n, k = size(A)
+  nr = n*k - div(k*(k+1), 2)
+  T = real(FC)
+  Q = copy(A)
+  R = zeros(FC, k, k)
+  C = zeros(T, nr)
+  S = zeros(FC, nr)
+  givens!(Q, R, C, S)
+end
+
+function givens!(Q::AbstractMatrix{FC}, R::AbstractMatrix{FC}, C::AbstractVector{T}, S::AbstractVector{FC}) where {T <: AbstractFloat, FC <: FloatOrComplex{T}}
+  n, k = size(Q)
+  pos = 0
+  for j = 1:k
+    for i = n-1:-1:j
+      pos += 1
+      C[pos], S[pos], Q[i,j] = sym_givens(Q[i,j], Q[i+1,j])
+      if j < k
+        reflect!(view(Q, i, j+1:k), view(Q, i+1, j+1:k), C[pos], S[pos])
+      end
+    end
+  end
+  for j = 1:k
+    for i = 1:j
+      R[i,j] = Q[i,j]
+    end
+  end
+  Q .= zero(FC)
+  for i = 1:k
+    Q[i,i] = one(FC)
+  end
+  for j = k:-1:1
+    for i = j:n-1
+      reflect!(view(Q, i, j:k), view(Q, i+1, j:k), C[pos], S[pos])
+      pos -= 1
+    end
+  end
+  return Q, R
+end
+
+function reduced_qr(A::AbstractMatrix{FC}, algo::String) where FC <: FloatOrComplex
+  if algo == "gs"
+    Q, R = gs(A)
+  elseif algo == "mgs"
+    Q, R = mgs(A)
+  elseif algo == "givens"
+    Q, R = givens(A)
+  else
+    error("$algo is not a supported method to perform a reduced QR.")
+  end
+  return Q, R
+end
+
+"""
+    V, T = hermitian_lanczos(A, B, k)
+
+#### Input arguments
+
+* `A`: a linear operator that models an Hermitian matrix of dimension n;
+* `B`: a matrix of size n × p;
+* `k`: the number of iterations of the block Hermitian Lanczos process.
+
+#### Output arguments
+
+* `V`: a dense n × p(k+1) matrix;
+* `T`: a sparse p(k+1) × pk block tridiagonal matrix with a bandwidth p.
+"""
+function hermitian_lanczos(A, B::AbstractMatrix{FC}, k::Int; algo::String="mgs") where FC <: FloatOrComplex
+  m, n = size(A)
+  t, p = size(B)
+
+  V = zeros(FC, n, (k+1)*p)
+  T = spzeros(FC, (k+1)*p, k*p)
+
+  α = -one(FC)
+  β = one(FC)
+  q = zeros(FC, n, p)
+  Ωᵢ = zeros(FC, p, p)
+
+  for i = 1:k
+    pos1 = (i-1)*p + 1
+    pos2 = i*p
+    vᵢ = view(V,:,pos1:pos2)
+    vᵢ₊₁ = view(V,:,pos1+p:pos2+p)
+    if i == 1
+      Q, Ψᵢ = reduced_qr(B, algo)
+      vᵢ .= Q
+    end
+    mul!(q, A, vᵢ)
+    if i ≥ 2
+      vᵢ₋₁ = view(V,:,pos1-p:pos2-p)
+      Ψᵢ = T[pos1:pos2,pos1-p:pos2-p]
+      T[pos1-p:pos2-p,pos1:pos2] .= Ψᵢ'
+      mul!(q, vᵢ₋₁, Ψᵢ', α, β)  # q = q - vᵢ₋₁ * Ψᵢᴴ
+    end
+    mul!(Ωᵢ, vᵢ', q)       # Ωᵢ = vᵢᴴ * q
+    mul!(q, vᵢ, Ωᵢ, α, β)  # q = q - vᵢ * Ωᵢᴴ
+    T[pos1:pos2,pos1:pos2] .= Ωᵢ
+    Q, Ψᵢ₊₁ = reduced_qr(q, algo)
+    vᵢ₊₁ .= Q
+    T[pos1+p:pos2+p,pos1:pos2] .= Ψᵢ₊₁
+  end
+  return V, T
+end
+
+"""
+    V, T, U, Tᴴ = nonhermitian_lanczos(A, B, C, k)
+
+#### Input arguments
+
+* `A`: a linear operator that models a square matrix of dimension n;
+* `B`: a matrix of size n × p;
+* `C`: a matrix of size n × p;
+* `k`: the number of iterations of the block non-Hermitian Lanczos process.
+
+#### Output arguments
+
+* `V`: a dense n × p(k+1) matrix;
+* `T`: a sparse p(k+1) × pk tridiagonal matrix with a bandwidth p;
+* `U`: a dense n × p(k+1) matrix;
+* `Tᴴ`: a sparse p(k+1) × pk tridiagonal matrix with a bandwidth p.
+"""
+function nonhermitian_lanczos(A, B::AbstractMatrix{FC}, C::AbstractMatrix{FC}, k::Int) where FC <: FloatOrComplex
+  m, n = size(A)
+  t, p = size(B)
+  Aᴴ = A'
+  pivoting = VERSION < v"1.9" ? Val{false}() : NoPivot()
+
+  V = zeros(FC, n, (k+1)*p)
+  U = zeros(FC, n, (k+1)*p)
+  T = zeros(FC, (k+1)*p, k*p)
+  Tᴴ = zeros(FC, (k+1)*p, k*p)
+
+  α = -one(FC)
+  β = one(FC)
+  qv = zeros(FC, n, p)
+  qu = zeros(FC, n, p)
+  Ωᵢ = zeros(FC, p, p)
+  D  = zeros(FC, p, p)
+
+  for i = 1:k
+    pos1 = (i-1)*p + 1
+    pos2 = i*p
+    pos3 = pos1 + p
+    pos4 = pos2 + p
+    vᵢ = view(V,:,pos1:pos2)
+    vᵢ₊₁ = view(V,:,pos3:pos4)
+    uᵢ = view(U,:,pos1:pos2)
+    uᵢ₊₁ = view(U,:,pos3:pos4)
+    if i == 1
+      mul!(D, C', B)  # D = Cᴴ * B
+      F = lu(D, pivoting)
+      Φᵢ, Ψᵢ = F.L, F.U
+      # Φᵢ = F.P' * Φᵢ
+      vᵢ .= (Ψᵢ' \ B')'
+      uᵢ .= (Φᵢ \ C')'
+    end
+    mul!(qv, A, vᵢ)
+    mul!(qu, Aᴴ, uᵢ)
+    if i ≥ 2
+      pos5 = pos1 - p
+      pos6 = pos2 - p
+      vᵢ₋₁ = view(V,:,pos5:pos6)
+      uᵢ₋₁ = view(U,:,pos5:pos6)
+      Φᵢ = Tᴴ[pos1:pos2,pos5:pos6]'
+      T[pos5:pos6,pos1:pos2] = Φᵢ
+      mul!(qv, vᵢ₋₁, Φᵢ, α, β)  # qv = qv - vᵢ₋₁ * Φᵢ
+      TΨᵢ = T[pos1:pos2,pos5:pos6]'
+      Tᴴ[pos5:pos6,pos1:pos2] = TΨᵢ
+      mul!(qu, uᵢ₋₁, TΨᵢ, α, β)  # qu = qu - uᵢ₋₁ * Ψᵢᴴ
+    end
+    mul!(Ωᵢ, uᵢ', qv)
+    T[pos1:pos2,pos1:pos2] .= Ωᵢ
+    Tᴴ[pos1:pos2,pos1:pos2] .= Ωᵢ'
+    mul!(qv, vᵢ, Ωᵢ, α, β)   # qv = qv - vᵢ * Ωᵢ
+    mul!(qu, uᵢ, Ωᵢ', α, β)  # qu = qu - uᵢ * Ωᵢᴴ
+    mul!(D, qu', qv)         # D = quᴴ * qv
+    F = lu(D, pivoting)
+    Φᵢ₊₁, Ψᵢ₊₁ = F.L, F.U
+    # Φᵢ₊₁ = F.P' * Φᵢ₊₁
+    vᵢ₊₁ .= (Ψᵢ₊₁' \ qv')'
+    T[pos3:pos4,pos1:pos2] .= Ψᵢ₊₁
+    uᵢ₊₁ .= (Φᵢ₊₁ \ qu')'
+    Tᴴ[pos3:pos4,pos1:pos2] .= Φᵢ₊₁'
+  end
+  return V, T, U, Tᴴ
+end
+
+"""
+    V, H = arnoldi(A, B, k; reorthogonalization=false)
+
+#### Input arguments
+
+* `A`: a linear operator that models a square matrix of dimension n;
+* `B`: a matrix of size n × p;
+* `k`: the number of iterations of the block Arnoldi process.
+
+#### Keyword arguments
+
+* `reorthogonalization`: reorthogonalize the new matrices of the block Krylov basis against all previous matrices.
+
+#### Output arguments
+
+* `V`: a dense n × p(k+1) matrix;
+* `H`: a dense p(k+1) × pk block upper Hessenberg matrix with a lower bandwidth p.
+"""
+function arnoldi(A, B::AbstractMatrix{FC}, k::Int; algo::String="mgs", reorthogonalization::Bool=false) where FC <: FloatOrComplex
+  m, n = size(A)
+  t, p = size(B)
+
+  V = zeros(FC, n, (k+1)*p)
+  H = zeros(FC, (k+1)*p, k*p)
+
+  α = -one(FC)
+  β = one(FC)
+  q = zeros(FC, n, p)
+  Ψᵢⱼ = Ψtmp = zeros(FC, p, p)
+
+  for j = 1:k
+    pos1 = (j-1)*p + 1
+    pos2 = j*p
+    vⱼ = view(V,:,pos1:pos2)
+    vⱼ₊₁ = view(V,:,pos1+p:pos2+p)
+    if j == 1
+      Q, Γ = reduced_qr(B, algo)
+      vⱼ .= Q
+    end
+    mul!(q, A, vⱼ)
+    for i = 1:j
+      pos3 = (i-1)*p + 1
+      pos4 = i*p
+      vᵢ = view(V,:,pos3:pos4)
+      mul!(Ψᵢⱼ, vᵢ', q)       # Ψᵢⱼ = vᵢᴴ * q
+      mul!(q, vᵢ, Ψᵢⱼ, α, β)  # q = q - vᵢ * Ψᵢⱼ
+      H[pos3:pos4,pos1:pos2] .= Ψᵢⱼ
+    end
+    if reorthogonalization
+      for i = 1:j
+        pos3 = (i-1)*p + 1
+        pos4 = i*p
+        vᵢ = view(V,:,pos3:pos4)
+        mul!(Ψtmp, vᵢ', q)       # Ψtmp = vᵢᴴ * q
+        mul!(q, vᵢ, Ψtmp, α, β)  # q = q - vᵢ * Ψtmp
+        H[pos3:pos4,pos1:pos2] .+= Ψtmp
+      end
+    end
+    Q, Ψ = reduced_qr(q, algo)
+    H[pos1+p:pos2+p,pos1:pos2] .= Ψ
+    vⱼ₊₁ .= Q
+  end
+  return V, H
+end
+
+
+"""
+    V, U, L = golub_kahan(A, B, k)
+
+#### Input arguments
+
+* `A`: a linear operator that models a matrix of dimension m × n;
+* `B`: a matrix of size m × p;
+* `k`: the number of iterations of the block Golub-Kahan process.
+
+#### Output arguments
+
+* `V`: a dense n × p(k+1) matrix;
+* `U`: a dense m × p(k+1) matrix;
+* `L`: a sparse p(k+1) × p(k+1) block lower bidiagonal matrix with a lower bandwidth p.
+"""
+function golub_kahan(A, B::AbstractMatrix{FC}, k::Int; algo::String="mgs") where FC <: FloatOrComplex
+  m, n = size(A)
+  t, p = size(B)
+  Aᴴ = A'
+
+  V = zeros(FC, n, (k+1)*p)
+  U = zeros(FC, m, (k+1)*p)
+  L = spzeros(FC, (k+1)*p, (k+1)*p)
+
+  α = -one(FC)
+  β = one(FC)
+  qv = zeros(FC, n, p)
+  qu = zeros(FC, m, p)
+
+  for i = 1:k
+    pos1 = (i-1)*p + 1
+    pos2 = i*p
+    pos3 = pos1 + p
+    pos4 = pos2 + p
+    vᵢ = view(V,:,pos1:pos2)
+    vᵢ₊₁ = view(V,:,pos3:pos4)
+    uᵢ = view(U,:,pos1:pos2)
+    uᵢ₊₁ = view(U,:,pos3:pos4)
+    if i == 1
+      Qu, Ψᵢ = reduced_qr(B, algo)
+      uᵢ .= Qu
+      mul!(qv, Aᴴ, uᵢ)
+      Qv, TΩᵢ = reduced_qr(qv, algo)
+      vᵢ .= Qv
+      L[pos1:pos2,pos1:pos2] .= TΩᵢ'
+    end
+    Ωᵢ = L[pos1:pos2,pos1:pos2]
+    mul!(qu, A, vᵢ)
+    mul!(qu, uᵢ, Ωᵢ, α, β)  # qu = qu - uᵢ * Ωᵢ
+    Qu, Ψᵢ₊₁ = reduced_qr(qu, algo)
+    uᵢ₊₁ .= Qu
+    L[pos3:pos4,pos1:pos2] .= Ψᵢ₊₁
+    mul!(qv, Aᴴ, uᵢ₊₁)
+    mul!(qv, vᵢ, Ψᵢ₊₁', α, β)  # qv = qv - vᵢ * Ψᵢ₊₁ᴴ
+    Qv, TΩᵢ₊₁ = reduced_qr(qv, algo)
+    vᵢ₊₁ .= Qv
+    L[pos3:pos4,pos3:pos4] .= TΩᵢ₊₁'
+  end
+  return V, U, L
+end
+
+"""
+    V, T, U, Tᴴ = saunders_simon_yip(A, B, C, k)
+
+#### Input arguments
+
+* `A`: a linear operator that models a matrix of dimension m × n;
+* `B`: a matrix of size m × p;
+* `C`: a matrix of size n × p;
+* `k`: the number of iterations of the block Saunders-Simon-Yip process.
+
+#### Output arguments
+
+* `V`: a dense m × p(k+1) matrix;
+* `T`: a sparse p(k+1) × pk tridiagonal matrix with a bandwidth p;
+* `U`: a dense n × p(k+1) matrix;
+* `Tᴴ`: a sparse p(k+1) × pk tridiagonal matrix with a bandwidth p.
+"""
+function saunders_simon_yip(A, B::AbstractMatrix{FC}, C::AbstractMatrix{FC}, k::Int; algo::String="mgs") where FC <: FloatOrComplex
+  m, n = size(A)
+  t, p = size(B)
+  Aᴴ = A'
+
+  V = zeros(FC, m, (k+1)*p)
+  U = zeros(FC, n, (k+1)*p)
+  T = zeros(FC, (k+1)*p, k*p)
+  Tᴴ = zeros(FC, (k+1)*p, k*p)
+
+  α = -one(FC)
+  β = one(FC)
+  qv = zeros(FC, m, p)
+  qu = zeros(FC, n, p)
+  Ωᵢ = zeros(FC, p, p)
+
+  for i = 1:k
+    pos1 = (i-1)*p + 1
+    pos2 = i*p
+    pos3 = pos1 + p
+    pos4 = pos2 + p
+    vᵢ = view(V,:,pos1:pos2)
+    vᵢ₊₁ = view(V,:,pos3:pos4)
+    uᵢ = view(U,:,pos1:pos2)
+    uᵢ₊₁ = view(U,:,pos3:pos4)
+    if i == 1
+      Qv, Ψᵢ = reduced_qr(B, algo)
+      vᵢ .= Qv
+      Qu, TΦᵢ = reduced_qr(C, algo)
+      uᵢ .= Qu
+    end
+    mul!(qv, A, uᵢ)
+    mul!(qu, Aᴴ, vᵢ)
+    if i ≥ 2
+      pos5 = pos1 - p
+      pos6 = pos2 - p
+      vᵢ₋₁ = view(V,:,pos5:pos6)
+      uᵢ₋₁ = view(U,:,pos5:pos6)
+      Φᵢ = Tᴴ[pos1:pos2,pos5:pos6]'
+      T[pos5:pos6,pos1:pos2] = Φᵢ
+      mul!(qv, vᵢ₋₁, Φᵢ, α, β)  # qv = qv - vᵢ₋₁ * Φᵢ
+      TΨᵢ = T[pos1:pos2,pos5:pos6]'
+      Tᴴ[pos5:pos6,pos1:pos2] = TΨᵢ
+      mul!(qu, uᵢ₋₁, TΨᵢ, α, β)  # qu = qu - uᵢ₋₁ * Ψᵢᴴ
+    end
+    mul!(Ωᵢ, vᵢ', qv)
+    T[pos1:pos2,pos1:pos2] .= Ωᵢ
+    Tᴴ[pos1:pos2,pos1:pos2] .= Ωᵢ'
+    mul!(qv, vᵢ, Ωᵢ, α, β)   # qv = qv - vᵢ * Ωᵢ
+    mul!(qu, uᵢ, Ωᵢ', α, β)  # qu = qu - uᵢ * Ωᵢᴴ
+    Qv, Ψᵢ₊₁ = reduced_qr(qv, algo)
+    vᵢ₊₁ .= Qv
+    T[pos3:pos4,pos1:pos2] .= Ψᵢ₊₁
+    Qu, TΦᵢ₊₁ = reduced_qr(qu, algo)
+    uᵢ₊₁ .= Qu
+    Tᴴ[pos3:pos4,pos1:pos2] .= TΦᵢ₊₁
+  end
+  return V, T, U, Tᴴ
+end
+
+"""
+    V, H, U, F = montoison_orban(A, B, D, C, k; reorthogonalization=false)
+
+#### Input arguments
+
+* `A`: a linear operator that models a matrix of dimension m × n;
+* `B`: a linear operator that models a matrix of dimension n × m;
+* `D`: a matrix of size m × p;
+* `C`: a matrix of size n × p;
+* `k`: the number of iterations of the block Montoison-Orban process.
+
+#### Keyword arguments
+
+* `reorthogonalization`: reorthogonalize the new matrices of the block Krylov basis against all previous matrices.
+
+#### Output arguments
+
+* `V`: a dense m × p(k+1) matrix;
+* `H`: a dense p(k+1) × pk block upper Hessenberg matrix with a lower bandwidth p;
+* `U`: a dense n × p(k+1) matrix;
+* `F`: a dense p(k+1) × pk block upper Hessenberg matrix with a lower bandwidth p.
+"""
+function montoison_orban(A, B, D::AbstractMatrix{FC}, C::AbstractMatrix{FC}, k::Int; algo::String="mgs", reorthogonalization::Bool=false) where FC <: FloatOrComplex
+  m, n = size(A)
+  t, p = size(D)
+
+  V = zeros(FC, m, (k+1)*p)
+  U = zeros(FC, n, (k+1)*p)
+  H = zeros(FC, (k+1)*p, k*p)
+  F = zeros(FC, (k+1)*p, k*p)
+
+  α = -one(FC)
+  β = one(FC)
+  qv = zeros(FC, m, p)
+  qu = zeros(FC, n, p)
+  Ψᵢⱼ = Φᵢⱼ = Ψtmp = Φtmp = zeros(FC, p, p)
+
+  for j = 1:k
+    pos1 = (j-1)*p + 1
+    pos2 = j*p
+    vⱼ = view(V,:,pos1:pos2)
+    vⱼ₊₁ = view(V,:,pos1+p:pos2+p)
+    uⱼ = view(U,:,pos1:pos2)
+    uⱼ₊₁ = view(U,:,pos1+p:pos2+p)
+    if j == 1
+      Qv, Γ = reduced_qr(D, algo)
+      vⱼ .= Qv
+      Qu, Λ = reduced_qr(C, algo)
+      uⱼ .= Qu
+    end
+    mul!(qv, A, uⱼ)
+    mul!(qu, B, vⱼ)
+    for i = 1:j
+      pos3 = (i-1)*p + 1
+      pos4 = i*p
+      vᵢ = view(V,:,pos3:pos4)
+      uᵢ = view(U,:,pos3:pos4)
+      mul!(Ψᵢⱼ, vᵢ', qv)       # Ψᵢⱼ = vᵢᴴ * qv
+      mul!(qv, vᵢ, Ψᵢⱼ, α, β)  # qv = qv - vᵢ * Ψᵢⱼ
+      H[pos3:pos4,pos1:pos2] .= Ψᵢⱼ
+      mul!(Φᵢⱼ, uᵢ', qu)       # Φᵢⱼ = uᵢᴴ * qu
+      mul!(qu, uᵢ, Φᵢⱼ, α, β)  # qu = qu - uᵢ * Φᵢⱼ
+      F[pos3:pos4,pos1:pos2] .= Φᵢⱼ
+    end
+    if reorthogonalization
+      for i = 1:j
+        pos3 = (i-1)*p + 1
+        pos4 = i*p
+        vᵢ = view(V,:,pos3:pos4)
+        uᵢ = view(U,:,pos3:pos4)
+        mul!(Ψtmp, vᵢ', qv)       # Ψtmp = vᵢᴴ * qv
+        mul!(qv, vᵢ, Ψtmp, α, β)  # qv = qv - vᵢ * Ψtmp
+        H[pos3:pos4,pos1:pos2] .+= Ψtmp
+        mul!(Φtmp, uᵢ', qu)       # Φtmp = uᵢᴴ * qu
+        mul!(qu, uᵢ, Φtmp, α, β)  # qu = qu - uᵢ * Φtmp
+        F[pos3:pos4,pos1:pos2] .+= Φtmp
+      end
+    end
+    Qv, Ψ = reduced_qr(qv, algo)
+    H[pos1+p:pos2+p,pos1:pos2] .= Ψ
+    vⱼ₊₁ .= Qv
+    Qu, Φ = reduced_qr(qu, algo)
+    F[pos1+p:pos2+p,pos1:pos2] .= Φ
+    uⱼ₊₁ .= Qu
+  end
+  return V, H, U, F
+end
diff --git a/test/test_processes.jl b/test/test_processes.jl
index 49b2bcd07..4c9646114 100644
--- a/test/test_processes.jl
+++ b/test/test_processes.jl
@@ -19,7 +19,9 @@ end
   m = 250
   n = 500
   k = 20
-  
+  p = 4
+  s = 4
+
   for FC in (Float64, ComplexF64)
     R = real(FC)
     nbits_FC = sizeof(FC)
@@ -41,6 +43,25 @@ end
         @test expected_hermitian_lanczos_bytes ≤ actual_hermitian_lanczos_bytes ≤ 1.02 * expected_hermitian_lanczos_bytes
       end
 
+      @testset "Block Hermitian Lanczos" begin
+        A = rand(FC, n, n)
+        A = A' * A
+        B = rand(FC, n, p)
+
+        @testset "algo = $algo" for algo in ("gs", "mgs", "givens")
+          V, T = hermitian_lanczos(A, B, k; algo)
+
+          @test norm(V[:,1:p*s]' * V[:,1:p*s] - I) ≤ 1e-4
+          @test A * V[:,1:p*k] ≈ V * T
+
+          # storage_block_hermitian_lanczos_bytes(n, k) = ...
+
+          # expected_block_hermitian_lanczos_bytes = storage_block_hermitian_lanczos_bytes(n, p, k)
+          # actual_block_hermitian_lanczos_bytes = @allocated hermitian_lanczos(A, B, k; algo)
+          # @test expected_block_hermitian_lanczos_bytes ≤ actual_block_hermitian_lanczos_bytes ≤ 1.02 * expected_block_hermitian_lanczos_bytes
+        end
+      end
+
       @testset "Non-Hermitian Lanczos" begin
         A, b = nonsymmetric_definite(n, FC=FC)
         c = -b
@@ -57,6 +78,25 @@ end
         @test expected_nonhermitian_lanczos_bytes ≤ actual_nonhermitian_lanczos_bytes ≤ 1.02 * expected_nonhermitian_lanczos_bytes
       end
 
+      @testset "Block Non-Hermitian Lanczos" begin
+        A = rand(FC, n, n)
+        B = rand(FC, n, p)
+        C = rand(FC, n, p)
+        V, T, U, Tᴴ = nonhermitian_lanczos(A, B, C, k)
+
+        @test norm(V[:,1:p*s]' * U[:,1:p*s] - I) ≤ 1e-4
+        @test norm(U[:,1:p*s]' * V[:,1:p*s] - I) ≤ 1e-4
+        @test T[1:k*p,1:k*p] ≈ Tᴴ[1:k*p,1:k*p]'
+        @test A  * V[:,1:k*p] ≈ V * T
+        @test A' * U[:,1:k*p] ≈ U * Tᴴ
+
+        # storage_block_nonhermitian_lanczos_bytes(n, p, k) = ...
+
+        # expected_block_nonhermitian_lanczos_bytes = storage_block_nonhermitian_lanczos_bytes(n, p, k)
+        # actual_block_nonhermitian_lanczos_bytes = @allocated nonhermitian_lanczos(A, B, C, k)
+        # @test expected_block_nonhermitian_lanczos_bytes ≤ actual_block_nonhermitian_lanczos_bytes ≤ 1.02 * expected_block_nonhermitian_lanczos_bytes
+      end
+
       @testset "Arnoldi" begin
         for reorthogonalization in (false, true)
           A, b = nonsymmetric_indefinite(n, FC=FC)
@@ -74,6 +114,29 @@ end
         end
       end
 
+      @testset "Block Arnoldi" begin
+        A = rand(FC, n, n)
+        B = rand(FC, n, p)
+
+        @testset "algo = $algo" for algo in ("gs", "mgs", "givens")
+          @testset "reorthogonalization = $reorthogonalization" for reorthogonalization in (false, true)
+            V, H = arnoldi(A, B, k; algo, reorthogonalization)
+
+            @test norm(V[:,1:p*s]' * V[:,1:p*s] - I) ≤ 1e-4
+            @test norm(V' * V - I) ≤ 1e-4
+            @test A * V[:,1:p*k] ≈ V * H
+
+            # function storage_block_arnoldi_bytes(n, p, k)
+            #   return ...
+            # end
+
+            # expected_block_arnoldi_bytes = storage_block_arnoldi_bytes(n, p, k)
+            # actual_block_arnoldi_bytes = @allocated arnoldi(A, B, k; algo, reorthogonalization)
+            # @test expected_block_arnoldi_bytes ≤ actual_block_arnoldi_bytes ≤ 1.02 * expected_block_arnoldi_bytes
+          end
+        end
+      end
+
       @testset "Golub-Kahan" begin
         A, b = under_consistent(m, n, FC=FC)
         V, U, L = golub_kahan(A, b, k)
@@ -91,6 +154,29 @@ end
         @test expected_golub_kahan_bytes ≤ actual_golub_kahan_bytes ≤ 1.02 * expected_golub_kahan_bytes
       end
 
+      @testset "Block Golub-Kahan" begin
+        A = rand(FC, m, n)
+        B = rand(FC, m, p)
+
+        @testset "algo = $algo" for algo in ("gs", "mgs", "givens")
+          V, U, L = golub_kahan(A, B, k; algo)
+          BL = L[1:(k+1)*p,1:k*p]
+
+          @test norm(V[:,1:p*s]' * V[:,1:p*s] - I) ≤ 1e-4
+          @test norm(U[:,1:p*s]' * U[:,1:p*s] - I) ≤ 1e-4
+          @test A  * V[:,1:k*p] ≈ U * BL
+          @test A' * U ≈ V * L'
+          @test A' * A  * V[:,1:k*p] ≈ V * L' * BL
+          @test A  * A' * U[:,1:k*p] ≈ U * BL * L[1:k*p,1:k*p]'
+
+          # storage_block_golub_kahan_bytes(m, n, p, k) = ...
+
+          # expected_block_golub_kahan_bytes = storage_block_golub_kahan_bytes(m, n, p, k)
+          # actual_block_golub_kahan_bytes = @allocated golub_kahan(A, B, k; algo)
+          # @test expected_block_golub_kahan_bytes ≤ actual_block_golub_kahan_bytes ≤ 1.02 * expected_block_golub_kahan_bytes
+        end
+      end
+
       @testset "Saunders-Simon-Yip" begin
         A, b = under_consistent(m, n, FC=FC)
         _, c = over_consistent(n, m, FC=FC)
@@ -117,6 +203,30 @@ end
         @test expected_saunders_simon_yip_bytes ≤ actual_saunders_simon_yip_bytes ≤ 1.02 * expected_saunders_simon_yip_bytes
       end
 
+      @testset "Block Saunders-Simon-Yip" begin
+        A = rand(FC, m, n)
+        B = rand(FC, m, p)
+        C = rand(FC, n, p)
+
+        @testset "algo = $algo" for algo in ("gs", "mgs", "givens")
+          V, T, U, Tᴴ = saunders_simon_yip(A, B, C, k; algo)
+
+          @test norm(V[:,1:p*s]' * V[:,1:p*s] - I) ≤ 1e-4
+          @test norm(U[:,1:p*s]' * U[:,1:p*s] - I) ≤ 1e-4
+          @test T[1:k*p,1:k*p] ≈ Tᴴ[1:k*p,1:k*p]'
+          @test A  * U[:,1:k*p] ≈ V * T
+          @test A' * V[:,1:k*p] ≈ U * Tᴴ
+          @test A' * A  * U[:,1:(k-1)*p] ≈ U * Tᴴ * T[1:k*p,1:(k-1)*p]
+          @test A  * A' * V[:,1:(k-1)*p] ≈ V * T * Tᴴ[1:k*p,1:(k-1)*p]
+
+          # storage_block_saunders_simon_yip_bytes(m, n, p, k) = ...
+
+          # expected_block_saunders_simon_yip_bytes = storage_block_saunders_simon_yip_bytes(m, n, p, k)
+          # actual_block_saunders_simon_yip_bytes = @allocated saunders_simon_yip(A, B, C, k; algo)
+          # @test expected_block_saunders_simon_yip_bytes ≤ actual_block_saunders_simon_yip_bytes ≤ 1.02 * expected_block_saunders_simon_yip_bytes
+        end
+      end
+
       @testset "Montoison-Orban" begin
         for reorthogonalization in (false, true)
           A, b = under_consistent(m, n, FC=FC)
@@ -145,6 +255,34 @@ end
           @test expected_montoison_orban_bytes ≤ actual_montoison_orban_bytes ≤ 1.02 * expected_montoison_orban_bytes
         end
       end
+
+      @testset "Block Montoison-Orban" begin
+        A = rand(FC, m, n)
+        B = rand(FC, n, m)
+        D = rand(FC, m, p)
+        C = rand(FC, n, p)
+
+        @testset "algo = $algo" for algo in ("gs", "mgs", "givens")
+          @testset "reorthogonalization = $reorthogonalization" for reorthogonalization in (false, true)
+            V, H, U, F = montoison_orban(A, B, D, C, k; algo, reorthogonalization)
+
+            @test norm(V[:,1:p*s]' * V[:,1:p*s] - I) ≤ 1e-4
+            @test norm(U[:,1:p*s]' * U[:,1:p*s] - I) ≤ 1e-4
+            @test A * U[:,1:k*p] ≈ V * H
+            @test B * V[:,1:k*p] ≈ U * F
+            @test B * A * U[:,1:(k-1)*p] ≈ U * F * H[1:k*p,1:(k-1)*p]
+            @test A * B * V[:,1:(k-1)*p] ≈ V * H * F[1:k*p,1:(k-1)*p]
+
+            # function storage_block_montoison_orban_bytes(m, n, p, k)
+            #   return ...
+            # end
+
+            # expected_block_montoison_orban_bytes = storage_block_montoison_orban_bytes(m, n, p, k)
+            # actual_block_montoison_orban_bytes = @allocated montoison_orban(A, B, D, C, k; algo, reorthogonalization)
+            # @test expected_block_montoison_orban_bytes ≤ actual_block_montoison_orban_bytes ≤ 1.02 * expected_block_montoison_orban_bytes
+          end
+        end
+      end
     end
   end
 end