diff --git a/doc/vector-extra/Makefile b/doc/vector-extra/Makefile
new file mode 100644
index 00000000..3dd87daa
--- /dev/null
+++ b/doc/vector-extra/Makefile
@@ -0,0 +1,67 @@
+# Makefile for RISC-V Doc Template
+#
+# This work is licensed under the Creative Commons Attribution-ShareAlike 4.0
+# International License. To view a copy of this license, visit
+# http://creativecommons.org/licenses/by-sa/4.0/ or send a letter to
+# Creative Commons, PO Box 1866, Mountain View, CA 94042, USA.
+#
+# SPDX-License-Identifier: CC-BY-SA-4.0
+#
+# Description:
+#
+# This Makefile is designed to automate the process of building and packaging
+# the Doc Template for RISC-V Extensions.
+
+# Mount the directory make runs in at /build (CURDIR stays correct under "make -C", PWD does not).
+DOCKER_RUN := docker run --rm -v "$(CURDIR)":/build -w /build \
+riscvintl/riscv-docs-base-container-image:latest
+HEADER_SOURCE := riscv-crypto-spec-vector-extra.adoc
+PDF_RESULT := riscv-crypto-spec-vector-extra.pdf
+# Generated include file that receives the branch name and commit hash.
+SPEC_COMMIT := git-commit.adoc
+ASCIIDOCTOR_PDF := asciidoctor-pdf
+OPTIONS := --trace -a compress \
+           -a mathematical-format=svg \
+           -a pdf-fontsdir=resources/fonts \
+           -a pdf-style=resources/themes/risc-v_spec-pdf.yml \
+           -a toc --failure-level=ERROR
+REQUIRES := --require=asciidoctor-bibtex \
+            --require=asciidoctor-diagram \
+            --require=asciidoctor-mathematical
+
+.PHONY: all build clean build-container build-no-container cp_bib
+
+all: build
+
+cp_bib:  # Copy the shared bibliography into this directory; phony, so it is re-copied on every build.
+	@cp ../riscv-crypto-spec.bib ./
+
+$(SPEC_COMMIT):  # Writes the current branch name, an "@" line and the HEAD commit hash into the target file.
+	@git rev-parse --abbrev-ref HEAD > $@
+	@echo "@" >> $@
+	@git log -n 1 --pretty=format:'%H' >> $@
+
+build: cp_bib $(SPEC_COMMIT)  # Build in Docker when its CLI is on PATH (POSIX-sh test, no bash-only "&>"), else natively.
+	@echo "Checking if Docker is available..."
+	@if command -v docker > /dev/null 2>&1 ; then \
+		echo "Docker is available, building inside Docker container..."; \
+		$(MAKE) build-container; \
+	else \
+		echo "Docker is not available, building without Docker..."; \
+		$(MAKE) build-no-container; \
+	fi
+
+build-container:  # Runs asciidoctor-pdf through $(DOCKER_RUN), i.e. inside the docs container with a host directory mounted at /build.
+	@echo "Starting build inside Docker container..."
+	$(DOCKER_RUN) /bin/sh -c "$(ASCIIDOCTOR_PDF) $(OPTIONS) $(REQUIRES) --out-file=$(PDF_RESULT) $(HEADER_SOURCE)"
+	@echo "Build completed successfully inside Docker container."
+
+build-no-container:  # Runs asciidoctor-pdf directly on the host; it and the libraries named in REQUIRES must be installed.
+	@echo "Starting build..."
+	$(ASCIIDOCTOR_PDF) $(OPTIONS) $(REQUIRES) --out-file=$(PDF_RESULT) $(HEADER_SOURCE)
+	@echo "Build completed successfully."
+
+clean:  # Remove what this Makefile generates: the PDF, the commit stamp (so it is refreshed) and the copied bibliography.
+	@echo "Cleaning up generated files..."
+	rm -f $(PDF_RESULT) $(SPEC_COMMIT) ./riscv-crypto-spec.bib
+	@echo "Cleanup completed."
diff --git a/doc/vector-extra/images/risc-v_logo.png b/doc/vector-extra/images/risc-v_logo.png
new file mode 100644
index 00000000..d754746e
Binary files /dev/null and b/doc/vector-extra/images/risc-v_logo.png differ
diff --git a/doc/vector-extra/insns/vclmul-32e.adoc b/doc/vector-extra/insns/vclmul-32e.adoc
new file mode 100644
index 00000000..7a47de1c
--- /dev/null
+++ b/doc/vector-extra/insns/vclmul-32e.adoc
@@ -0,0 +1,104 @@
+[[insns-vclmul-32e, Vector Carry-less Multiply]]
+= vclmul.[vv,vx]
+
+Synopsis::
+Vector Carry-less Multiply by vector or scalar - returning low half of product.
+
+Mnemonic::
+vclmul.vv vd, vs2, vs1, vm +
+vclmul.vx vd, vs2, rs1, vm
+
+Encoding (Vector-Vector)::
+[wavedrom, , svg]
+....
+{reg:[
+{bits: 7, name: 'OP-V'},
+{bits: 5, name: 'vd'},
+{bits: 3, name: 'OPMVV'},
+{bits: 5, name: 'vs1'},
+{bits: 5, name: 'vs2'},
+{bits: 1, name: 'vm'},
+{bits: 6, name: '001100'},
+]}
+....
+
+Encoding (Vector-Scalar)::
+[wavedrom, , svg]
+....
+{reg:[
+{bits: 7, name: 'OP-V'},
+{bits: 5, name: 'vd'},
+{bits: 3, name: 'OPMVX'},
+{bits: 5, name: 'rs1'},
+{bits: 5, name: 'vs2'},
+{bits: 1, name: 'vm'},
+{bits: 6, name: '001100'},
+]}
+....
+Reserved Encodings::
+* `SEW` is any value other than 32 (`Zvbc32e` only)
+* `SEW` is any value other than 64 (`Zvbc` only)
+* `SEW` is any value other than 32 or 64 (`Zvbc` and `Zvbc32e`)
+
+Arguments::
+
+[%autowidth]
+[%header,cols="4,2,2"]
+|===
+|Register
+|Direction
+|Definition
+
+| `vs1`/`rs1` | input  | multiplier
+| `vs2`       | input  | multiplicand
+| `vd`        | output | lower part of carry-less multiply 
+|===
+
+[NOTE]
+====
+The `vclmul` instruction was initially defined in `Zvbc` with support for `SEW=64` only; this page describes how the specification is extended in `Zvbc32e` to support `SEW=32`.
+====
+
+Description::
+Produces the low half of `2*SEW`-bit carry-less product.
+
+Each SEW-bit element in the `vs2` vector register is carry-less multiplied by
+either each SEW-bit element in `vs1` (vector-vector), or the SEW-bit value
+from integer register `rs1` (vector-scalar). The result is the least
+significant SEW bits of the carry-less product.
+
+[NOTE]
+====
+The 32-bit carry-less multiply instructions can be used for implementing GCM in the absence of the `Zvkg` extension,
+in particular for implementations with `ELEN=32` where `Zvkg` cannot be implemented.
+They can also be used to speed up CRC evaluation.
+====
+
+Operation::
+[source,sail]
+--
+
+
+function clause execute (VCLMUL(vs2, vs1, vd, suffix)) = {
+  // For every element index i from vstart to vl-1: vd[i] = clmul(op1, vs2[i], SEW).
+  foreach (i from vstart to vl-1) {
+    let op1 : bits (SEW) = if suffix =="vv" then get_velem(vs1, i)
+                          else zext_or_truncate_to_sew(X(vs1));  // non-"vv" form: the vs1 field selects an X register
+    let op2 : bits (SEW) = get_velem(vs2, i);
+    let product : bits (SEW) = clmul(op1, op2, SEW);
+    set_velem(vd, i, product);
+  }
+  RETIRE_SUCCESS
+}
+
+function clmul(x, y, width) = {  // Carry-less multiply of x by y, accumulated in a width-bit result.
+  let result : bits(width) = zeros();
+  foreach (i from 0 to (width - 1)) {
+    if y[i] == 1 then result = result ^ (x << i);  // XOR in x shifted left by i for every set bit i of y
+  }
+  result
+}
+--
+
+Included in::
+<<zvbc32e>>, Zvbc
diff --git a/doc/vector-extra/insns/vclmulh-32e.adoc b/doc/vector-extra/insns/vclmulh-32e.adoc
new file mode 100644
index 00000000..e8fa6cbe
--- /dev/null
+++ b/doc/vector-extra/insns/vclmulh-32e.adoc
@@ -0,0 +1,99 @@
+[[insns-vclmulh-32e, Vector Carry-less Multiply Return High Half]]
+= vclmulh.[vv,vx]
+
+Synopsis::
+Vector Carry-less Multiply by vector or scalar - returning high half of product.
+
+Mnemonic::
+vclmulh.vv vd, vs2, vs1, vm +
+vclmulh.vx vd, vs2, rs1, vm
+
+Encoding (Vector-Vector)::
+[wavedrom, , svg]
+....
+{reg:[
+{bits: 7, name: 'OP-V'},
+{bits: 5, name: 'vd'},
+{bits: 3, name: 'OPMVV'},
+{bits: 5, name: 'vs1'},
+{bits: 5, name: 'vs2'},
+{bits: 1, name: 'vm'},
+{bits: 6, name: '001101'},
+]}
+....
+
+Encoding (Vector-Scalar)::
+[wavedrom, , svg]
+....
+{reg:[
+{bits: 7, name: 'OP-V'},
+{bits: 5, name: 'vd'},
+{bits: 3, name: 'OPMVX'},
+{bits: 5, name: 'rs1'},
+{bits: 5, name: 'vs2'},
+{bits: 1, name: 'vm'},
+{bits: 6, name: '001101'},
+]}
+....
+Reserved Encodings::
+* `SEW` is any value other than 64 (`Zvbc` only)
+* `SEW` is any value other than 32 (`Zvbc32e` only)
+* `SEW` is any value other than 32 or 64 (`Zvbc32e` and `Zvbc`)
+
+Arguments::
+
+[%autowidth]
+[%header,cols="4,2,2"]
+|===
+|Register
+|Direction
+|Definition
+
+| `vs1`/`rs1` | input  | multiplier
+| `vs2`       | input  | multiplicand
+| `vd`        | output | upper part of carry-less multiply 
+|===
+
+[NOTE]
+====
+The `vclmulh` instruction was initially defined in `Zvbc`; this page describes how the specification is extended in `Zvbc32e` to support `SEW=32`.
+====
+
+Description::
+Produces the high half of `2*SEW`-bit carry-less product.
+
+Each SEW-bit element in the `vs2` vector register is carry-less multiplied by
+either each SEW-bit element in `vs1` (vector-vector), or the SEW-bit value
+from integer register `rs1` (vector-scalar). The result is the most
+significant SEW bits of the carry-less product.
+
+// This instruction must always be implemented such that its execution latency does not depend
+// on the data being operated upon.
+
+Operation::
+[source,sail]
+--
+function clause execute (VCLMULH(vs2, vs1, vd, suffix)) = {
+  // For every element index i from vstart to vl-1: vd[i] = clmulh(op1, vs2[i], SEW).
+  foreach (i from vstart to vl-1) {
+    let op1 : bits (SEW) = if suffix =="vv" then get_velem(vs1,i)
+                          else zext_or_truncate_to_sew(X(vs1));  // non-"vv" form: the vs1 field selects an X register
+    let op2 : bits (SEW) = get_velem(vs2, i);
+    let product : bits (SEW) = clmulh(op1, op2, SEW);
+    set_velem(vd, i, product);
+  }
+  RETIRE_SUCCESS
+}
+
+function clmulh(x, y, width) = {  // High `width` bits of the 2*width-bit carry-less product of x and y.
+  let result : bits(width) = zeros();  // bit-vector zero, matching clmul (an integer 0 is not a bits(width) value)
+  foreach (i from 1 to (width - 1)) {  // starts at 1: bit 0 of y contributes nothing to the high half
+    if y[i] == 1 then result = result ^ (x >> (width - i));  // the part of (x << i) at or above bit `width`
+  }
+  result
+}
+
+--
+
+Included in::
+<<zvbc32e>>, Zvbc
diff --git a/doc/vector-extra/insns/vghsh-vs.adoc b/doc/vector-extra/insns/vghsh-vs.adoc
new file mode 100644
index 00000000..fcd9d533
--- /dev/null
+++ b/doc/vector-extra/insns/vghsh-vs.adoc
@@ -0,0 +1,136 @@
+[[insns-vghsh-vs, Vector-Scalar GHASH Add-Multiply]]
+= vghsh.vs
+
+Synopsis::
+Vector-Scalar Add-Multiply over GHASH Galois-Field
+
+Mnemonic::
+vghsh.vs vd, vs2, vs1
+
+
+// This might be the first instruction with 3 operands and .vs
+// need to find an encoding
+Encoding (Vector-Scalar)::
+[wavedrom, , svg]
+....
+{reg:[
+{bits: 7, name: 'OP-P'},
+{bits: 5, name: 'vd'},
+{bits: 3, name: 'OPMVV'},
+{bits: 5, name: 'vs1'},
+{bits: 5, name: 'vs2'},
+{bits: 1, name: '1'},
+{bits: 6, name: '100011'},
+]}
+....
+
+Reserved Encodings::
+* `SEW` is any value other than 32
+
+Arguments::
+
+[%autowidth]
+[%header,cols="4,2,2,2,2,2"]
+|===
+|Register
+|Direction
+|EGW
+|EGS
+|SEW
+|Definition
+
+| `vd`  | input  | 128  | 4 | 32 | Partial hash (Y~i~)
+| `vs1` | input  | 128  | 4 | 32 | Cipher text (X~i~)
+| `vs2` | input  | 128  | 4 | 32 | Hash Subkey (H)
+| `vd`  | output | 128  | 4 | 32 | Partial-hash (Y~i+1~)
+|===
+
+Description::
+A single "iteration" of the GHASH~H~ algorithm is performed.
+
+
+The previous partial hashes are read as 4-element groups from `vd`,
+the cipher texts are read as 4-element groups from `vs1`,
+and the hash subkey is read from the scalar element group in `vs2`.
+The resulting partial hashes are written as 4-element groups into `vd`.
+
+
+// The following is copied from vghsh.vv and could be omitted
+// (replaced with a link to the original specification)
+
+This instruction treats all of the input and output element groups as 128-bit polynomials and
+performs operations over GF[2].
+It produces the next partial hash (Y~i+1~) by adding the current partial
+hash (Y~i~) to the cipher text block (X~i~) and then multiplying (over GF(2^128^))
+this sum by the Hash Subkey (H).
+
+The multiplication over GF(2^128^) is a carryless multiply of two 128-bit polynomials
+modulo GHASH's irreducible polynomial (x^128^ + x^7^ + x^2^ + x + 1).
+
+The operation can be compactly defined as
+// Y~i+1~ = (Y~i~ &#183; H) ^ X~i~
+Y~i+1~ = ((Y~i~ ^ X~i~) &#183; H)
+
+The NIST specification (see <<zvkg>>) orders the coefficients from left to right x~0~x~1~x~2~...x~127~
+for a polynomial x~0~ + x~1~u +x~2~ u^2^ + ... + x~127~u^127^. This can be viewed as a collection of
+byte elements in memory with the byte containing the lowest coefficients (i.e., 0,1,2,3,4,5,6,7)
+residing at the lowest memory address. Since the bits in the bytes are reversed,
+this instruction internally performs bit swaps within bytes to put the bits in the standard ordering
+(e.g., 7,6,5,4,3,2,1,0).
+
+This instruction must always be implemented such that its execution latency does not depend
+on the data being operated upon.
+
+[NOTE]
+====
+We are bit-reversing the bytes of inputs and outputs so that the intermediate values are consistent
+with the NIST specification. These reversals are inexpensive to implement as they unconditionally
+swap bit positions and therefore do not require any logic.
+====
+
+
+Operation::
+[source,pseudocode]
+--
+function clause execute (VGHSHVS(vs2, vs1, vd)) = {
+  // operands are input with bits reversed in each byte
+  if(LMUL*VLEN < EGW)  then {
+    handle_illegal();  // illegal instruction exception
+    RETIRE_FAIL
+  } else {
+  // For every element group i: vd[i] = brev8((brev8(vd[i] ^ vs1[i])) * H), reduced by 0x87 on overflow.
+  eg_len = (vl/EGS)
+  eg_start = (vstart/EGS)
+
+  // H is common to all element groups
+  let helem = 0;
+  let H = brev8(get_velem(vs2, EGW=128, helem)); // Hash subkey
+
+  foreach (i from eg_start to eg_len-1) {
+    let Y = get_velem(vd,EGW=128,i);  // current partial-hash
+    let X = get_velem(vs1,EGW=128,i);  // block cipher output
+
+    let Z : bits(128) = 0;
+    let Hshift = H; // per-group working copy: shifted below, so H stays intact for the next group
+    let S = brev8(Y ^ X);
+
+    for (int bit = 0; bit < 128; bit++) {
+      if bit_to_bool(S[bit])
+        Z ^= Hshift
+
+      bool reduce = bit_to_bool(Hshift[127]);
+      Hshift = Hshift << 1; // left shift the working copy by 1
+      if (reduce)
+        Hshift ^= 0x87; // Reduce using x^7 + x^2 + x^1 + 1 polynomial
+    }
+
+    let result = brev8(Z); // bit reverse bytes to get back to GCM standard ordering
+    set_velem(vd, EGW=128, i, result);
+  }
+  RETIRE_SUCCESS
+ }
+}
+--
+
+Included in::
+<<zvkgs>>
diff --git a/doc/vector-extra/insns/vgmul-vs.adoc b/doc/vector-extra/insns/vgmul-vs.adoc
new file mode 100644
index 00000000..622badd1
--- /dev/null
+++ b/doc/vector-extra/insns/vgmul-vs.adoc
@@ -0,0 +1,129 @@
+[[insns-vgmul-vs, Vector-Scalar GHASH Multiply]]
+= vgmul.vs
+
+Synopsis::
+Vector-Scalar Multiply over GHASH Galois-Field
+
+Mnemonic::
+vgmul.vs vd, vs2
+
+
+Encoding (Vector-Scalar)::
+[wavedrom, , svg]
+....
+{reg:[
+{bits: 7, name: 'OP-P'},
+{bits: 5, name: 'vd'},
+{bits: 3, name: 'OPMVV'},
+{bits: 5, name: '10001'},
+{bits: 5, name: 'vs2'},
+{bits: 1, name: '1'},
+{bits: 6, name: '101001'},
+]}
+....
+
+Reserved Encodings::
+* `SEW` is any value other than 32
+
+Arguments::
+
+[%autowidth]
+[%header,cols="4,2,2,2,2,2"]
+|===
+|Register
+|Direction
+|EGW
+|EGS
+|SEW
+|Definition
+
+| `vd`  | input  | 128  | 4 | 32 | Multiplier
+| `vs2` | input  | 128  | 4 | 32 | Multiplicand
+| `vd`  | output | 128  | 4 | 32 | Product
+|===
+
+Description::
+A GHASH~H~ multiply is performed.
+
+The multipliers are read as 4-element groups from `vd`,
+and the multiplicand is read from the scalar element group in `vs2`.
+The resulting products are written as 4-element groups into `vd`.
+
+This instruction treats all of the inputs and outputs as 128-bit polynomials and
+performs operations over GF[2].
+It produces the product over GF(2^128^) of the two 128-bit inputs.
+
+The multiplication over GF(2^128^) is a carryless multiply of two 128-bit polynomials
+modulo GHASH's irreducible polynomial (x^128^ + x^7^ + x^2^ + x + 1).
+
+The NIST specification (see <<zvkg>>) orders the coefficients from left to right x~0~x~1~x~2~...x~127~
+for a polynomial x~0~ + x~1~u +x~2~ u^2^ + ... + x~127~u^127^. This can be viewed as a collection of
+byte elements in memory with the byte containing the lowest coefficients (i.e., 0,1,2,3,4,5,6,7)
+residing at the lowest memory address. Since the bits in the bytes are reversed, 
+this instruction internally performs bit swaps within bytes to put the bits in the standard ordering
+(e.g., 7,6,5,4,3,2,1,0).
+
+This instruction must always be implemented such that its execution latency does not depend
+on the data being operated upon.
+
+[NOTE]
+====
+We are bit-reversing the bytes of inputs and outputs so that the intermediate values are consistent
+with the NIST specification. These reversals are inexpensive to implement as they unconditionally
+swap bit positions and therefore do not require any logic.
+====
+
+
+[NOTE]
+====
+Similarly to how the instruction `vgmul.vv` is identical to `vghsh.vv` with the value
+of vs1 register being 0, the instruction `vgmul.vs` is identical to `vghsh.vs` with the value of vs1 being 0.
+This instruction is often used in GHASH code. In some cases it is followed
+by an XOR to perform a multiply-add. Implementations may choose to fuse these
+two instructions to improve performance on GHASH code that
+doesn't use the add-multiply form of the `vghsh.vv` instruction.
+
+====
+
+
+Operation::
+[source,pseudocode]
+--
+function clause execute (VGMUL(vs2, vs1, vd, suffix)) = {
+  // operands are input with bits reversed in each byte
+  if(LMUL*VLEN < EGW)  then {
+    handle_illegal();  // illegal instruction exception
+    RETIRE_FAIL
+  } else {
+  // For every element group i: vd[i] = brev8(brev8(vd[i]) * H), reduced by 0x87 on overflow.
+  eg_len = (vl/EGS)
+  eg_start = (vstart/EGS)
+  // H multiplicand is common for all loop iterations
+  let helem = 0;
+  let H = brev8(get_velem(vs2,EGW=128, helem)); // Multiplicand
+
+  foreach (i from eg_start to eg_len-1) {
+    let Y = brev8(get_velem(vd,EGW=128,i));  // Multiplier
+    let Z : bits(128) = 0;
+    let Hshift = H; // per-group working copy: shifted below, so H stays intact for the next group
+
+    for (int bit = 0; bit < 128; bit++) {
+      if bit_to_bool(Y[bit])
+        Z ^= Hshift
+
+      bool reduce = bit_to_bool(Hshift[127]);
+      Hshift = Hshift << 1; // left shift the working copy by 1
+      if (reduce)
+        Hshift ^= 0x87; // Reduce using x^7 + x^2 + x^1 + 1 polynomial
+    }
+
+    let result = brev8(Z);
+    set_velem(vd, EGW=128, i, result);
+  }
+  RETIRE_SUCCESS
+ }
+}
+--
+
+Included in::
+<<zvkgs>>
diff --git a/doc/vector-extra/riscv-crypto-spec-vector-extra.adoc b/doc/vector-extra/riscv-crypto-spec-vector-extra.adoc
new file mode 100644
index 00000000..33fec430
--- /dev/null
+++ b/doc/vector-extra/riscv-crypto-spec-vector-extra.adoc
@@ -0,0 +1,166 @@
+[[riscv-doc-template]]
+= RISC-V Cryptography Extensions Volume III: Additional Vector Instructions
+:description: The additional vector cryptography extensions for the RISC-V ISA.
+:company: RISC-V.org
+:revdate: March 7th 2024
+:revnumber: v0.0.5
+:revremark:
+:url-riscv: http://riscv.org
+:doctype: book
+//:doctype: report
+:preface-title: Preamble
+:colophon:
+:appendix-caption: Appendix
+:imagesdir: images
+:title-logo-image: image:risc-v_logo.png[pdfwidth=3.25in,align=center]
+//:page-background-image: image:draft.svg[opacity=20%]
+//:title-page-background-image: none
+//:back-cover-image: image:circuit.png[opacity=25%]
+// Settings:
+:experimental:
+:reproducible:
+// needs to be changed? bug discussion started
+:WaveDromEditorApp: wavedrom-cli
+:imagesoutdir: images
+:icons: font
+:lang: en
+:listing-caption: Listing
+:sectnums:
+:toc: left
+:toclevels: 4
+:source-highlighter: pygments
+ifdef::backend-pdf[]
+:source-highlighter: coderay
+endif::[]
+:data-uri:
+:hide-uri-scheme:
+:stem: latexmath
+:footnote:
+:xrefstyle: short
+:bibtex-file: riscv-crypto-spec.bib
+:bibtex-order: alphabetical
+:bibtex-style: ieee
+
+//:This is the preamble.
+
+[colophon]
+= Colophon
+
+This document describes additional Vector Cryptography extensions to the
+RISC-V Instruction Set Architecture.
+
+This document is a _Discussion Document_.
+Assume everything can change.
+This document is not complete yet and was created only for the purpose of conversation outside of the document.
+For more information, see link:http://riscv.org/spec-state[here].
+
+[NOTE]
+.Copyright and licensure:
+This work is licensed under a
+link:http://creativecommons.org/licenses/by/4.0/[Creative Commons Attribution 4.0 International License]
+
+[NOTE]
+.Document Version Information:
+====
+include::git-commit.adoc[]
+
+See link:https://github.com/riscv/riscv-crypto/doc/vector-extra[github.com/riscv/riscv-crypto/doc/vector-extra]
+for more information.
+====
+
+[acknowledgments]
+== Acknowledgments
+
+Contributors to this specification (in alphabetical order)
+include: +
+Eric Biggers,
+Ken Dockser,
+Liana Koleva,
+Markku-Juhani O. Saarinen,
+Nicolas Brunie,
+Richard Newell
+
+We are all very grateful to the many other people who have
+helped to improve this specification through their comments, reviews,
+feedback and questions.
+
+// ------------------------------------------------------------
+
+include::riscv-crypto-vector-extra-introduction.adoc[]
+
+// ------------------------------------------------------------
+
+<<<
+// ------------------------------------------------------------
+
+
+[[crypto_vector_extensions]]
+== Extensions Overview
+
+This section introduces all of the extensions in the Additional Vector Cryptography
+Instruction Set Extension Specification.
+
+
+All the Additional Vector Crypto Extensions can be built
+on _any_ embedded (Zve*) or application ("V") base Vector Extension.
+In particular `Zvbc32e` allows `Zve32*` implementations to support vector carry-less multiplication.
+
+// See <<crypto-vector-element-groups>> for more details on vector element groups and the drawbacks of
+// small `VLEN` values.
+
+
+As the instructions defined in this specification might be used to implement cryptographic primitives,
+they may be implemented with data-independent execution latencies as
+defined in the
+link:https://github.com/riscv/riscv-crypto/releases/tag/v1.0.1-scalar[RISC-V Scalar Cryptography Extensions specification].
+
+If `Zvkt` is implemented, all the instructions from `Zvbc32e` (`vclmul[h].[vv,vx]`) 
+shall be executed with data-independent execution latency.
+
+Whether `Zvkt` is implemented or not, all instructions from `Zvkgs` (`vgmul.vs`, `vghsh.vs`)
+shall be executed with data-independent execution latency.
+
+
+Detection of individual cryptography extensions uses the
+unified software-based RISC-V discovery method.
+
+[NOTE]
+====
+At the time of writing, these discovery mechanisms are still a work in
+progress.
+====
+
+include::./riscv-crypto-vector-extra-zvbc32e.adoc[]
+<<<
+include::./riscv-crypto-vector-extra-zvkgs.adoc[]
+<<<
+
+
+
+// ------------------------------------------------------------
+
+[[crypto_vector_extra_insns, reftext="Additional Vector Cryptography Instructions"]]
+== Instructions
+
+
+include::insns/vclmul-32e.adoc[leveloffset=+2]
+<<<
+include::insns/vclmulh-32e.adoc[leveloffset=+2]
+<<<
+include::insns/vghsh-vs.adoc[leveloffset=+2]
+<<<
+include::insns/vgmul-vs.adoc[leveloffset=+2]
+<<<
+
+[[bibliography]]
+== Bibliography
+
+bibliography::../riscv-crypto-spec.bib[ieee]
+
+[[Encodings]]
+== Encodings
+include::./riscv-crypto-vector-extra-inst-table-zvbc32e.adoc[]
+
+include::./riscv-crypto-vector-extra-inst-table.adoc[]
+
+
diff --git a/doc/vector-extra/riscv-crypto-vector-extra-inst-table.adoc b/doc/vector-extra/riscv-crypto-vector-extra-inst-table.adoc
new file mode 100644
index 00000000..b1439419
--- /dev/null
+++ b/doc/vector-extra/riscv-crypto-vector-extra-inst-table.adoc
@@ -0,0 +1,59 @@
+[appendix]
+[[crypto_vector_instructions_Zvkgs]]
+=== Additional Vector Cryptographic Instructions
+
+OP-P (0x77)
+Vector Crypto instructions, including `Zvkgs`, except `Zvbb` and `Zvbc`.
+The new/modified encodings are in bold.
+
+// [cols="4,1,1,1,8,4,1,1,8,4,1,1,8"]
+[cols="4,1,1,1,1,4,1,1,1,4,1,1,1"]
+|===
+5+^|Integer 4+^|Integer 4+^| FP
+
+| funct3 | | | |            | funct3 | | |             | funct3 | | |
+| OPIVV  |V| | |            | OPMVV  |V| |             | OPFVV  |V| |
+| OPIVX  | |X| |            | OPMVX  | |X|             | OPFVF  | |F|
+| OPIVI  | | |I|            |        | | |             |        | | |
+|===
+
+// [cols="4,1,1,1,8,4,1,1,8,4,1,1,8"]
+[cols="6,1,1,1,1,6,1,1,6,6,1,1,1"]
+
+|===
+5+^| funct6                  4+^| funct6                 4+^| funct6
+
+|100000  | | | |            | 100000 |V| | vsm3me       | 100000 | | |
+| 100001 | | | |            | 100001 |V| | vsm4k.vi     | 100001 | | |
+| 100010 | | | |            | 100010 |V| | vaesfk1.vi   | 100010 | | |
+| 100011 | | | |            | 100011 |V| | __**vghsh.vs**__ | 100011 | | |
+| 100100 | | | |            | 100100 | | |              | 100100 | | |
+| 100101 | | | |            | 100101 | | |              | 100101 | | |
+| 100110 | | | |            | 100110 | | |              | 100110 | | |
+| 100111 | | | |            | 100111 | | |              | 100111 | | |
+|        | | | |            |        | | |              |        | | |
+| 101000 | | | |            | 101000 |V| | VAES.vv    | 101000 | | |
+| 101001 | | | |            | 101001 |V| | *VAES.vs*    | 101001 | | |
+| 101010 | | | |            | 101010 |V| | vaesfk2.vi   | 101010 | | |
+| 101011 | | | |            | 101011 |V| | vsm3c.vi     | 101011 | | |
+| 101100 | | | |            | 101100 |V| | vghsh        | 101100 | | |
+| 101101 | | | |            | 101101 |V| | vsha2ms      | 101101 | | |
+| 101110 | | | |            | 101110 |V| | vsha2ch      | 101110 | | |
+| 101111 | | | |            | 101111 |V| | vsha2cl      | 101111 | | |
+|===
+
+<<<
+
+.VAES.vv and VAES.vs encoding space
+[cols="2,14"]
+|===
+|vs1|
+
+| 00000 | vaesdm
+| 00001 | vaesdf
+| 00010 | vaesem
+| 00011 | vaesef
+| 00111 | vaesz
+| 10000 | vsm4r
+| 10001 | __**vgmul**__
+|===
diff --git a/doc/vector-extra/riscv-crypto-vector-extra-introduction.adoc b/doc/vector-extra/riscv-crypto-vector-extra-introduction.adoc
new file mode 100644
index 00000000..6a516729
--- /dev/null
+++ b/doc/vector-extra/riscv-crypto-vector-extra-introduction.adoc
@@ -0,0 +1,11 @@
+[[crypto_vector_introduction]]
+== Introduction
+
+This document describes the proposed _additional_ _vector_ cryptography
+extensions for RISC-V.
+Those extensions extend the _vector_ cryptography extensions for RISC-V,
+providing additional features.
+Those extensions aim at either enabling some use cases (e.g. carry-less multiply on 32-bit vector implementations)
+or enabling more efficient implementations of some algorithms (e.g. CRC, AES-GCM).
+All instructions proposed here are based on the Vector registers.
+
diff --git a/doc/vector-extra/riscv-crypto-vector-extra-zvbc32e.adoc b/doc/vector-extra/riscv-crypto-vector-extra-zvbc32e.adoc
new file mode 100644
index 00000000..9cf42177
--- /dev/null
+++ b/doc/vector-extra/riscv-crypto-vector-extra-zvbc32e.adoc
@@ -0,0 +1,23 @@
+[[zvbc32e,Zvbc32e]]
+=== `Zvbc32e` - Vector Carryless Multiplication
+
+General purpose carryless multiplication instructions which are commonly used in cryptography
+and hashing (e.g., Elliptic curve cryptography, GHASH, CRC).
+
+These instructions are only defined for `SEW`=32.
+Zvbc32e can be supported when `ELEN >=32`.
+
+
+Note:: The extension `Zvbc32e` is independent from `Zvbc` which defines the same instructions for `SEW=64`.
+       When `ELEN>=64` both extensions can be combined to have `vclmul.v[vx]` and `vclmulh.v[vx]` defined for both `SEW=32` and `SEW=64`.
+
+[%autowidth]
+[%header,cols="^2,4"]
+|===
+|Mnemonic
+|Instruction
+| `vclmul.[vv,vx]`     | <<insns-vclmul-32e>>
+| `vclmulh.[vv,vx]`    | <<insns-vclmulh-32e>>
+
+|===
+
diff --git a/doc/vector-extra/riscv-crypto-vector-extra-zvkgs.adoc b/doc/vector-extra/riscv-crypto-vector-extra-zvkgs.adoc
new file mode 100644
index 00000000..99155dc5
--- /dev/null
+++ b/doc/vector-extra/riscv-crypto-vector-extra-zvkgs.adoc
@@ -0,0 +1,36 @@
+[[zvkgs,Zvkgs]]
+=== `Zvkgs` - Vector-Scalar GCM/GMAC
+
+Instructions to enable the efficient implementation of parallel versions of GHASH~H~ which is used in Galois/Counter Mode (GCM) and
+Galois Message Authentication Code (GMAC).
+
+`Zvkgs` depends on `Zvkg`. It extends the existing `vghsh.vv` and `vgmul.vv` instructions with new vector-scalar variants: `vghsh.vs` and `vgmul.vs`.
+
+The instructions inherit the constraints defined in `Zvkg`:
+
+- element group size (EGS) is 4
+- data independent execution timing
+- `vl`/`vstart` must be multiples of EGS=4
+
+All of these instructions work on 128-bit element groups comprised of four 32-bit elements, in element group parlance `EGS=4`, `EGW=128` and the instructions are only defined for `SEW=32`.
+
+To help avoid side-channel timing attacks, these instructions shall always be implemented with data-independent timing.
+
+The number of element groups to be processed is `vl`/`EGS`.
+`vl` must be set to the number of `SEW=32` elements to be processed and
+therefore must be a multiple of `EGS=4`. +
+Likewise, `vstart` must be a multiple of `EGS=4`.
+
+[%autowidth]
+[%header,cols="^2,4,4,4"]
+|===
+
+|SEW
+|EGW
+|Mnemonic
+|Instruction
+| 32 | 128 | `vghsh.vs` | <<insns-vghsh-vs>>
+| 32 | 128 | `vgmul.vs` | <<insns-vgmul-vs>>
+
+|===
+