OP-TEE · jforissier · Nov 20, 2017 · Nov 16, 2017 · Nov 16, 2017 · Nov 16, 2017
diff --git a/core/arch/arm/arm.mk b/core/arch/arm/arm.mk
@@ -52,7 +52,7 @@ endif
 
 core-platform-cppflags	+= -I$(arch-dir)/include
 core-platform-subdirs += \
-	$(addprefix $(arch-dir)/, kernel mm tee pta) $(platform-dir)
+	$(addprefix $(arch-dir)/, kernel crypto mm tee pta) $(platform-dir)
 
 ifneq ($(CFG_WITH_ARM_TRUSTED_FW),y)
 core-platform-subdirs += $(arch-dir)/sm

diff --git a/core/arch/arm/crypto/aes-gcm-ce.c b/core/arch/arm/crypto/aes-gcm-ce.c
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2017, Linaro Limited
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <crypto/internal_aes-gcm.h>
+#include <crypto/ghash-ce-core.h>
+#include <io.h>
+#include <kernel/panic.h>
+#include <kernel/thread.h>
+#include <tomcrypt.h>
+#include <types_ext.h>
+
+/*
+ * Read the 16-byte block at @src with get_be64() into the two uint64_t
+ * words at @dst: word 1 is filled from bytes 0..7, word 0 from bytes 8..15.
+ */
+static void get_be_block(void *dst, const void *src)
+{
+	const uint8_t *bytes = src;
+	uint64_t *words = dst;
+
+	words[1] = get_be64(bytes);
+	words[0] = get_be64(bytes + 8);
+}
+
+/*
+ * Write the two uint64_t words at @src to the 16-byte block at @dst with
+ * put_be64(): word 1 goes to bytes 0..7 and word 0 to bytes 8..15.
+ */
+static void put_be_block(void *dst, const void *src)
+{
+	const uint64_t *words = src;
+	uint8_t *bytes = dst;
+
+	put_be64(bytes, words[1]);
+	put_be64(bytes + 8, words[0]);
+}
+
+/*
+ * Expand @key into ctx->skey and prepare ctx->hash_subkey.
+ *
+ * The subkey is obtained by encrypting ctx->ctr with the new key. It is then
+ * rewritten in place as two native uint64_t words (little endian according
+ * to the comment in the body) holding the value multiplied by 'x'.
+ *
+ * Returns TEE_ERROR_BAD_PARAMETERS when aes_setup() reports a failure and
+ * TEE_SUCCESS otherwise.
+ */
+TEE_Result internal_aes_gcm_set_key(struct internal_aes_gcm_ctx *ctx,
+				    const void *key, size_t key_len)
+{
+	uint64_t shifted[2];
+	uint64_t hi;
+	uint64_t lo;
+
+	if (aes_setup(key, key_len, 0, &ctx->skey) != 0)
+		return TEE_ERROR_BAD_PARAMETERS;
+
+	internal_aes_gcm_encrypt_block(ctx, ctx->ctr, ctx->hash_subkey);
+
+	/* Store hash key in little endian and multiply by 'x' */
+	hi = get_be64(ctx->hash_subkey);
+	lo = get_be64(ctx->hash_subkey + 8);
+
+	/* Shift the 128-bit value left by one, each half taking the other's MSB */
+	shifted[0] = (lo << 1) | (hi >> 63);
+	shifted[1] = (hi << 1) | (lo >> 63);
+
+	/* The bit shifted out of the top is folded back into the high word */
+	if (hi >> 63)
+		shifted[1] ^= 0xc200000000000000ULL;
+
+	memcpy(ctx->hash_subkey, shifted, TEE_AES_BLOCK_SIZE);
+
+	return TEE_SUCCESS;
+}
+
+/*
+ * Update the GHASH state in ctx->hash_state with an optional @head block
+ * followed by @num_blocks blocks read from @data.
+ *
+ * @ctx:	context holding hash_state and the prepared hash_subkey
+ * @head:	one block to process first, or NULL
+ * @data:	@num_blocks consecutive blocks, only read when @num_blocks > 0
+ * @num_blocks:	number of blocks at @data
+ *
+ * The state is converted with get_be_block()/put_be_block() around the call
+ * into the assembly routine, which runs with VFP/NEON enabled for the
+ * kernel. CFG_HWSUPP_PMULL selects the p64 routine, otherwise the p8 routine
+ * is used.
+ */
+void internal_aes_gcm_ghash_update(struct internal_aes_gcm_ctx *ctx,
+				   const void *head, const void *data,
+				   size_t num_blocks)
+{
+	uint32_t vfp_state;
+	uint64_t dg[2];
+	uint64_t *k;
+
+	/*
+	 * Nothing to hash. Without a head block the assembly routines load
+	 * the first data block and decrement the block counter before testing
+	 * it, so they must not be entered with neither a head nor any data.
+	 */
+	if (!head && !num_blocks)
+		return;
+
+	get_be_block(dg, ctx->hash_state);
+
+	k = (void *)ctx->hash_subkey;
+
+	vfp_state = thread_kernel_enable_vfp();
+
+#ifdef CFG_HWSUPP_PMULL
+	pmull_ghash_update_p64(num_blocks, dg, data, k, head);
+#else
+	pmull_ghash_update_p8(num_blocks, dg, data, k, head);
+#endif
+	thread_kernel_disable_vfp(vfp_state);
+
+	put_be_block(ctx->hash_state, dg);
+}
+
+#ifdef ARM64
+/*
+ * Encrypt the single block at @src into @dst with the key schedule stored in
+ * ctx->skey: the round keys are loaded with pmull_gcm_load_round_keys() and
+ * the block is processed by pmull_gcm_encrypt_block(), both with VFP/NEON
+ * enabled for the kernel.
+ */
+void internal_aes_gcm_encrypt_block(struct internal_aes_gcm_ctx *ctx,
+				    const void *src, void *dst)
+{
+	void *round_keys = ctx->skey.rijndael.eK;
+	size_t nr = ctx->skey.rijndael.Nr;
+	uint32_t saved_vfp;
+
+	saved_vfp = thread_kernel_enable_vfp();
+	pmull_gcm_load_round_keys(round_keys, nr);
+	pmull_gcm_encrypt_block(dst, src, nr);
+	thread_kernel_disable_vfp(saved_vfp);
+}
+
+/*
+ * Process @num_blocks whole blocks of payload from @src into @dst.
+ *
+ * ctx->hash_state and ctx->ctr are converted to word form with
+ * get_be_block(), handed to pmull_gcm_encrypt() when @m is TEE_MODE_ENCRYPT
+ * or to pmull_gcm_decrypt() for any other mode, and written back with
+ * put_be_block() afterwards. The encrypt routine additionally receives
+ * ctx->buf_cryp. The assembly runs with VFP/NEON enabled for the kernel.
+ */
+void
+internal_aes_gcm_update_payload_block_aligned(struct internal_aes_gcm_ctx *ctx,
+					      TEE_OperationMode m,
+					      const void *src,
+					      size_t num_blocks, void *dst)
+{
+	void *round_keys = ctx->skey.rijndael.eK;
+	size_t nr = ctx->skey.rijndael.Nr;
+	uint64_t *subkey = (void *)ctx->hash_subkey;
+	uint64_t digest[2];
+	uint64_t counter[2];
+	uint32_t saved_vfp;
+
+	get_be_block(digest, ctx->hash_state);
+	get_be_block(counter, ctx->ctr);
+
+	saved_vfp = thread_kernel_enable_vfp();
+
+	pmull_gcm_load_round_keys(round_keys, nr);
+
+	if (m != TEE_MODE_ENCRYPT)
+		pmull_gcm_decrypt(num_blocks, digest, dst, src, subkey,
+				  counter, nr);
+	else
+		pmull_gcm_encrypt(num_blocks, digest, dst, src, subkey,
+				  counter, nr, ctx->buf_cryp);
+
+	thread_kernel_disable_vfp(saved_vfp);
+
+	put_be_block(ctx->ctr, counter);
+	put_be_block(ctx->hash_state, digest);
+}
+#endif /*ARM64*/
diff --git a/core/arch/arm/crypto/ghash-ce-core_a32.S b/core/arch/arm/crypto/ghash-ce-core_a32.S
@@ -0,0 +1,251 @@
+/*
+ * Accelerated GHASH implementation with ARMv8 PMULL instructions.
+ *
+ * Copyright (C) 2014 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <arm32_macros.S>
+
+/* Open a function: export the symbol, type it as a function, emit its label. */
+#define ENTRY(func) \
+	.global func ; \
+	.type func , %function ; \
+	func :
+
+/* Close a function opened with ENTRY(): set the symbol size to the bytes emitted so far. */
+#define ENDPROC(func) \
+	.size func , .-func
+
+/* Expands to its arguments unchanged. */
+#define CPU_LE(x...)	x
+
+	/*
+	 * Symbolic names for the NEON registers used in this file. A q<n>
+	 * register overlays d<2n> (low half) and d<2n+1> (high half), so for
+	 * instance SHASH_L/SHASH_H are the two halves of SHASH and t0l/t0h
+	 * are the two halves of t0q.
+	 *
+	 * Note that some names share a register:
+	 *   XH and IN1 are both q4, t4q and T2 are both q9,
+	 *   MASK and SHASH2_p8 are both d28, k48 and SHASH2_p64 are both d31.
+	 */
+
+	/* 128-bit working registers */
+	SHASH		.req	q0
+	T1		.req	q1
+	XL		.req	q2
+	XM		.req	q3
+	XH		.req	q4
+	IN1		.req	q4
+
+	/* 64-bit halves of the registers above (only the low half of XH) */
+	SHASH_L		.req	d0
+	SHASH_H		.req	d1
+	T1_L		.req	d2
+	T1_H		.req	d3
+	XL_L		.req	d4
+	XL_H		.req	d5
+	XM_L		.req	d6
+	XM_H		.req	d7
+	XH_L		.req	d8
+
+	/* Scratch registers of __pmull_p8, as 64-bit halves ... */
+	t0l		.req	d10
+	t0h		.req	d11
+	t1l		.req	d12
+	t1h		.req	d13
+	t2l		.req	d14
+	t2h		.req	d15
+	t3l		.req	d16
+	t3h		.req	d17
+	t4l		.req	d18
+	t4h		.req	d19
+
+	/* ... and as full 128-bit registers; T2 is scratch for __pmull_reduce_p8 */
+	t0q		.req	q5
+	t1q		.req	q6
+	t2q		.req	q7
+	t3q		.req	q8
+	t4q		.req	q9
+	T2		.req	q9
+
+	/* Byte-rotated copies of SHASH_L/SHASH_H, set up by pmull_ghash_update_p8 */
+	s1l		.req	d20
+	s1h		.req	d21
+	s2l		.req	d22
+	s2h		.req	d23
+	s3l		.req	d24
+	s3h		.req	d25
+	s4l		.req	d26
+	s4h		.req	d27
+
+	/* MASK is set up by the p64 entry point, SHASH2_p8 by the p8 entry point */
+	MASK		.req	d28
+	SHASH2_p8	.req	d28
+
+	/* k16/k32/k48 are set up by the p8 entry point, SHASH2_p64 by the p64 one */
+	k16		.req	d29
+	k32		.req	d30
+	k48		.req	d31
+	SHASH2_p64	.req	d31
+
+	/* Code section; tell the assembler which FPU/NEON instruction set to accept */
+	.text
+	.fpu		crypto-neon-fp-armv8
+
+	/*
+	 * 64x64 -> 128 bit polynomial multiplication with a single vmull.p64:
+	 * rd = rn * rm. The b1..b4 parameters are accepted but not used, so
+	 * that this macro can be invoked with the same argument list as
+	 * __pmull_p8 (ghash_update selects one of the two by name).
+	 */
+	.macro		__pmull_p64, rd, rn, rm, b1, b2, b3, b4
+	vmull.p64	\rd, \rn, \rm
+	.endm
+
+	/*
+	 * This implementation of 64x64 -> 128 bit polynomial multiplication
+	 * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
+	 * "Fast Software Polynomial Multiplication on ARM Processors Using
+	 * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
+	 * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
+	 *
+	 * It has been slightly tweaked for in-order performance, and to allow
+	 * 'rq' to overlap with 'ad' or 'bd'.
+	 *
+	 * rq:     destination q register receiving the 128-bit product
+	 * ad, bd: the two 64-bit operands (d registers)
+	 * b1..b4: bd rotated by 1..4 bytes. An argument left at its default
+	 *         (t4l or t3l) makes the macro compute that rotation itself
+	 *         with vext.8; any other register is used as supplied.
+	 *
+	 * Uses t0q..t4q as scratch and reads the k16, k32 and k48 masks.
+	 */
+	.macro		__pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
+	vext.8		t0l, \ad, \ad, #1	@ A1
+	.ifc		\b1, t4l
+	vext.8		t4l, \bd, \bd, #1	@ B1
+	.endif
+	vmull.p8	t0q, t0l, \bd		@ F = A1*B
+	vext.8		t1l, \ad, \ad, #2	@ A2
+	vmull.p8	t4q, \ad, \b1		@ E = A*B1
+	.ifc		\b2, t3l
+	vext.8		t3l, \bd, \bd, #2	@ B2
+	.endif
+	vmull.p8	t1q, t1l, \bd		@ H = A2*B
+	vext.8		t2l, \ad, \ad, #3	@ A3
+	vmull.p8	t3q, \ad, \b2		@ G = A*B2
+	veor		t0q, t0q, t4q		@ L = E + F
+	.ifc		\b3, t4l
+	vext.8		t4l, \bd, \bd, #3	@ B3
+	.endif
+	vmull.p8	t2q, t2l, \bd		@ J = A3*B
+	veor		t0l, t0l, t0h		@ t0 = (L) (P0 + P1) << 8
+	veor		t1q, t1q, t3q		@ M = G + H
+	.ifc		\b4, t3l
+	vext.8		t3l, \bd, \bd, #4	@ B4
+	.endif
+	vmull.p8	t4q, \ad, \b3		@ I = A*B3
+	veor		t1l, t1l, t1h		@ t1 = (M) (P2 + P3) << 16
+	vmull.p8	t3q, \ad, \b4		@ K = A*B4
+	vand		t0h, t0h, k48
+	vand		t1h, t1h, k32
+	veor		t2q, t2q, t4q		@ N = I + J
+	veor		t0l, t0l, t0h
+	veor		t1l, t1l, t1h
+	veor		t2l, t2l, t2h		@ t2 = (N) (P4 + P5) << 24
+	vand		t2h, t2h, k16
+	veor		t3l, t3l, t3h		@ t3 = (K) (P6 + P7) << 32
+	vmov.i64	t3h, #0
+	vext.8		t0q, t0q, t0q, #15
+	veor		t2l, t2l, t2h
+	vext.8		t1q, t1q, t1q, #14
+	/* \ad and \bd are read for the last time here, so \rq may alias them */
+	vmull.p8	\rq, \ad, \bd		@ D = A*B
+	vext.8		t2q, t2q, t2q, #13
+	vext.8		t3q, t3q, t3q, #12
+	veor		t0q, t0q, t1q
+	veor		t2q, t2q, t3q
+	veor		\rq, \rq, t0q
+	veor		\rq, \rq, t2q
+	.endm
+
+	//
+	// PMULL (64x64->128) based reduction for CPUs that can do
+	// it in a single instruction.
+	//
+	// Reads XL, XM, XH_L and MASK; writes T1, XL, XL_H and XH_L.
+	// After this macro ghash_update XORs XH and T1 into XL.
+	//
+	.macro		__pmull_reduce_p64
+	vmull.p64	T1, XL_L, MASK
+
+	veor		XH_L, XH_L, XM_H
+	vext.8		T1, T1, T1, #8
+	veor		XL_H, XL_H, XM_L
+	veor		T1, T1, XL
+
+	vmull.p64	XL, T1_H, MASK
+	.endm
+
+	//
+	// Alternative reduction for CPUs that lack support for the
+	// 64x64->128 PMULL instruction
+	//
+	// Built from 64-bit lane shifts and XORs only; MASK is not used.
+	// Reads XL, XM and XH; writes T1, XL and XH and uses T2 as scratch.
+	// After this macro ghash_update XORs XH and T1 into XL.
+	//
+	.macro		__pmull_reduce_p8
+	veor		XL_H, XL_H, XM_L
+	veor		XH_L, XH_L, XM_H
+
+	vshl.i64	T1, XL, #57
+	vshl.i64	T2, XL, #62
+	veor		T1, T1, T2
+	vshl.i64	T2, XL, #63
+	veor		T1, T1, T2
+	veor		XL_H, XL_H, T1_L
+	veor		XH_L, XH_L, T1_H
+
+	vshr.u64	T1, XL, #1
+	veor		XH, XH, XL
+	veor		XL, XL, T1
+	vshr.u64	T1, T1, #6
+	vshr.u64	XL, XL, #1
+	.endm
+
+	/*
+	 * Function body shared by both entry points. pn is p64 or p8 and
+	 * selects the matching __pmull_<pn> and __pmull_reduce_<pn> macros as
+	 * well as the SHASH2_<pn> register.
+	 *
+	 * Register usage, following the C prototype given with the entry
+	 * points:
+	 *   r0   = number of blocks at r2
+	 *   r1   = dg, 16 bytes loaded into XL on entry and stored on exit
+	 *   r2   = src, advanced by 16 bytes for every block consumed
+	 *   [sp] = head, an optional block processed before src (may be 0)
+	 * SHASH, SHASH2_<pn> and the constants needed by the selected macros
+	 * must already be loaded. The macro ends with a return (bx lr).
+	 *
+	 * NOTE(review): when head is 0 the first block is loaded from r2 and
+	 * r0 is decremented before it is tested, so the combination
+	 * blocks == 0 with head == 0 is not handled here; callers have to
+	 * avoid it.
+	 */
+	.macro		ghash_update, pn
+	vld1.64		{XL}, [r1]
+
+	/* do the head block first, if supplied */
+	ldr		ip, [sp]
+	teq		ip, #0
+	beq		0f
+	vld1.64		{T1}, [ip]
+	/* sets Z when no src blocks follow; consumed by the bne at the end */
+	teq		r0, #0
+	b		1f
+
+0:	vld1.64		{T1}, [r2]!
+	/* the flags set here are consumed by the bne at the end of the loop */
+	subs		r0, r0, #1
+
+1:	/* multiply XL by SHASH in GF(2^128) */
+#ifndef CONFIG_CPU_BIG_ENDIAN
+	vrev64.8	T1, T1
+#endif
+	vext.8		IN1, T1, T1, #8
+	veor		T1_L, T1_L, XL_H
+	veor		XL, XL, IN1
+
+	__pmull_\pn	XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h	@ a1 * b1
+	veor		T1, T1, XL
+	__pmull_\pn	XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l	@ a0 * b0
+	__pmull_\pn	XM, T1_L, SHASH2_\pn			@ (a1+a0)(b1+b0)
+
+	veor		T1, XL, XH
+	veor		XM, XM, T1
+
+	__pmull_reduce_\pn
+
+	veor		T1, T1, XH
+	veor		XL, XL, T1
+
+	/*
+	 * Loop while blocks remain. Only NEON instructions were executed
+	 * since the teq/subs above, so the flags tested are still theirs.
+	 */
+	bne		0b
+
+	vst1.64		{XL}, [r1]
+	bx		lr
+	.endm
+
+	/*
+	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
+	 *			   struct ghash_key const *k, const char *head)
+	 *
+	 * Both entry points, pmull_ghash_update_p64 and pmull_ghash_update_p8,
+	 * take this argument list: blocks in r0, dg in r1, src in r2, k in r3
+	 * and head on the stack.
+	 */
+	.section .text.pmull_ghash_update_p64
+ENTRY(pmull_ghash_update_p64)
+	/* Load the 16-byte key and XOR its halves for the (a1+a0)(b1+b0) product */
+	vld1.64		{SHASH}, [r3]
+	veor		SHASH2_p64, SHASH_L, SHASH_H
+
+	/* 0xe1 in every byte, shifted left by 57: MASK = 0xc200000000000000 */
+	vmov.i8		MASK, #0xe1
+	vshl.u64	MASK, MASK, #57
+
+	ghash_update	p64
+ENDPROC(pmull_ghash_update_p64)
+
+	/*
+	 * Variant of pmull_ghash_update_p64 built on vmull.p8 (__pmull_p8 and
+	 * __pmull_reduce_p8); it takes the same arguments in the same
+	 * registers.
+	 */
+	.section .text.pmull_ghash_update_p8
+ENTRY(pmull_ghash_update_p8)
+	/* Load the 16-byte key and XOR its halves for the (a1+a0)(b1+b0) product */
+	vld1.64		{SHASH}, [r3]
+	veor		SHASH2_p8, SHASH_L, SHASH_H
+
+	/*
+	 * Precompute both key halves rotated by 1..4 bytes. ghash_update passes
+	 * them to __pmull_p8 as b1..b4 so they are not recomputed per block.
+	 */
+	vext.8		s1l, SHASH_L, SHASH_L, #1
+	vext.8		s2l, SHASH_L, SHASH_L, #2
+	vext.8		s3l, SHASH_L, SHASH_L, #3
+	vext.8		s4l, SHASH_L, SHASH_L, #4
+	vext.8		s1h, SHASH_H, SHASH_H, #1
+	vext.8		s2h, SHASH_H, SHASH_H, #2
+	vext.8		s3h, SHASH_H, SHASH_H, #3
+	vext.8		s4h, SHASH_H, SHASH_H, #4
+
+	/* Masks keeping the low 16, 32 and 48 bits, used by __pmull_p8 */
+	vmov.i64	k16, #0xffff
+	vmov.i64	k32, #0xffffffff
+	vmov.i64	k48, #0xffffffffffff
+
+	ghash_update	p8
+ENDPROC(pmull_ghash_update_p8)