diff --git a/Makefile b/Makefile index 7a69ad1b3c14f..beac4757067a6 100644 --- a/Makefile +++ b/Makefile @@ -93,6 +93,8 @@ CC := riscv64-unknown-linux-gnu-gcc CXX := riscv64-unknown-linux-gnu-g++ endif +K1OM := $(shell echo | $(CC) -dM -E - | grep __k1om__) + # # Compile flags # @@ -279,6 +281,10 @@ endif ifndef RISCV ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64)) + +# detect the PHI cross compiler. +ifeq "${K1OM}" "" + # Use all CPU extensions that are available: MK_CFLAGS += -march=native -mtune=native HOST_CXXFLAGS += -march=native -mtune=native @@ -290,6 +296,11 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64)) # Usage SSSE3-only (Not is SSE3!) #MK_CFLAGS += -mssse3 #MK_CXXFLAGS += -mssse3 +else + OBJS += ggml-phi-knc.o ggml-phi-knc-dot_q5_K_q8_K.o + MK_CFLAGS += -march=knc -mtune=knc +endif + endif ifneq '' '$(findstring mingw,$(shell $(CC) -dumpmachine))' @@ -733,6 +744,9 @@ clean: # Helper function that replaces .c, .cpp, and .cu file endings with .o: GET_OBJ_FILE = $(patsubst %.c,%.o,$(patsubst %.cpp,%.o,$(patsubst %.cu,%.o,$(1)))) +# Helper function that replaces .c, .cpp, and .cu file endings with .s: +GET_ASM_FILE = $(patsubst %.c,%.s,$(patsubst %.cpp,%.s,$(patsubst %.cu,%.s,$(1)))) + main: examples/main/main.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) @@ -740,6 +754,19 @@ main: examples/main/main.cpp ggml.o llama.o $(C @echo '==== Run ./main -h for help. ====' @echo +bench-phi-knc.s: bench-phi-knc.c + $(CC) $(CFLAGS) -S $< -o $(call GET_ASM_FILE, $<) + +ggml-phi-knc.s: ggml-phi-knc.c + $(CC) $(CFLAGS) -S $< -o $(call GET_ASM_FILE, $<) + +bench-phi-knc: bench-phi-knc.c ggml-phi-knc.o ggml-phi-knc-dot_q5_K_q8_K.o + $(CC) $(CFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CC) $(CFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + +ggml-phi-knc-dot_q5_K_q8_K.s: ggml-phi-knc-dot_q5_K_q8_K.c + $(CC) $(CFLAGS) -S $< -o $(call GET_ASM_FILE, $<) + infill: examples/infill/infill.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) diff --git a/bench-phi-knc.c b/bench-phi-knc.c new file mode 100644 index 0000000000000..ca3acc71df211 --- /dev/null +++ b/bench-phi-knc.c @@ -0,0 +1,213 @@ +/* bench-phi-knc.c: benchmarks and tests for the Xeon PHI Knights Corner optimizations. */ + +#include +#include +#include +#include + +/* For CLOCK_REALTIME? */ +#include +#include + +/* For memcpy */ +#include + +/* include the increasingly inacurately named header for our F32 dot product code. */ +#include "ggml-phi-knc.h" + +/* include the header for our Q8K_Q5K dot product code. */ +#include "ggml-phi-knc-dot_q5_K_q8_K.h" + +// largest Float32 vectors to get the dot product of. +#define F32_MAXVEC 1024768 +// how many benchmarks we will run in total. +#define F32_RUNCOUNT 12 +#define F32_ITEMS_PER_RUN {10, 16, 17, 32, 33, 48, 49, 64, 65, 80, 81, 1024768} + +int main(void) +{ + int vecRuns[F32_RUNCOUNT] = F32_ITEMS_PER_RUN; + + // seed the random number generator. + srand(time(NULL)); + + // Run benchmarks for our F32 dot product functions. Benchmark them against a naieve implementation. 
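    /* How the loop below works: for each run size in F32_ITEMS_PER_RUN we fill two 64-byte
       aligned random vectors, time ggml_vec_dot_f32() against a plain scalar loop, and print
       both dot products (they should agree to within float rounding) along with the two
       elapsed times. The scalar baseline is equivalent to this hypothetical helper, shown
       here only as a sketch of what is being compared against:

           static float naive_dot_f32(int n, const float * a, const float * b)
           {
               float acc = 0.0f;
               for (int i = 0; i < n; ++i)
                   acc += a[i] * b[i];
               return acc;
           }
    */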
+ for (uint8_t runCount = 0; runCount < F32_RUNCOUNT; ++runCount) + { + struct timespec start, middle, end; + double vector_time; + double scalar_time; + float scalar = 0.0f; + float vector = 0.0f; + + // Generate random input vector of [-1, 1] values. + float vec1[F32_MAXVEC] __attribute__((aligned(64))); + for (int i = 0; i < vecRuns[runCount]; i++) + vec1[i] = 2 * (0.5 - rand() / (float)RAND_MAX); + + // Generate a second random input vector of [-1, 1] values. + float vec2[F32_MAXVEC] __attribute__((aligned(64))); + for (int i = 0; i < vecRuns[runCount]; i++) + vec2[i] = 2 * (0.5 - rand() / (float)RAND_MAX); + + // on your mark.. + clock_gettime(CLOCK_MONOTONIC, &start); + + // call dot product + ggml_vec_dot_f32(vecRuns[runCount], &vector, 0, vec1, 0, vec2, 0, 0); + + // save the middle point.. + clock_gettime(CLOCK_MONOTONIC, &middle); + + // do the same work by hand; + for (int i = 0; i < vecRuns[runCount]; ++i) + scalar += vec1[i]*vec2[i]; + + clock_gettime(CLOCK_MONOTONIC, &end); + + printf("vector\tvs\tscalar (%d items)\n", vecRuns[runCount]); + printf("%.9f\tvs\t%.9f\n", vector, scalar); + + vector_time = middle.tv_sec - start.tv_sec; + vector_time += (middle.tv_nsec - start.tv_nsec) / 1000000000.0; + + scalar_time = end.tv_sec - middle.tv_sec; + scalar_time += (end.tv_nsec - middle.tv_nsec) / 1000000000.0; + + printf("%.9f\tvs\t%.9f\n", vector_time, scalar_time); + } + + fflush(stdout); + + // Generate a random input vector of 256 4 bit values. + uint8x16_t q4[8]; + uint8_t * q4ptr = (uint8_t *)q4; + for (int i = 0; i < 128; i++) + q4ptr[i] = rand() && 0xFF; + + // Generate a random input vector of 256 1 bit values. + uint8x16_t q1[2]; + uint8_t * q1ptr = (uint8_t *)q1; + for (int i = 0; i < 32; i++) + q1ptr[i] = rand() && 0xFF; + + // Get our reference, unshifted result. + uint8x16_t q5[16]; + GGML_5bit_Unpack_Unaligned(q4, (uint8_t *)q1, q5); + + printf("successfully got a Q5.\n"); + + // Perform alignment tests, for GGML_5bit_Unpack_Unaligned. + // Try to run GGML_5bit_Unpack_Unaligned with all possible misalignments, and get it to fail. + for (uint8_t shiftCount = 1; shiftCount < 16; ++shiftCount) + { + uint8x16_t q5new[16]; + uint8x16_t q4Shifted[9]; + + // create an off-by-shiftCount copy of q4. + q4ptr = ((uint8_t *)q4Shifted) + shiftCount; + memcpy (q4ptr, q4, 128); + + // call the unaligned form of this function: + GGML_5bit_Unpack_Unaligned((uint8x16_t *)q4ptr, (uint8_t *)q1, q5new); + + for (uint32_t byteCount = 0; byteCount < 256; ++byteCount) + { + if ( ((uint8_t *)q5new)[byteCount] != ((uint8_t *)q5)[byteCount] ) + { + printf("whoops!\nshiftCount: %d\nbyteCount: %d\n", shiftCount, byteCount); + exit (-1); + } + } + + printf("Got a Q5 offset by %d\n", shiftCount); + } + + // Generate a random input vector of 256 8 bit values. + int8x16_t q8[16]; + int8_t * q8ptr = (int8_t *)q8; + for (int i = 0; i < 256; i++) + q8ptr[i] = rand() && 0xFF; + + // Generate eight random scales, one for each pair of sums. + uint8_t scale[8]; + for (int i = 0; i < 8; i++) + scale[i] = rand() && 0xFF; + + // Generate a random X scale. + float rndScaleX = 2 * (0.5 - rand() / (float)RAND_MAX); + ggml_fp16_t scaleX = GGML_PHI_FP32_TO_FP16(rndScaleX); + + // Display the random X scale. Verifies FP32_TO_FP16_TO_FP32 is working. + printf("rndScaleX: %f\n", rndScaleX); + printf("scaleX: %x\n", scaleX); + printf("newScaleX: %f\n", GGML_PHI_FP16_TO_FP32(scaleX)); + + // Generate a random Y scale. 
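    // Note that, unlike scaleX, this one stays in FP32: in the real kernel the Y-side scale
    // comes from block_q8_K, which stores its d as a plain float, while block_q5_K keeps its
    // d and dmin in FP16.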
+ float scaleY = 2 * (0.5 - rand() / (float)RAND_MAX); + printf("scaleY: %f\n", scaleY); + + // Create a place for our golden result. + float32x16_t res; + + // Clear res. + GGML_F32x16_VEC_ZERO(&res); + + // Generate an initial result, to compare to. + GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16_Unaligned (q8, q5, scale, scaleX, scaleY, &res); + + // Generate a sum of the result. + float sum = 0.0f; + for (int l = 0; l < 16; ++l) sum += ((float *)&res)[l]; + + printf("Got a res: %f\n", sum); + + // Perform alignment tests, for GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16_Unaligned. + // try to run GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16_Unaligned with all possible mis-alignments, and get it to fail. + for (uint8_t shiftCount = 1; shiftCount < 16; ++shiftCount) + { + float32x16_t resNew1; + int8x16_t q8Shifted[17]; + + // Create an off-by-shiftCount copy of q8. + q8ptr = ((int8_t *)q8Shifted)+shiftCount; + memcpy (q8ptr, q8, 256); + + // Clear resNew. + GGML_F32x16_VEC_ZERO(&resNew1); + + // Call the unaligned form of this function: + GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16_Unaligned ((int8x16_t *)q8ptr, q5, scale, scaleX, scaleY, &resNew1); + + // check the result against our reference. + for (uint32_t floatCount = 0; floatCount < 64; ++floatCount) + { + if ( ((int8_t *)&resNew1)[floatCount] != ((int8_t *)&res)[floatCount] ) + { + printf("whoops!\nshiftCount: %d\nfloatCount: %d\n", shiftCount, floatCount); + for (uint32_t row = 0; row < 16 ; ++row) + { + for (int col1 = 0; col1 < 4; ++col1) + { + printf("%2.2x\t", ((int8_t *)&resNew1)[(4*row)+col1]); + } + printf(" vs "); + for (int col2 = 0; col2 < 4; ++col2) + { + printf("%2.2x\t", ((int8_t *)&res)[(4*row)+col2]); + } + printf ("\n"); + } + exit (-1); + } + } + + // Generate a sum of our new result. + float sumf = 0.0f; + for (int l = 0; l < 16; ++l) sumf += ((float *)&resNew1)[l]; + + printf("Got a res from a Q8 offset by %d: %f\n", ((uint64_t) q8ptr) & 0x3F, sumf); + } + + return 0; +} diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c new file mode 100644 index 0000000000000..754366185fdcc --- /dev/null +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -0,0 +1,392 @@ +/* Xeon PHI IMCI support. */ +/* formatted by using emacs, with (M-x set-variable RET c-basic-offset RET 4 RET) executed. */ +/* formatted by using emacs, with (M-x set-variable RET indent-tabs-mode RET nil RET) executed. */ + +// For uint32_t +#include + +// For size_t +#include + +// Yes, we have to tell this header to actually export stuff. +#define GGML_COMMON_IMPL_C +#include "ggml-quants.h" +#include "ggml-impl.h" + +// For block_q5_K and block_q8_K. +#include "ggml-common.h" + +// For our vector types, and forward declarations. +#include "ggml-phi-knc-dot_q5_K_q8_K.h" + +// We can fit 16 float32s in a single vector register. +#define GGML_F32_EPR 16 + +/* Clear a vector of 16 floats. */ +void GGML_F32x16_VEC_ZERO(float32x16_t *target) +{ + uint8_t zero=0; + + __asm__ __volatile__ ( + "vbroadcastss\t%[Z]%{uint8%},\t%%zmm0\n\t" // use an upscaling operator to clear our register. + "vmovaps\t\t%%zmm0,\t%[RES]\n\t" + : [RES] "+m" (*target) + : [Z] "m" (zero) + : "zmm0", "memory"); +} + +/* Convert a FP16 to a FP32. */ +float GGML_PHI_FP16_TO_FP32(ggml_fp16_t src) +{ + // we only care aboun one result. + uint32_t mask=0x0001; + + // we declare this as an array, so it ends up in a different memory section. 
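    // The conversion is done by the load itself: with write-mask k1 = 0x0001, the
    // vbroadcastss {float16} below up-converts the half-float to FP32 in lane 0 of zmm1,
    // and the masked vmovaps writes just that one lane back out to the 64-byte aligned buffer.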
+ float f32[1] __attribute__((aligned(64))); + + __asm__ __volatile__ ( + "kmov\t%[M],\t%%k1\n\t" + "vbroadcastss\t%[SRC]%{float16%},\t%%zmm1%{%%k1%}\n\t" + "vmovaps\t\t%%zmm1,\t%[DST]%{%%k1%}\n\t" + : [DST] "+m" (f32) + : [SRC] "m" (src), + [M] "r" (mask) + : "zmm1", "memory", "k1"); + return f32[0]; +} + +/* convert many FP16s to FP32s. */ +void GGML_PHI_FP16_TO_FP32_ROW(const ggml_fp16_t * x, float * y, int n) +{ + for (int i = 0; i < n; i++) { + y[i] = GGML_PHI_FP16_TO_FP32(x[i]); + } +} + +/* Convert a FP32 to a FP16. */ +ggml_fp16_t GGML_PHI_FP32_TO_FP16(float src) +{ + uint32_t mask=0x0001; + + // we declare this as an array, so it ends up in a different memory section. + ggml_fp16_t f16[1] __attribute__((aligned(64))); + + __asm__ __volatile__ ( + "kmov\t%[M],\t%%k1\n\t" + "vbroadcastss\t%[SRC],\t%%zmm2%{%%k1%}\n\t" + "vmovaps\t\t%%zmm2%{float16%},\t%[DST]%{%%k1%}\n\t" + : [DST] "+m" (f16) + : [SRC] "m" (src), + [M] "r" (mask) + : "zmm2", "memory", "k1"); + return f16[0]; +} + +/* convert many FP32s to FP16s. */ +void GGML_PHI_FP32_TO_FP16_ROW(const float * x, ggml_fp16_t * y, int n) +{ + for (int i = 0; i < n; i++) { + y[i] = GGML_PHI_FP32_TO_FP16(x[i]); + } +} + +// This function perform two multiplies of an I8x16 and an I8x16 vector into two I16x16 vectors. Then it does an FMA on the scaled result of multiplying the two I16x16 vectors, adding the result into an I32x16. When done, It multiplies this I32x16 by a float, returning a F32x16. +// It loops 8 times. Well, actually four, with an unroll. +// Handles q8 being aligned incorrectly. +// Requires q5 to be aligned. +void GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16_Unaligned (const int8x16_t *q8, uint8x16_t *q5, const uint8_t *scale, ggml_fp16_t scaleX, float scaleY, float32x16_t *res) +{ + uint8_t zero = 0; + uint64_t q8offset=((uint64_t) q8) & 0x3f; + + __asm__ __volatile__ ( + "vprefetchenta\t(%[RES])\n\t" // Issue our memory requests first thing. + "vprefetch0\t64(%[SCALE])\n\t" + "vprefetch0\t(%[SRC8])\n\t" + "vprefetch0\t64(%[SRC8])\n\t" + "vprefetch0\t(%[SRC5])\n\t" + "mov\t%[SRC8],\t%%r11\n\t" // Use r11 to store the address for vloadunpackld. + "mov\t%[SRC5],\t%%r8\n\t" + "mov\t%[SCALE],\t%%r9\n\t" + "mov\t$0,\t%%ecx\n\t" + "mov\t%[SRC8],\t%%r15\n\t" // Use r12-r15 to store the addresses for vloadunpackhd. + "mov\t%[SRC8],\t%%r14\n\t" + "mov\t%[SRC8],\t%%r13\n\t" + "mov\t%[SRC8],\t%%r12\n\t" + "mov\t%[OFFSET],\t%%r10\n\t" + "cmp\t$32,%%r10\n\t" // Examine OFFSET, and decide which (if any) of the vloadunpackhd invocations needs to be increased by 64. + "jl\t10f\n\t" + "cmp\t$48,%%r10\n\t" + "jl\t11f\n\t" + "add\t$64,%%r12\n\t" // Greater than 47. + "jmp\t14f\n\t" + "11:\n\t" + "add\t$64,%%r13\n\t" // Between 48 and 31. + "jmp\t14f\n\t" + "10:\n\t" // Less than 32... + "cmp\t$16,%%r10\n\t" + "jz\t14f\n\t" // Zero. + "jl\t13f\n\t" + "add\t$64,%%r14\n\t" // Between 32 and 15. + "jmp\t14f\n\t" + "13:\n\t" + "add\t$64,%%r15\n\t" // Between 16 and zero. + "14:\n\t" + "vbroadcastss\t%[SCALEY],\t%%zmm3\n\t" // Load the scale factors coresponding to the two input vectors. + "vbroadcastss\t%[SCALEX]%{float16%},\t%%zmm4\n\t" + "vmulps\t%%zmm3,\t%%zmm4,\t%%zmm5\n\t" // Prepare the factor we're going to multiply the result by.. + "vmovaps\t\t(%[RES]),\t%%zmm6\n\t" // Load our inital state from sum.. + "vpbroadcastd\t%[Z]%{uint8%},\t%%zmm7\n\t" // Empty our result. + "1:\n\t" + "inc\t%%ecx\n\t" // We are in our loop, increment our counter. 
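        // Each pass of this loop consumes one quarter of the block: 64 bytes of q8 arrive via
        // vloadunpackld/vloadunpackhd pairs (tolerating a misaligned q8), 64 bytes of q5 via
        // aligned vmovdqa32 loads, with both up-converted to 32-bit lanes on the way in. The
        // products are weighted by two scale bytes and accumulated into zmm7; four passes
        // cover the full 256-value super-block.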
+ "vloadunpackld\t\t(%%r11)%{sint8%},\t%%zmm8\n\t" // Load the item we will be multiplying from. Upscale it from int8 to int32. + "vloadunpackld\t\t16(%%r11)%{sint8%},\t%%zmm9\n\t" // Load the item we will be multiplying from. Upscale it from int8 to int32. + "vloadunpackld\t\t32(%%r11)%{sint8%},\t%%zmm10\n\t" // Load the item we will be multiplying from. Upscale it from int8 to int32. + "vloadunpackld\t\t48(%%r11)%{sint8%},\t%%zmm11\n\t" // Load the item we will be multiplying from. Upscale it from int8 to int32. + "vprefetch1\t128(%%r11)\n\t" // Prepare for a run-through. + "add\t$64,\t%%r11\n\t" + "vloadunpackhd\t\t(%%r12)%{sint8%},\t%%zmm8\n\t" // Load the item we will be multiplying from. Upscale it from int8 to int32. + "add\t$64,\t%%r12\n\t" + "vloadunpackhd\t\t16(%%r13)%{sint8%},\t%%zmm9\n\t" // Load the item we will be multiplying from. Upscale it from int8 to int32. + "add\t$64,\t%%r13\n\t" + "vloadunpackhd\t\t32(%%r14)%{sint8%},\t%%zmm10\n\t" // Load the item we will be multiplying from. Upscale it from int8 to int32. + "add\t$64,\t%%r14\n\t" + "vloadunpackhd\t\t48(%%r15)%{sint8%},\t%%zmm11\n\t" // Load the item we will be multiplying from. Upscale it from int8 to int32. + "add\t$64,\t%%r15\n\t" + "vmovdqa32\t\t(%%r8)%{uint8%},\t%%zmm12\n\t" // Load the item we will be multiplying with. Upscale it from int8 to int32. + "vpmulld\t%%zmm8,\t%%zmm12,\t%%zmm13\n\t" // Perform our 64 bit multiply, low side. + "vmovdqa32\t\t16(%%r8)%{uint8%},\t%%zmm14\n\t" // Load the item we will be multiplying with. Upscale it from int8 to int32. + "vpmulld\t%%zmm9,\t%%zmm14,\t%%zmm15\n\t" // Perform our 64 bit multiply, low side. + "vmovdqa32\t\t32(%%r8)%{uint8%},\t%%zmm0\n\t" // Load the item we will be multiplying with. Upscale it from int8 to int32. + "vpmulld\t%%zmm10,\t%%zmm0,\t%%zmm1\n\t" // Perform our 64 bit multiply, low side. + "vmovdqa32\t\t48(%%r8)%{uint8%},\t%%zmm2\n\t" // Load the item we will be multiplying with. Upscale it from int8 to int32. + "vpmulld\t%%zmm11,\t%%zmm2,\t%%zmm3\n\t" // Perform our 64 bit multiply, low side. + "vprefetch1\t64(%%r8)\n\t" // Prepare for a run-through. + "add\t$64,\t%%r8\n\t" + "vpbroadcastd\t(%%r9)%{uint8%},\t%%zmm4\n\t" // Load the item we will be multiplying by. + "vpbroadcastd\t1(%%r9)%{uint8%},\t%%zmm8\n\t" // Load the item we will be multiplying by. + "vprefetch1\t2(%%r9)\n\t" + "add\t$2,\t%%r9\n\t" + "vprefetch0\t(%%r11)\n\t" // Prepare for a run-through. + "vprefetch0\t64(%%r11)\n\t" // Prepare for a run-through. + "vprefetch0\t(%%r8)\n\t" // Prepare for a run-through. + "vprefetch0\t(%%r9)\n\t" // Prepare for a run-through. + "cmp\t$4,\t%%ecx\n\t" // See if this is our last run-through. + "vpmadd231d\t%%zmm13,\t%%zmm4,\t%%zmm7\n\t" // Perform our multiply-add. + "vpmadd231d\t%%zmm15,\t%%zmm4,\t%%zmm7\n\t" // Perform our multiply-add. + "vpmadd231d\t%%zmm1,\t%%zmm8,\t%%zmm7\n\t" // Perform our multiply-add. + "vpmadd231d\t%%zmm3,\t%%zmm8,\t%%zmm7\n\t" // Perform our multiply-add. + "jl\t1b\n\t" + "vcvtfxpntdq2ps\t$0,%%zmm7,\t%%zmm9\n\t" // Convert our ints to floats. + "vfmadd231ps\t%%zmm5,\t%%zmm9,\t%%zmm6\n\t" // Perform a fused multiply add. + "vmovaps\t\t%%zmm6,\t(%[RES])\n\t" // Save the result. 
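        // Net effect of this routine: each lane of *res is incremented by
        // (scaleY * FP32(scaleX)) * sum(scale[j] * q8 * q5) for that lane, i.e. one block's
        // contribution to the q5_K x q8_K dot product, still spread across 16 lanes; the
        // caller reduces the lanes at the very end.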
+ : [RES] "+r" (res) + : [SRC8] "r" (q8), + [OFFSET] "m" (q8offset), + [SRC5] "r" (q5), + [SCALE] "r" (scale), + [SCALEX] "m" (scaleX), + [SCALEY] "m" (scaleY), + [Z] "m" (zero) + : "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", "cc", "ecx", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "memory"); +} + +// Unpack 256 unsigned 5 bit values into an 8 bit vector. +// Handles q4 being aligned incorrectly. +// Requires dst to be aligned. +void GGML_5bit_Unpack_Unaligned (const uint8x16_t * q4, const uint8_t * q1, uint8x16_t * dst) +{ + uint8_t lowmask = 0x0F; + uint8_t m=1; + uint8_t bit5 = 0x10; + uint64_t q4offset=((uint64_t) q4) & 0x3f; + + __asm__ __volatile__ ( + "vprefetch0\t(%[SRC1])\n\t" // Issue our memory requests first thing. + "vprefetch0\t(%[SRC4])\n\t" + "vprefetchenta\t(%[DST])\n\t" + "mov\t%[DST],\t%%r8\n\t" // Load the address of the head of our destination list. + "mov\t%[SRC4],\t%%r9\n\t" // Load the address of the head of our 4-bit list into r9, for vloadunpackld. + "mov\t%[SRC4],\t%%r10\n\t" // Load the address of the head of our 4-bit list into r10-r13, for vloadunpackhd. + "mov\t%[SRC4],\t%%r11\n\t" + "mov\t%[SRC4],\t%%r12\n\t" + "mov\t%[SRC4],\t%%r13\n\t" + "mov\t%[OFFSET],\t%%r14\n\t" + "mov\t$0,%%ecx\n\t" // Initialize our counter. + "cmp\t$32,%%r14\n\t" // Examine OFFSET, and decide which (if any) of the vloadunpackhd invocations needs to be increased by 64. + "jl\t20f\n\t" + "cmp\t$48,%%r14\n\t" + "jl\t21f\n\t" + "add\t$64,%%r10\n\t" // Greater than 47. + "jmp\t24f\n\t" + "21:\n\t" + "add\t$64,%%r11\n\t" // Between 48 and 31. + "jmp\t24f\n\t" + "20:\n\t" // Less than 32... + "cmp\t$16,%%r14\n\t" + "jz\t24f\n\t" // Zero. + "jl\t23f\n\t" + "add\t$64,%%r12\n\t" // Between 32 and 15. + "jmp\t24f\n\t" + "23:\n\t" + "add\t$64,%%r13\n\t" // Between 16 and zero. + "24:\n\t" + "vpbroadcastd\t%[MASK]%{uint8%},\t%%zmm0\n\t" // Load our mask. + "vpbroadcastd\t%[BIT5]%{uint8},\t%%zmm1\n\t" // Load the bit we want to add (conditionally). + "vpbroadcastd\t%[M]%{uint8%},\t%%zmm2\n\t" // Select which bit we want to test for. Start with bit 1. + "vmovdqa32\t(%[SRC1])%{uint8%},\t%%zmm3\n\t" // Load 16 sets of 8 packed single bits. + "vmovdqa32\t16(%[SRC1])%{uint8%},\t%%zmm4\n\t" // Load the next 16 sets of 8 packed single bits. + + "3:\n\t" + "inc\t%%ecx\n\t" // We are in the loop. increment the counter. + + "vptestmd\t%%zmm3,\t%%zmm2,\t%%k1\n\t" // Test to see if our selected bit is set. + "vptestmd\t%%zmm4,\t%%zmm2,\t%%k2\n\t" // Test to see if our selected bit is set. + + "vloadunpackld\t\t(%%r9)%{uint8%},\t%%zmm5\n\t" // Load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. + "vloadunpackhd\t\t(%%r10)%{uint8%},\t%%zmm5\n\t" // Load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. + "vpandd\t%%zmm0,\t%%zmm5,\t%%zmm6\n\t" // Apply a mask, storing the first set of four bits into a vector. + "vpord\t%%zmm1,%%zmm6,%%zmm6%{%%k1%}\n\t" // Turn on bit 5 for all values that passed the prior test. + "vmovdqa32\t\t%%zmm6%{uint8%},\t(%%r8)\n\t" // Save our result. + + "vloadunpackld\t\t16(%%r9)%{uint8%},\t%%zmm7\n\t" // Load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. + "vloadunpackhd\t\t16(%%r11)%{uint8%},\t%%zmm7\n\t" // Load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. 
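        // The ld/hd pairing above is how KNC performs an unaligned vector load: vloadunpackld
        // fills the lanes that lie before the next 64-byte boundary and vloadunpackhd fills
        // the rest from the following cache line, which is why r10-r13 were pre-biased by 64
        // according to OFFSET. Conceptually the whole routine computes, for each of the 256
        // outputs, dst[i] = nibble(q4, i) | (bit(q1, i) ? 0x10 : 0), 32 bytes at a time: low
        // nibbles first, then the same bytes shifted right by four.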
+ "vprefetch1\t32(%%r9)\n\t" // Pull the next set of 4 bit sequences into the L2 cache. + "vpandd\t%%zmm0,\t%%zmm7,\t%%zmm8\n\t" // Apply a mask, storing the next set of four bits into a vector. + "vpord\t%%zmm1,%%zmm8,%%zmm8%{%%k2%}\n\t" // Turn on bit 5 for all values that passed the prior test. + "vmovdqa32\t\t%%zmm8%{uint8%},\t16(%%r8)\n\t" // Save our result. + + "add\t$32,\t%%r8\n\t" + "vpslld\t$1,\t%%zmm2,\t%%zmm2\n\t" // Select the next bit to test for. + + "vptestmd\t%%zmm3,\t%%zmm2,\t%%k1\n\t" // Test to see if our selected bit is set. + "vptestmd\t%%zmm4,\t%%zmm2,\t%%k2\n\t" // Test to see if our selected bit is set. + + "vpsrld\t$4,\t%%zmm5,\t%%zmm6\n\t" // Load our even 4 bit sequence. + "vpsrld\t$4,\t%%zmm7,\t%%zmm8\n\t" // Load our next even 4 bit sequence. + "vpord\t%%zmm1,%%zmm6,%%zmm6%{%%k1%}\n\t" // Turn on bit 5 for all values that passed the prior test. + "vpord\t%%zmm1,%%zmm8,%%zmm8%{%%k2%}\n\t" // Turn on bit 5 for all values that passed the prior test. + "vmovdqa32\t\t%%zmm6%{uint8%},\t(%%r8)\n\t" // Save our result. + "vmovdqa32\t\t%%zmm8%{uint8%},\t16(%%r8)\n\t" // Save our result. + "vprefetchenta\t32(%%r8)\n\t" + + "vprefetch0\t32(%%r9)\n\t" + "vprefetch1\t96(%%r9)\n\t" + "add\t$32,\t%%r8\n\t" + "add\t$32,\t%%r9\n\t" + "add\t$32,\t%%r10\n\t" + "add\t$32,\t%%r11\n\t" + "add\t$32,\t%%r12\n\t" + "add\t$32,\t%%r13\n\t" + "vpslld\t$1,\t%%zmm2,\t%%zmm2\n\t" // Select the next bit to test for. + + "vptestmd\t%%zmm3,\t%%zmm2,\t%%k1\n\t" // Test to see if our selected bit is set. + "vptestmd\t%%zmm4,\t%%zmm2,\t%%k2\n\t" // Test to see if our selected bit is set. + + "vloadunpackld\t\t(%%r9)%{uint8%},\t%%zmm9\n\t" // Load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. + "vloadunpackhd\t\t(%%r12)%{uint8%},\t%%zmm9\n\t" // Load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. + "vpandd\t%%zmm0,\t%%zmm9,\t%%zmm10\n\t" // Apply a mask, storing the first set of four bits into a vector. + "vpord\t%%zmm1,%%zmm10,%%zmm10%{%%k1%}\n\t" // Turn on bit 5 for all values that passed the prior test. + "vmovdqa32\t\t%%zmm10%{uint8%},\t(%%r8)\n\t" // Save our result. + + "vloadunpackld\t\t16(%%r9)%{uint8%},\t%%zmm11\n\t" // Load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. + "vloadunpackhd\t\t16(%%r13)%{uint8%},\t%%zmm11\n\t" // Load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. + "vprefetch1\t32(%%r9)\n\t" // Pull the next set of 4 bit sequences into the L2 cache. + "vpandd\t%%zmm0,\t%%zmm11,\t%%zmm12\n\t" // Apply a mask, storing the next set of four bits into a vector. + "vpord\t%%zmm1,%%zmm12,%%zmm12%{%%k2%}\n\t" // Turn on bit 5 for all values that passed the prior test. + "vmovdqa32\t\t%%zmm12%{uint8%},\t16(%%r8)\n\t" // Save our result. + + "add\t$32,\t%%r8\n\t" + "cmp\t$2,\t%%ecx\n\t" + "vpslld\t$1,\t%%zmm2,\t%%zmm2\n\t" // Select the next bit to test for. + + "vptestmd\t%%zmm3,\t%%zmm2,\t%%k1\n\t" // Test to see if our selected bit is set. + "vptestmd\t%%zmm4,\t%%zmm2,\t%%k2\n\t" // Test to see if our selected bit is set. + + "vpsrld\t$4,\t%%zmm9,\t%%zmm10\n\t" // Load our even 4 bit sequence. + "vpsrld\t$4,\t%%zmm11,\t%%zmm12\n\t" // Load our next even 4 bit sequence. + "vpord\t%%zmm1,%%zmm10,%%zmm10%{%%k1%}\n\t" // Turn on bit 5 for all values that passed the prior test. + "vpord\t%%zmm1,%%zmm12,%%zmm12%{%%k2%}\n\t" // Turn on bit 5 for all values that passed the prior test. 
+ "vmovdqa32\t\t%%zmm10%{uint8%},\t(%%r8)\n\t" // Save our result. + "vmovdqa32\t\t%%zmm12%{uint8%},\t16(%%r8)\n\t" // Save our result. + "vprefetchenta\t32(%%r8)\n\t" + + "je\t2f\n\t" + + "vprefetch0\t32(%%r9)\n\t" + "vprefetch1\t96(%%r9)\n\t" + "vpslld\t$1,\t%%zmm2,\t%%zmm2\n\t" // Select the next bit to test for. + "add\t$32,\t%%r8\n\t" + "add\t$32,\t%%r9\n\t" + "add\t$32,\t%%r10\n\t" + "add\t$32,\t%%r11\n\t" + "add\t$32,\t%%r12\n\t" + "add\t$32,\t%%r13\n\t" + "jmp\t3b\n\t" + "2:" + : [DST] "+r" (dst) + : [SRC4] "r" (q4), + [OFFSET] "m" (q4offset), + [SRC1] "r" (q1), + [MASK] "m" (lowmask), + [M] "m" (m), + [BIT5] "m" (bit5) + : "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "cc", "ecx", "k1", "k2", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "memory"); +} + +// A function for getting the dot product of two vectors, one of 5 bit resolution, and one of 8. +// Used during inference, if your model prints "llama_model_loader: - type q5_K: XXX tensors", and XXX is not zero. :) +void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { + + /* Interpret X and Y as vectors. */ + const block_q5_K * restrict x = vx; + const block_q8_K * restrict y = vy; + + /* The number of blocks we will process this in. */ + const int nb = n / QK_K; + + static const uint32_t kmask1 = 0x3f3f3f3f; + static const uint32_t kmask2 = 0x0f0f0f0f; + static const uint32_t kmask3 = 0x03030303; + + uint32_t utmp[4]; + + const uint8_t * scales = (const uint8_t*)&utmp[0]; + const uint8_t * mins = (const uint8_t*)&utmp[2]; + + float32x16_t sums; + + // Clear sums. + GGML_F32x16_VEC_ZERO(&sums); + + float sumf = 0; + + for (int i = 0; i < nb; ++i) { + + uint8x16_t q5 [QK_K/16]; + + // Combine our 4 and 1 bit vector sets into a 5 bit vector (in 8 bits). + GGML_5bit_Unpack_Unaligned((const uint8x16_t *)x[i].qs, x[i].qh, q5); + + // Extract scales and mins.. + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + int sumi = 0; + + for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; + + // FIXME: While comparing FMA output to the original output, the original had an error. Hunt it down. + GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16_Unaligned((const int8x16_t *)y[i].qs, q5, scales, x[i].d, y[i].d, &sums); + + const float dmin = GGML_PHI_FP16_TO_FP32(x[i].dmin) * y[i].d; + sumf -= dmin * sumi; + } + + for (int l = 0; l < GGML_F32_EPR; ++l) sumf += ((float *)&sums)[l]; + *s = sumf; +} diff --git a/ggml-phi-knc-dot_q5_K_q8_K.h b/ggml-phi-knc-dot_q5_K_q8_K.h new file mode 100644 index 0000000000000..820cdf95bae99 --- /dev/null +++ b/ggml-phi-knc-dot_q5_K_q8_K.h @@ -0,0 +1,41 @@ +// Formatted with: indent -npcs -nlp -i4 -l300 +/* Formatted by using emacs, with (M-x set-variable RET c-basic-offset RET 4 RET) executed. */ +/* Formatted by using emacs, with (M-x set-variable RET indent-tabs-mode RET nil RET) executed. */ + +#pragma once + +#include "ggml.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + + /* A forward declaration, to keep GCC happy. */ + void ggml_vec_dot_q5_K_q8_K(int n, float * __restrict s, size_t bs, const void * __restrict vx, size_t bx, const void * __restrict vy, size_t by, int nrc); + + // Define our vector types, with a default alignment. 
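    // float32x16_t and int32x16_t each fill one 64-byte zmm register; the 8-bit types are kept
    // at 16 bytes because the IMCI kernels up-convert bytes to 32-bit lanes as they load, so a
    // single uint8x16_t or int8x16_t expands to exactly one register's worth of lanes.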
+ typedef float float32x16_t __attribute__((vector_size (64), aligned(64))); + typedef int8_t int8x16_t __attribute__((vector_size (16), aligned(16))); + typedef uint8_t uint8x16_t __attribute__((vector_size (16), aligned(16))); + typedef int32_t int32x16_t __attribute__((vector_size (64), aligned(64))); + + // Zero out a vector of 16 Floats. + void GGML_F32x16_VEC_ZERO(float32x16_t *target); + // Convert an FP16 value to FP32(Float). + float GGML_PHI_FP16_TO_FP32(ggml_fp16_t src); + // Convert a set of FP16 values to FP32(Float). + void GGML_PHI_FP16_TO_FP32_ROW(const ggml_fp16_t * x, float * y, int n); + // Convert an FP32(Float) value to FP16. + ggml_fp16_t GGML_PHI_FP32_TO_FP16(float src); + // Convert an FP32(Float) value to FP16. + void GGML_PHI_FP32_TO_FP16_ROW(const float * x, ggml_fp16_t * y, int n); + + // Create a 5 bit int vector from a 4 bit vector and a 1 bit vector, both in packed forms. + void GGML_5bit_Unpack_Unaligned (const uint8x16_t * q4, const uint8_t * q1, uint8x16_t * dst); + // Multiply a Q5 and Q8 vector against each other, with some scaling. + void GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16_Unaligned (const int8x16_t *q8, uint8x16_t *q5, const uint8_t *scale, ggml_fp16_t scaleX, float scaleY, float32x16_t *res); + +#ifdef __cplusplus +} +#endif diff --git a/ggml-phi-knc.c b/ggml-phi-knc.c new file mode 100644 index 0000000000000..c4cc49724e244 --- /dev/null +++ b/ggml-phi-knc.c @@ -0,0 +1,157 @@ +/* Xeon PHI IMCI support. */ +/* Formatted by using emacs, with (M-x set-variable RET c-basic-offset RET 4 RET) executed. */ +/* Formatted by using emacs, with (M-x set-variable RET indent-tabs-mode RET nil RET) executed. */ + +#include + +// For size_t +#include + +// For memcpy. +#include + +// We can fit 16 of these float32s in a single vector register. +#define GGML_F32_EPR 16 + +// A vector of 16 floats. +typedef float float32x16_t __attribute__((vector_size (64), aligned (64))); + +// A forward declaration, to keep GCC happy... +void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc); + +inline static void GGML_F32x16_VEC_ZERO(float32x16_t *target) +{ + uint8_t zero = 0; + + __asm__ __volatile__ ( + "vbroadcastss\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our value. + "vmovnraps\t\t%%zmm8,\t%[RES]\n\t" + : [RES] "+m" (*target) + : [Z] "m" (zero) + : "zmm8", "memory"); +} + +// Multiply each item in mvec1 with the corresponding item in mvec2, adding the result to the corresponding item in sum. Optionally clear the sum before starting. +inline static void GGML_F32x16_VEC_FMA(const float32x16_t *mvec1, const float32x16_t *mvec2, float32x16_t *sumvec, size_t iterations, int clear) +{ + uint8_t zero = 0; + + __asm__ __volatile__ ( + "vprefetchenta\t(%[RES])\n\t" + "vprefetch0\t(%[VEC1])\n\t" + "vprefetch1\t64(%[VEC1])\n\t" + "vprefetch0\t128(%[VEC1])\n\t" + "vprefetch1\t192(%[VEC1])\n\t" + "vprefetch0\t(%[VEC2])\n\t" + "vprefetch1\t64(%[VEC2])\n\t" + "vprefetch0\t128(%[VEC2])\n\t" + "vprefetch1\t192(%[VEC2])\n\t" + "mov\t%[ITER],%%r8\n\t" // How many vector sized chunks are we responsible for? + "mov\t%[VEC1],%%r10\n\t" // Where do we start work in mvec1? + "mov\t%[VEC2],%%r12\n\t" // Where do we start work in mvec2? + "cmp\t$0,%[CLR]\n\t" // Should we clear the sum before we start? + "jz\t4f\n\t" + "vbroadcastss\t%[Z]%{uint8%},\t%%zmm0\n\t" // If so, use an upscaling operator to clear our sum. 
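        // What follows: label 5 chooses between the unrolled main loop at label 1, which
        // processes four float32x16_t pairs (256 bytes from each input) per iteration with
        // prefetches issued several cache lines ahead, and the tail ladder at label 6, which
        // finishes off the last one to three whole vectors.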
+ "jmp\t5f\n\t" + "4:\n\t" + "vprefetch0\t(%[RES])\n\t" + "vmovaps\t\t(%[RES]),\t%%zmm0\n\t" // Otherwise, load our inital state from sum.. + "vprefetchnta\t(%%r10)\n\t" + "vprefetchnta\t(%%r12)\n\t" + "5:\n\t" + "cmp\t$4,\t%%r8\n\t" // Compare iterations to four. + "jnae\t6f\n\t" // If there are not four iterations left, jump to label 6. + "1:\n\t" + "sub\t$4,\t%%r8\n\t" // Decrement iterations + "vmovaps\t\t(%%r10),\t%%zmm1\n\t" // Load two vectors. + "vmovaps\t\t(%%r12),\t%%zmm2\n\t" + "vprefetchnta\t192(%%r10)\n\t" // prefetch the next float32x16_t block (192 bytes ahead) + "vprefetchnta\t192(%%r12)\n\t" + "vfmadd231ps\t%%zmm1,\t%%zmm2,\t%%zmm0\n\t" // Perform a fused multiply add + "vmovaps\t\t64(%%r10),\t%%zmm3\n\t" // Load two vectors. + "vmovaps\t\t64(%%r12),\t%%zmm4\n\t" + "vprefetch1\t320(%%r10)\n\t" // prefetch the block after the block after the next float32x16_t block (320 bytes ahead) + "vprefetch1\t320(%%r12)\n\t" + "vfmadd231ps\t%%zmm3,\t%%zmm4,\t%%zmm0\n\t" // Perform a fused multiply add + "vmovaps\t\t128(%%r10),\t%%zmm5\n\t" // Load two vectors. + "vmovaps\t\t128(%%r12),\t%%zmm6\n\t" + "vprefetch1\t576(%%r10)\n\t" + "vprefetch1\t576(%%r12)\n\t" + "vprefetch1\t704(%%r10)\n\t" + "vprefetch1\t704(%%r12)\n\t" + "vfmadd231ps\t%%zmm5,\t%%zmm6,\t%%zmm0\n\t" // Perform a fused multiply add + "vmovaps\t\t192(%%r10),\t%%zmm7\n\t" // Load two vectors. + "vmovaps\t\t192(%%r12),\t%%zmm8\n\t" + "vfmadd231ps\t%%zmm7,\t%%zmm8,\t%%zmm0\n\t" // Perform a fused multiply add + "add\t$256,\t%%r10\n\t" // Move to the next 4xfloat32x16_t block (256 bytes ahead) + "add\t$256,\t%%r12\n\t" + "cmp\t$4,\t%%r8\n\t" // Compare iteration count to four. + "jge\t1b\n\t" // If there are four or more iterations left, loop. + "6:\n\t" // We know we are near the tail. handle 3, 2, 1, and 0 cases. + "cmp\t$0,\t%%r8\n\t" // Compare iterations to zero + "jz\t2f\n\t" // Jump to label 2 if zero (end of loop) + "cmp\t$1,\t%%r8\n\t" // Compare iterations to one + "vmovaps\t\t(%%r10),\t%%zmm1\n\t" // Load two vectors. + "vmovaps\t\t(%%r12),\t%%zmm2\n\t" + "vfmadd231ps\t%%zmm1,\t%%zmm2,\t%%zmm0\n\t" // Perform a fused multiply add + "je\t2f\n\t" // Jump to label 2 if one (end of loop) + "cmp\t$2,\t%%r8\n\t" // Compare iterations to two + "vmovaps\t\t64(%%r10),\t%%zmm3\n\t" // Load two vectors. + "vmovaps\t\t64(%%r12),\t%%zmm4\n\t" + "vfmadd231ps\t%%zmm3,\t%%zmm4,\t%%zmm0\n\t" // Perform a fused multiply add + "je\t2f\n\t" // Jump to label 2 if two (end of loop) + // No compare. we must be three. + "vmovaps\t\t128(%%r10),\t%%zmm5\n\t" // Load two vectors. + "vmovaps\t\t128(%%r12),\t%%zmm6\n\t" + "vfmadd231ps\t%%zmm5,\t%%zmm6,\t%%zmm0\n\t" // Perform a fused multiply add + "2:\n\t" // Label for loop end + "vmovnraps\t\t%%zmm0,\t(%[RES])\n\t" // Save our results. + : [RES] "+r" (sumvec) + : [ITER] "r" (iterations), + [VEC1] "r" (mvec1), + [VEC2] "r" (mvec2), + [CLR] "r" (clear), + [Z] "m" (zero) + : "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "cc", "memory", "r8", "r10", "r12"); +} + +// Multiply each item in mvec1 with the corresponding item in mvec2, adding the result to the corresponding item in sum. uses masks to handle just the last run-through. +inline static void GGML_F32x16_VEC_FMA_TAIL(const float32x16_t *mvec1, const float32x16_t *mvec2, float32x16_t *sumvec, size_t items) +{ + uint32_t mask = (0x00000001 << items)-1; + + __asm__ __volatile__ ( + "vprefetchnta\t(%[VEC1])\n\t" + "vprefetchnta\t(%[VEC2])\n\t" + "vmovaps\t\t(%[RES]),\t%%zmm0\n\t" // Load our inital state from sum.. 
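        // MASK is (1 << items) - 1, so k1 enables only the first 'items' lanes: the partial
        // loads, the fused multiply-add and the store below touch just those leftover
        // elements and leave every other lane of the running sum untouched.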
+ "kmov\t%[MASK],%%k1\n\t" // Load a mask that we will use to just operate on part of a vector.. + "vmovaps\t\t(%[VEC1]),\t%%zmm1%{%%k1%}\n\t" // Partially two vectors. + "vmovaps\t\t(%[VEC2]),\t%%zmm2%{%%k1%}\n\t" + "vfmadd231ps\t%%zmm1,\t%%zmm2,\t%%zmm0%{%%k1%}\n\t" // Perform a fused multiply add + "vmovnraps\t\t%%zmm0,\t(%[RES])%{%%k1%}\n\t" // save our results. + : [RES] "+r" (sumvec) + : [VEC1] "r" (mvec1), + [VEC2] "r" (mvec2), + [MASK] "r" (mask) + : "zmm0", "zmm1", "zmm2", "k1", "memory"); +} + +// NOTE: x and y inputs must be __attribute__((aligned(64))); +void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc) +{ + // our sum. + float32x16_t sum; + + // the number of vector-sized steps we will need to do. + const uint32_t np = (n & ~(GGML_F32_EPR - 1)); + + GGML_F32x16_VEC_FMA((const float32x16_t *)x, (const float32x16_t *)y, &sum, np/GGML_F32_EPR, 1); + + // add the leftovers, that could not be handled by the whole vector loop. + if ( n - np != 0 ) GGML_F32x16_VEC_FMA_TAIL((const float32x16_t *)&x[np], (const float32x16_t *)&y[np], &sum, n-np); + + // reduce sum, and store it in s. + for (uint32_t i=0; i < GGML_F32_EPR; ++i) + *s+=((float *)&sum)[i]; + +} diff --git a/ggml-phi-knc.h b/ggml-phi-knc.h new file mode 100644 index 0000000000000..94d444627c37c --- /dev/null +++ b/ggml-phi-knc.h @@ -0,0 +1,16 @@ +// Formatted with: indent -npcs -nlp -i4 -l300 +#pragma once + +#include "ggml.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + + /* A forward declaration, to keep GCC happy. */ + void ggml_vec_dot_f32(int n, float *restrict s, size_t bs, const float *restrict x, size_t bx, const float *restrict y, size_t by, int nrc); + +#ifdef __cplusplus +} +#endif diff --git a/ggml-quants.c b/ggml-quants.c index 32e84434a8c1b..4ae01085800f4 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -4,6 +4,7 @@ #include "ggml-quants.h" #include "ggml-impl.h" +// FIXME: why do we import this twice? #define GGML_COMMON_IMPL_C #include "ggml-common.h" @@ -49,6 +50,11 @@ #include #endif +// hand assembled replacement functions are cool. +#if defined(__k1om__) +#include +#endif + #undef MIN #undef MAX @@ -3636,7 +3642,7 @@ void quantize_row_q8_K(const float * restrict x, void * restrict y, int64_t k) { quantize_row_q8_K_reference(x, y, k); } -//===================================== Dot ptoducts ================================= +//===================================== Dot products ================================= // // Helper functions @@ -7153,6 +7159,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r } #endif +#if defined(__k1om__) +/* We get this from elsewhere. 
*/ +#else #if QK_K == 256 void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); @@ -7577,7 +7586,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r #endif } -#else +#else /* QK_K != 256 */ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); @@ -7846,8 +7855,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r *s = sumf; #endif } -#endif +#endif /* end QK_K != 256 */ +#endif /* defined(__k1om__) */ #if QK_K == 256 void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { diff --git a/ggml.c b/ggml.c index 14288d29ded6b..f21dace464765 100644 --- a/ggml.c +++ b/ggml.c @@ -42,6 +42,12 @@ #pragma warning(disable: 4996) #endif +// hand assembled replacement functions are cool. +#if defined(__k1om__) +#include +#include +#endif + #if defined(_WIN32) #define WIN32_LEAN_AND_MEAN @@ -330,6 +336,14 @@ const char * ggml_status_to_string(enum ggml_status status) { // note: do not use these inside ggml.c // these are meant to be used via the ggml.h API +#if defined(__k1om__) + +#define ggml_fp16_to_fp32 GGML_PHI_FP16_TO_FP32 +#define ggml_fp32_to_fp16 GGML_PHI_FP32_TO_FP16 +#define ggml_fp16_to_fp32_row GGML_PHI_FP16_TO_FP32_ROW +#define ggml_fp32_to_fp16_row GGML_PHI_FP32_TO_FP16_ROW + +#else float ggml_fp16_to_fp32(ggml_fp16_t x) { return GGML_FP16_TO_FP32(x); } @@ -363,6 +377,8 @@ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) { } } +#endif /* defined(__k1om__) */ + bool ggml_guid_matches(ggml_guid_t guid_a, ggml_guid_t guid_b) { return memcmp(guid_a, guid_b, sizeof(ggml_guid)) == 0; } @@ -496,7 +512,11 @@ FILE * ggml_fopen(const char * fname, const char * mode) { static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float); +#if defined(__k1om__) +// We get this function from elsewhere. +#else static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc); +#endif static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc); static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { @@ -1498,6 +1518,9 @@ inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; } inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; } +#if defined(__k1om__) +// we get this function from elsewhere. 
+#else static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc) { assert(nrc == 1); UNUSED(nrc); @@ -1540,6 +1563,7 @@ static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * *s = sumf; } +#endif static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc) { assert(nrc == 1); @@ -2329,9 +2353,10 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) { # if !defined(SYS_getcpu) && defined(SYS_get_cpu) # define SYS_getcpu SYS_get_cpu // some older glibc versions use this name # endif +# if defined(SYS_getcpu) getcpu_ret = syscall(SYS_getcpu, ¤t_cpu, &g_state.numa.current_node); #endif - +#endif if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1 || getcpu_ret != 0) { g_state.numa.n_nodes = 0; return; @@ -21766,4 +21791,12 @@ int ggml_cpu_has_matmul_int8(void) { #endif } +int ggml_cpu_is_xeonphi_knc(void) { +#if defined(__k1om__) + return 1; +#else + return 0; +#endif +} + //////////////////////////////////////////////////////////////////////////////// diff --git a/ggml.h b/ggml.h index e9ed8eeee7919..c1f61ccbac131 100644 --- a/ggml.h +++ b/ggml.h @@ -2369,6 +2369,7 @@ extern "C" { GGML_API int ggml_cpu_has_sycl (void); GGML_API int ggml_cpu_has_vsx (void); GGML_API int ggml_cpu_has_matmul_int8(void); + GGML_API int ggml_cpu_is_xeonphi_knc (void); // // Internal types and functions exposed for tests and benchmarks diff --git a/llama.cpp b/llama.cpp index a5ef2fd8fa575..2a1631a63841b 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7,6 +7,15 @@ #include "ggml-alloc.h" #include "ggml-backend.h" +// hand assembled replacement functions are cool. +#if defined(__k1om__) +#include "ggml-phi-knc-dot_q5_K_q8_K.h" + +#define ggml_fp16_to_fp32_row GGML_PHI_FP16_TO_FP32_ROW +#define ggml_fp32_to_fp16_row GGML_PHI_FP32_TO_FP16_ROW + +#endif + #ifdef GGML_USE_CUDA # include "ggml-cuda.h" #elif defined(GGML_USE_CLBLAST) @@ -16779,6 +16788,7 @@ const char * llama_print_system_info(void) { s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | "; s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | "; s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | "; + s += "XEONPHI_KNC = " + std::to_string(ggml_cpu_is_xeonphi_knc()) + " | "; return s.c_str(); }
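For reference, the new capability flag added to ggml.h behaves like the existing
ggml_cpu_has_* helpers: it is always compiled in and simply reports whether the binary was
built with the k1om cross toolchain. A minimal sketch of querying it from user code (nothing
used here beyond what this patch declares):

#include <stdio.h>

#include "ggml.h"

int main(void)
{
    /* Prints 1 only for a k1om (Xeon Phi Knights Corner) build, 0 otherwise. */
    printf("XEONPHI_KNC = %d\n", ggml_cpu_is_xeonphi_knc());
    return 0;
}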