diff --git a/src/abi_aarch64.cpp b/src/abi_aarch64.cpp
new file mode 100644
index 0000000000000..07b60314a1a89
--- /dev/null
+++ b/src/abi_aarch64.cpp
@@ -0,0 +1,282 @@
+// This file is a part of Julia. License is MIT: http://julialang.org/license
+
+//===----------------------------------------------------------------------===//
+//
+// The ABI implementation used for AArch64 targets.
+//
+//===----------------------------------------------------------------------===//
+//
+// The Procedure Call Standard can be found here:
+// http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055b/IHI0055B_aapcs64.pdf
+//
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+typedef bool AbiState; // placeholder; AbiState* arguments below are unused
+static const AbiState default_abi_state = 0;
+
+static Type *get_llvm_fptype(jl_datatype_t *dt)
+{
+    // Returns the LLVM floating-point type matching `dt`, or NULL if none.
+    // Assume jl_is_datatype(dt) && !jl_is_abstracttype(dt)
+    if (dt->mutabl || jl_datatype_nfields(dt) != 0)
+        return NULL; // FP types are immutable and have no fields
+    Type *lltype; // Check size first since it's cheaper.
+    switch (dt->size) {
+    case 2:
+        lltype = T_float16;
+        break;
+    case 4:
+        lltype = T_float32;
+        break;
+    case 8:
+        lltype = T_float64;
+        break;
+    case 16:
+        lltype = T_float128;
+        break;
+    default:
+        return NULL;
+    }
+    return jl_is_floattype(dt) ? lltype : NULL;
+}
+
+// Whether a type is a homogeneous floating-point aggregate (HFA) or a
+// homogeneous short-vector aggregate (HVA). Returns the number of members,
+// or 0 if it is neither. We only handle HFA of HP, SP and DP here since
+// these are the only ones we have (no QP).
+static size_t isHFAorHVA(jl_datatype_t *dt)
+{
+    // Assume jl_is_datatype(dt) && !jl_is_abstracttype(dt)
+
+    // An Homogeneous Floating-point Aggregate (HFA) is an Homogeneous Aggregate
+    // with a Fundamental Data Type that is a Floating-Point type and at most
+    // four uniquely addressable members.
+    // An Homogeneous Short-Vector Aggregate (HVA) is an Homogeneous Aggregate
+    // with a Fundamental Data Type that is a Short-Vector type and at most four
+    // uniquely addressable members.
+    size_t members = jl_datatype_nfields(dt);
+    if (members < 1 || members > 4)
+        return 0; // wrong member count for an HFA/HVA
+    // There's at least one member (assumed to be a datatype - TODO confirm)
+    jl_value_t *ftype = jl_field_type(dt, 0);
+    if (!get_llvm_fptype((jl_datatype_t*)ftype))
+        return 0; // get_llvm_fptype found no FP type for the first member
+    for (size_t i = 1;i < members;i++) {
+        if (ftype != jl_field_type(dt, i)) {
+            return 0; // members are not all of the same type
+        }
+    }
+    return members;
+}
+
+void needPassByRef(AbiState*, jl_value_t *ty, bool *byRef, bool*)
+{
+    // Assume jl_is_datatype(ty) && !jl_is_abstracttype(ty)
+    jl_datatype_t *dt = (jl_datatype_t*)ty;
+    // B.2
+    //   If the argument type is an HFA or an HVA, then the argument is used
+    //   unmodified.
+    if (isHFAorHVA(dt))
+        return; // *byRef is not written on this path
+    // B.3
+    //   If the argument type is a Composite Type that is larger than 16 bytes,
+    //   then the argument is copied to memory allocated by the caller and the
+    //   argument is replaced by a pointer to the copy.
+    // We only check for the total size and not whether it is a composite type
+    // since there's no corresponding C type and we just treat such large
+    // bitstype as a composite type of the right size.
+    *byRef = dt->size > 16; // by reference iff larger than 16 bytes
+    // B.4
+    //   If the argument type is a Composite Type then the size of the argument
+    //   is rounded up to the nearest multiple of 8 bytes.
+}
+
+bool need_private_copy(jl_value_t*, bool)
+{
+    return false; // unconditional: both arguments are ignored
+}
+
+// Determine which kind of register the argument will be passed in and
+// if the argument has to be passed on stack (including by reference).
+//
+// If the argument should be passed in SIMD and floating-point registers,
+// we may need to rewrite the argument types to [n x ftype].
+// If the argument should be passed in general purpose registers, we may need
+// to rewrite the argument types to [n x i64].
+//
+// If the argument has to be passed on stack, we need to use sret.
+//
+// All the out parameters should be initialized to `false` by the caller.
+static void classify_arg(jl_value_t *ty, bool *fpreg, bool *onstack,
+                         bool *need_rewrite)
+{
+    // Assume jl_is_datatype(ty) && !jl_is_abstracttype(ty)
+    jl_datatype_t *dt = (jl_datatype_t*)ty;
+
+    // Based on section 5.4 C of the Procedure Call Standard
+    // C.1
+    //   If the argument is a Half-, Single-, Double- or Quad- precision
+    //   Floating-point or Short Vector Type and the NSRN is less than 8, then
+    //   the argument is allocated to the least significant bits of register
+    //   v[NSRN]. The NSRN is incremented by one. The argument has now been
+    //   allocated.
+    // Note that this is missing QP float as well as short vector types since we
+    // don't really have those types.
+    if (get_llvm_fptype(dt)) {
+        *fpreg = true;
+        return; // scalar float: FP register, no rewrite requested
+    }
+
+    // C.2
+    //   If the argument is an HFA or an HVA and there are sufficient
+    //   unallocated SIMD and Floating-point registers (NSRN + number of
+    //   members <= 8), then the argument is allocated to SIMD and
+    //   Floating-point Registers (with one register per member of the HFA
+    //   or HVA). The NSRN is incremented by the number of registers used.
+    //   The argument has now been allocated.
+    if (isHFAorHVA(dt)) { // HFA and HVA have <= 4 members
+        *fpreg = true;
+        *need_rewrite = true;
+        return; // FP registers, with the type rewritten by the caller
+    }
+
+    // Check if the argument needs to be passed by reference. This should be
+    // done before starting step C but we do this here to avoid checking for
+    // HFA and HVA twice. We don't check whether it is a composite type.
+    // See `needPassByRef` above.
+    if (dt->size > 16) {
+        *onstack = true;
+        return; // larger than 16 bytes: not passed in registers
+    }
+
+    // C.3
+    //   If the argument is an HFA or an HVA then the NSRN is set to 8 and the
+    //   size of the argument is rounded up to the nearest multiple of 8 bytes.
+    // C.4
+    //   If the argument is an HFA, an HVA, a Quad-precision Floating-point or
+    //   Short Vector Type then the NSAA is rounded up to the larger of 8 or
+    //   the Natural Alignment of the argument’s type.
+    // C.5
+    //   If the argument is a Half- or Single- precision Floating Point type,
+    //   then the size of the argument is set to 8 bytes. The effect is as if
+    //   the argument had been copied to the least significant bits of a 64-bit
+    //   register and the remaining bits filled with unspecified values.
+    // C.6
+    //   If the argument is an HFA, an HVA, a Half-, Single-, Double- or
+    //   Quad- precision Floating-point or Short Vector Type, then the argument
+    //   is copied to memory at the adjusted NSAA. The NSAA is incremented
+    //   by the size of the argument. The argument has now been allocated.
+    // <already included in the C.2 case above>
+    // C.7
+    //   If the argument is an Integral or Pointer Type, the size of the
+    //   argument is less than or equal to 8 bytes and the NGRN is less than 8,
+    //   the argument is copied to the least significant bits in x[NGRN].
+    //   The NGRN is incremented by one. The argument has now been allocated.
+    // Here we treat any bitstype of the right size as integers or pointers
+    // This is needed for types like Cstring which should be treated as
+    // pointers. We don't need to worry about floating points here since they
+    // are handled above.
+    if (jl_is_immutable(dt) && jl_datatype_nfields(dt) == 0 &&
+        (dt->size == 1 || dt->size == 2 || dt->size == 4 ||
+         dt->size == 8 || dt->size == 16))
+        return; // integer-like bitstype: no out parameter is set
+
+    // C.8
+    //   If the argument has an alignment of 16 then the NGRN is rounded up to
+    //   the next even number.
+    // C.9
+    //   If the argument is an Integral Type, the size of the argument is equal
+    //   to 16 and the NGRN is less than 7, the argument is copied to x[NGRN]
+    //   and x[NGRN+1]. x[NGRN] shall contain the lower addressed double-word
+    //   of the memory representation of the argument. The NGRN is incremented
+    //   by two. The argument has now been allocated.
+    // <merged into C.7 above>
+    // C.10
+    //   If the argument is a Composite Type and the size in double-words of
+    //   the argument is not more than 8 minus NGRN, then the argument is
+    //   copied into consecutive general-purpose registers, starting at x[NGRN].
+    //   The argument is passed as though it had been loaded into the registers
+    //   from a double-word-aligned address with an appropriate sequence of LDR
+    //   instructions loading consecutive registers from memory (the contents of
+    //   any unused parts of the registers are unspecified by this standard).
+    //   The NGRN is incremented by the number of registers used. The argument
+    //   has now been allocated.
+    // We don't check for composite types here since the ones that have
+    // corresponding C types are already handled and we just treat the ones
+    // with weird size as a black box composite type.
+    // The type can fit in 2 x 8 bytes since anything larger than 16 bytes
+    // already returned with `*onstack` set above.
+    *need_rewrite = true; // not FP: the caller rewrites the type
+
+    // C.11
+    //   The NGRN is set to 8.
+    // C.12
+    //   The NSAA is rounded up to the larger of 8 or the Natural Alignment
+    //   of the argument’s type.
+    // C.13
+    //   If the argument is a composite type then the argument is copied to
+    //   memory at the adjusted NSAA. The NSAA is incremented by the size of
+    //   the argument. The argument has now been allocated.
+    // <handled by C.10 above>
+    // C.14
+    //   If the size of the argument is less than 8 bytes then the size of the
+    //   argument is set to 8 bytes. The effect is as if the argument was
+    //   copied to the least significant bits of a 64-bit register and the
+    //   remaining bits filled with unspecified values.
+    // C.15
+    //   The argument is copied to memory at the adjusted NSAA. The NSAA is
+    //   incremented by the size of the argument. The argument has now been
+    //   allocated.
+    // <handled by C.10 above>
+}
+
+bool use_sret(AbiState*, jl_value_t *ty)
+{
+    // Assume jl_is_datatype(ty) && !jl_is_abstracttype(ty)
+    // Section 5.5
+    // If the type, T, of the result of a function is such that
+    //
+    //     void func(T arg)
+    //
+    // would require that arg be passed as a value in a register (or set of
+    // registers) according to the rules in section 5.4 Parameter Passing,
+    // then the result is returned in the same registers as would be used for
+    // such an argument.
+    bool fpreg = false; // classify_arg expects all three flags to start false
+    bool onstack = false;
+    bool need_rewrite = false;
+    classify_arg(ty, &fpreg, &onstack, &need_rewrite);
+    return onstack; // only the on-stack verdict matters for sret
+}
+
+Type *preferred_llvm_type(jl_value_t *ty, bool)
+{
+    if (!jl_is_datatype(ty) || jl_is_abstracttype(ty))
+        return NULL; // the helpers below assume a non-abstract datatype
+    jl_datatype_t *dt = (jl_datatype_t*)ty;
+    if (Type *fptype = get_llvm_fptype(dt))
+        return fptype; // scalar floating-point type
+    bool fpreg = false;
+    bool onstack = false;
+    bool need_rewrite = false;
+    classify_arg(ty, &fpreg, &onstack, &need_rewrite);
+    if (!need_rewrite)
+        return NULL; // classify_arg requested no rewrite
+    if (fpreg) {
+        // Rewrite to [n x fptype] where n is the number of fields
+        // This only happens for isHFAorHVA
+        size_t members = jl_datatype_nfields(dt);
+        assert(members > 0 && members <= 4);
+        jl_datatype_t *eltype = (jl_datatype_t*)jl_field_type(dt, 0);
+        return ArrayType::get(get_llvm_fptype(eltype), members);
+    }
+    else {
+        // Rewrite to [n x Int64] where n is the **size in dword**
+        assert(dt->size <= 16); // Should be pass by reference otherwise
+        return ArrayType::get(T_int64, (dt->size + 7) >> 3); // round up to dwords
+    }
+}
+
+}
diff --git a/src/ccall.cpp b/src/ccall.cpp
index dc10d46cdc802..9803c03799780 100644
--- a/src/ccall.cpp
+++ b/src/ccall.cpp
@@ -141,6 +141,8 @@ static Value *runtime_sym_lookup(PointerType *funcptype, const char *f_lib, cons
 #  else
 #    include "abi_x86.cpp"
 #  endif
+#elif defined _CPU_AARCH64_
+#    include "abi_aarch64.cpp"
 #else
 #  warning "ccall is defaulting to llvm ABI, since no platform ABI has been defined for this CPU/OS combination"
 #  include "abi_llvm.cpp"
@@ -900,8 +902,12 @@ static std::string generate_func_sig(
                 // Note that even though the LLVM argument is called ByVal
                 // this really means that the thing we're passing is pointing to
                 // the thing we want to pass by value
+#ifndef _CPU_AARCH64_
+                // the aarch64 backend seems to interpret ByVal as
+                // implicitly passed on stack.
                 if (byRef)
                     paramattrs[i + sret].addAttribute(Attribute::ByVal);
+#endif
                 if (inReg)
                     paramattrs[i + sret].addAttribute(Attribute::InReg);
                 if (av != Attribute::None)
diff --git a/src/cgutils.cpp b/src/cgutils.cpp
index 7dca7cc83d5d0..7a5b0c0b598ac 100644
--- a/src/cgutils.cpp
+++ b/src/cgutils.cpp
@@ -923,15 +923,15 @@ JL_DLLEXPORT Type *julia_type_to_llvm(jl_value_t *jt, bool *isboxed)
         if (jl_is_floattype(jt)) {
 #ifndef DISABLE_FLOAT16
             if (nb == 2)
-                return Type::getHalfTy(jl_LLVMContext);
+                return T_float16;
             else
 #endif
             if (nb == 4)
-                return Type::getFloatTy(jl_LLVMContext);
+                return T_float32;
             else if (nb == 8)
-                return Type::getDoubleTy(jl_LLVMContext);
+                return T_float64;
             else if (nb == 16)
-                return Type::getFP128Ty(jl_LLVMContext);
+                return T_float128;
         }
         return Type::getIntNTy(jl_LLVMContext, nb*8);
     }
diff --git a/src/codegen.cpp b/src/codegen.cpp
index 9d0f8f707bdf8..fdaae6403bb5c 100644
--- a/src/codegen.cpp
+++ b/src/codegen.cpp
@@ -246,8 +246,10 @@ static IntegerType *T_uint64;
 static IntegerType *T_char;
 static IntegerType *T_size;
 
+static Type *T_float16;
 static Type *T_float32;
 static Type *T_float64;
+static Type *T_float128;
 
 static Type *T_pint8;
 static Type *T_pint16;
@@ -5512,10 +5514,12 @@ static void init_julia_llvm_env(Module *m)
     else
         T_size = T_uint32;
     T_psize = PointerType::get(T_size, 0);
+    T_float16 = Type::getHalfTy(getGlobalContext());
     T_float32 = Type::getFloatTy(getGlobalContext());
     T_pfloat32 = PointerType::get(T_float32, 0);
     T_float64 = Type::getDoubleTy(getGlobalContext());
     T_pfloat64 = PointerType::get(T_float64, 0);
+    T_float128 = Type::getFP128Ty(getGlobalContext());
     T_void = Type::getVoidTy(jl_LLVMContext);
     T_pvoidfunc = FunctionType::get(T_void, /*isVarArg*/false)->getPointerTo();
 
diff --git a/src/init.c b/src/init.c
index a1a3601969a37..edf4f4a70e200 100644
--- a/src/init.c
+++ b/src/init.c
@@ -801,6 +801,7 @@ void jl_get_builtin_hooks(void)
     jl_uint32_type  = (jl_datatype_t*)core("UInt32");
     jl_uint64_type  = (jl_datatype_t*)core("UInt64");
 
+    jl_float16_type = (jl_datatype_t*)core("Float16");
     jl_float32_type = (jl_datatype_t*)core("Float32");
     jl_float64_type = (jl_datatype_t*)core("Float64");
     jl_floatingpoint_type = (jl_datatype_t*)core("AbstractFloat");
diff --git a/src/intrinsics.cpp b/src/intrinsics.cpp
index c2302016f96ca..4433c0c2a3608 100644
--- a/src/intrinsics.cpp
+++ b/src/intrinsics.cpp
@@ -61,15 +61,15 @@ static Type *FTnbits(size_t nb)
 {
 #ifndef DISABLE_FLOAT16
     if (nb == 16)
-        return Type::getHalfTy(jl_LLVMContext);
+        return T_float16;
     else
 #endif
     if (nb == 32)
-        return Type::getFloatTy(jl_LLVMContext);
+        return T_float32;
     else if (nb == 64)
-        return Type::getDoubleTy(jl_LLVMContext);
+        return T_float64;
     else if (nb == 128)
-        return Type::getFP128Ty(jl_LLVMContext);
+        return T_float128;
     else
         jl_error("Unsupported Float Size");
 }
@@ -107,7 +107,7 @@ static jl_value_t *JL_JLUINTT(Type *t)
     assert(!t->isIntegerTy());
     if (t == T_float32) return (jl_value_t*)jl_uint32_type;
     if (t == T_float64) return (jl_value_t*)jl_uint64_type;
-    if (t == Type::getHalfTy(jl_LLVMContext)) return (jl_value_t*)jl_uint16_type;
+    if (t == T_float16) return (jl_value_t*)jl_uint16_type;
     assert(t == T_void);
     return jl_bottom_type;
 }
@@ -116,7 +116,7 @@ static jl_value_t *JL_JLSINTT(Type *t)
     assert(!t->isIntegerTy());
     if (t == T_float32) return (jl_value_t*)jl_int32_type;
     if (t == T_float64) return (jl_value_t*)jl_int64_type;
-    if (t == Type::getHalfTy(jl_LLVMContext)) return (jl_value_t*)jl_int16_type;
+    if (t == T_float16) return (jl_value_t*)jl_int16_type;
     assert(t == T_void);
     return jl_bottom_type;
 }
diff --git a/src/jltypes.c b/src/jltypes.c
index db316bbd50dff..ac717be4fbc70 100644
--- a/src/jltypes.c
+++ b/src/jltypes.c
@@ -50,6 +50,7 @@ jl_datatype_t *jl_int32_type;
 jl_datatype_t *jl_uint32_type;
 jl_datatype_t *jl_int64_type;
 jl_datatype_t *jl_uint64_type;
+jl_datatype_t *jl_float16_type;
 jl_datatype_t *jl_float32_type;
 jl_datatype_t *jl_float64_type;
 jl_datatype_t *jl_floatingpoint_type;
diff --git a/src/julia.h b/src/julia.h
index b93c268f284b9..120783c33e507 100644
--- a/src/julia.h
+++ b/src/julia.h
@@ -25,6 +25,9 @@ extern "C" {
 #  define jl_jmp_buf sigjmp_buf
 #  if defined(_CPU_ARM_)
 #    define MAX_ALIGN 8
+#  elif defined(_CPU_AARCH64_)
+// int128 is 16-byte aligned on aarch64
+#    define MAX_ALIGN 16
 #  else
 #    define MAX_ALIGN sizeof(void*)
 #  endif
@@ -508,6 +511,7 @@ extern JL_DLLEXPORT jl_datatype_t *jl_int32_type;
 extern JL_DLLEXPORT jl_datatype_t *jl_uint32_type;
 extern JL_DLLEXPORT jl_datatype_t *jl_int64_type;
 extern JL_DLLEXPORT jl_datatype_t *jl_uint64_type;
+extern JL_DLLEXPORT jl_datatype_t *jl_float16_type;
 extern JL_DLLEXPORT jl_datatype_t *jl_float32_type;
 extern JL_DLLEXPORT jl_datatype_t *jl_float64_type;
 extern JL_DLLEXPORT jl_datatype_t *jl_floatingpoint_type;