diff --git a/sycl/include/CL/__spirv/spirv_vars.hpp b/sycl/include/CL/__spirv/spirv_vars.hpp
index 74670fa0fb557..f8cff7f39590e 100644
--- a/sycl/include/CL/__spirv/spirv_vars.hpp
+++ b/sycl/include/CL/__spirv/spirv_vars.hpp
@@ -24,6 +24,8 @@ extern "C" const __constant size_t_vec __spirv_BuiltInGlobalOffset;
   template <> size_t get##POSTFIX<1>() { return __spirv_BuiltIn##POSTFIX.y; }  \
   template <> size_t get##POSTFIX<2>() { return __spirv_BuiltIn##POSTFIX.z; }
 
+namespace __spirv {
+
 DEFINE_INT_ID_TO_XYZ_CONVERTER(GlobalSize);
 DEFINE_INT_ID_TO_XYZ_CONVERTER(GlobalInvocationId)
 DEFINE_INT_ID_TO_XYZ_CONVERTER(WorkgroupSize)
@@ -31,6 +33,8 @@ DEFINE_INT_ID_TO_XYZ_CONVERTER(LocalInvocationId)
 DEFINE_INT_ID_TO_XYZ_CONVERTER(WorkgroupId)
 DEFINE_INT_ID_TO_XYZ_CONVERTER(GlobalOffset)
 
+} // namespace __spirv
+
 #undef DEFINE_INT_ID_TO_XYZ_CONVERTER
 
 extern "C" const __constant uint32_t __spirv_BuiltInSubgroupSize;
@@ -40,4 +44,45 @@ extern "C" const __constant uint32_t __spirv_BuiltInNumEnqueuedSubgroups;
 extern "C" const __constant uint32_t __spirv_BuiltInSubgroupId;
 extern "C" const __constant uint32_t __spirv_BuiltInSubgroupLocalInvocationId;
 
+#define DEFINE_INIT_SIZES(POSTFIX)                                             \
+                                                                               \
+  template <int Dim, class DstT> struct InitSizesST##POSTFIX;                  \
+                                                                               \
+  template <class DstT> struct InitSizesST##POSTFIX<1, DstT> {                 \
+    static void initSize(DstT &Dst) {                                          \
+      Dst[0] = get##POSTFIX<0>();                                              \
+    }                                                                          \
+  };                                                                           \
+                                                                               \
+  template <class DstT> struct InitSizesST##POSTFIX<2, DstT> {                 \
+    static void initSize(DstT &Dst) {                                          \
+      Dst[1] = get##POSTFIX<1>();                                              \
+      InitSizesST##POSTFIX<1, DstT>::initSize(Dst);                            \
+    }                                                                          \
+  };                                                                           \
+                                                                               \
+  template <class DstT> struct InitSizesST##POSTFIX<3, DstT> {                 \
+    static void initSize(DstT &Dst) {                                          \
+      Dst[2] = get##POSTFIX<2>();                                              \
+      InitSizesST##POSTFIX<2, DstT>::initSize(Dst);                            \
+    }                                                                          \
+  };                                                                           \
+                                                                               \
+  template <int Dims, class DstT> static void init##POSTFIX(DstT &Dst) {       \
+    InitSizesST##POSTFIX<Dims, DstT>::initSize(Dst);                           \
+  }
+// init<Builtin><Dims>(Dst): copy the builtin's first Dims components to Dst.
+namespace __spirv {
+
+DEFINE_INIT_SIZES(GlobalSize)
+DEFINE_INIT_SIZES(GlobalInvocationId)
+DEFINE_INIT_SIZES(WorkgroupSize)
+DEFINE_INIT_SIZES(LocalInvocationId)
+DEFINE_INIT_SIZES(WorkgroupId)
+DEFINE_INIT_SIZES(GlobalOffset)
+
+} // namespace __spirv
+
+#undef DEFINE_INIT_SIZES
+
 #endif // __SYCL_DEVICE_ONLY__
diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp
index 10f71aa8c72d6..62ad16e71cd35 100644
--- a/sycl/include/CL/sycl/handler.hpp
+++ b/sycl/include/CL/sycl/handler.hpp
@@ -67,46 +67,9 @@ namespace csd = cl::sycl::detail;
 template <typename T, int Dimensions, typename AllocatorT> class buffer;
 namespace detail {
 
-#ifdef __SYCL_DEVICE_ONLY__
-
-#define DEFINE_INIT_SIZES(POSTFIX)                                             \
-                                                                               \
-  template <int Dim, class DstT> struct InitSizesST##POSTFIX;                  \
-                                                                               \
-  template <class DstT> struct InitSizesST##POSTFIX<1, DstT> {                 \
-    static void initSize(DstT &Dst) {                                          \
-      Dst[0] = get##POSTFIX<0>();                                 \
-    }                                                                          \
-  };                                                                           \
-                                                                               \
-  template <class DstT> struct InitSizesST##POSTFIX<2, DstT> {                 \
-    static void initSize(DstT &Dst) {                                          \
-      Dst[1] = get##POSTFIX<1>();                                 \
-      InitSizesST##POSTFIX<1, DstT>::initSize(Dst);                            \
-    }                                                                          \
-  };                                                                           \
-                                                                               \
-  template <class DstT> struct InitSizesST##POSTFIX<3, DstT> {                 \
-    static void initSize(DstT &Dst) {                                          \
-      Dst[2] = get##POSTFIX<2>();                                 \
-      InitSizesST##POSTFIX<2, DstT>::initSize(Dst);                            \
-    }                                                                          \
-  };                                                                           \
-                                                                               \
-  template <int Dims, class DstT> static void init##POSTFIX(DstT &Dst) {       \
-    InitSizesST##POSTFIX<Dims, DstT>::initSize(Dst);                           \
-  }
-
-DEFINE_INIT_SIZES(GlobalSize);
-DEFINE_INIT_SIZES(GlobalInvocationId)
-DEFINE_INIT_SIZES(WorkgroupSize)
-DEFINE_INIT_SIZES(LocalInvocationId)
-DEFINE_INIT_SIZES(WorkgroupId)
-DEFINE_INIT_SIZES(GlobalOffset)
-
-#undef DEFINE_INIT_SIZES
-
-#endif //__SYCL_DEVICE_ONLY__
+/// This class is the default KernelName template parameter type for kernel
+/// invocation APIs such as single_task.
+class auto_name {};
 
 class queue_impl;
 class stream_impl;
@@ -129,6 +92,19 @@ decltype(member_ptr_helper(&F::operator())) argument_helper(F);
 
 template <typename T>
 using lambda_arg_type = decltype(argument_helper(std::declval<T>()));
+
+/// Helper struct that selects a kernel name type from the given \c Name and
+/// \c Type: if \c Name is undefined (i.e. it is \c auto_name), \c Type is used
+/// as the kernel name; otherwise \c Name is used unchanged.
+template <typename Name, typename Type> struct get_kernel_name_t {
+  using name = Name;
+};
+
+/// Specialization for the case when \c Name is undefined.
+template <typename Type> struct get_kernel_name_t<csd::auto_name, Type> {
+  using name = Type;
+};
+
 } // namespace detail
 
 // Objects of the handler class collect information about command group, such as
@@ -548,7 +524,7 @@ class handler {
                               KernelType>::type KernelFunc) {
     id<dimensions> global_id;
 
-    detail::initGlobalInvocationId<dimensions>(global_id);
+    __spirv::initGlobalInvocationId<dimensions>(global_id);
 
     KernelFunc(global_id);
   }
@@ -562,8 +538,8 @@ class handler {
     id<dimensions> global_id;
     range<dimensions> global_size;
 
-    detail::initGlobalInvocationId<dimensions>(global_id);
-    detail::initGlobalSize<dimensions>(global_size);
+    __spirv::initGlobalInvocationId<dimensions>(global_id);
+    __spirv::initGlobalSize<dimensions>(global_size);
 
     item<dimensions, false> Item =
         detail::Builder::createItem<dimensions, false>(global_size, global_id);
@@ -583,12 +559,12 @@ class handler {
     id<dimensions> local_id;
     id<dimensions> global_offset;
 
-    detail::initGlobalSize<dimensions>(global_size);
-    detail::initWorkgroupSize<dimensions>(local_size);
-    detail::initWorkgroupId<dimensions>(group_id);
-    detail::initGlobalInvocationId<dimensions>(global_id);
-    detail::initLocalInvocationId<dimensions>(local_id);
-    detail::initGlobalOffset<dimensions>(global_offset);
+    __spirv::initGlobalSize<dimensions>(global_size);
+    __spirv::initWorkgroupSize<dimensions>(local_size);
+    __spirv::initWorkgroupId<dimensions>(group_id);
+    __spirv::initGlobalInvocationId<dimensions>(global_id);
+    __spirv::initLocalInvocationId<dimensions>(local_id);
+    __spirv::initGlobalOffset<dimensions>(global_offset);
 
     group<dimensions> Group = detail::Builder::createGroup<dimensions>(
         global_size, local_size, group_id);
@@ -631,83 +607,62 @@ class handler {
   }
 
   // single_task version with a kernel represented as a lambda.
-  template <typename KernelName, typename KernelType>
+  template <typename KernelName = csd::auto_name, typename KernelType>
   void single_task(KernelType KernelFunc) {
+    using NameT = typename csd::get_kernel_name_t<KernelName, KernelType>::name;
 #ifdef __SYCL_DEVICE_ONLY__
-    kernel_single_task<KernelName>(KernelFunc);
+    kernel_single_task<NameT>(KernelFunc);
 #else
     MNDRDesc.set(range<1>{1});
 
-    StoreLambda<KernelName, KernelType, /*Dims*/ 0, void>(KernelFunc);
+    StoreLambda<NameT, KernelType, /*Dims*/ 0, void>(KernelFunc);
     MCGType = detail::CG::KERNEL;
 #endif
   }
 
-  // single_task version with a kernel represented as a functor. Simply redirect
-  // to the lambda-based form of invocation, setting kernel name type to the
-  // functor type.
-  template <typename KernelFunctorType>
-  void single_task(KernelFunctorType KernelFunctor) {
-    single_task<KernelFunctorType, KernelFunctorType>(KernelFunctor);
-  }
-
   // parallel_for version with a kernel represented as a lambda + range that
   // specifies global size only.
-  template <typename KernelName, typename KernelType, int Dims>
+  template <typename KernelName = csd::auto_name, typename KernelType, int Dims>
   void parallel_for(range<Dims> NumWorkItems, KernelType KernelFunc) {
+    using NameT = typename csd::get_kernel_name_t<KernelName, KernelType>::name;
 #ifdef __SYCL_DEVICE_ONLY__
-    kernel_parallel_for<KernelName, KernelType, Dims>(KernelFunc);
+    kernel_parallel_for<NameT, KernelType, Dims>(KernelFunc);
 #else
     MNDRDesc.set(std::move(NumWorkItems));
-    StoreLambda<KernelName, KernelType, Dims>(std::move(KernelFunc));
+    StoreLambda<NameT, KernelType, Dims>(std::move(KernelFunc));
     MCGType = detail::CG::KERNEL;
 #endif
   }
 
-  // parallel_for version with a kernel represented as a functor + range that
-  // specifies global size only. Simply redirect to the lambda-based form of
-  // invocation, setting kernel name type to the functor type.
-  template <typename KernelType, int Dims>
-  void parallel_for(range<Dims> NumWorkItems, KernelType KernelFunc) {
-    parallel_for<KernelType, KernelType, Dims>(NumWorkItems, KernelFunc);
-  }
-
   // parallel_for version with a kernel represented as a lambda + range and
   // offset that specify global size and global offset correspondingly.
-  template <typename KernelName, typename KernelType, int Dims>
+  template <typename KernelName = csd::auto_name, typename KernelType, int Dims>
   void parallel_for(range<Dims> NumWorkItems, id<Dims> WorkItemOffset,
                     KernelType KernelFunc) {
+    using NameT = typename csd::get_kernel_name_t<KernelName, KernelType>::name;
 #ifdef __SYCL_DEVICE_ONLY__
-    kernel_parallel_for<KernelName, KernelType, Dims>(KernelFunc);
+    kernel_parallel_for<NameT, KernelType, Dims>(KernelFunc);
 #else
     MNDRDesc.set(std::move(NumWorkItems), std::move(WorkItemOffset));
-    StoreLambda<KernelName, KernelType, Dims>(std::move(KernelFunc));
+    StoreLambda<NameT, KernelType, Dims>(std::move(KernelFunc));
     MCGType = detail::CG::KERNEL;
 #endif
   }
 
   // parallel_for version with a kernel represented as a lambda + nd_range that
   // specifies global, local sizes and offset.
-  template <typename KernelName, typename KernelType, int Dims>
+  template <typename KernelName = csd::auto_name, typename KernelType, int Dims>
   void parallel_for(nd_range<Dims> ExecutionRange, KernelType KernelFunc) {
+    using NameT = typename csd::get_kernel_name_t<KernelName, KernelType>::name;
 #ifdef __SYCL_DEVICE_ONLY__
-    kernel_parallel_for<KernelName, KernelType, Dims>(KernelFunc);
+    kernel_parallel_for<NameT, KernelType, Dims>(KernelFunc);
 #else
     MNDRDesc.set(std::move(ExecutionRange));
-    StoreLambda<KernelName, KernelType, Dims>(std::move(KernelFunc));
+    StoreLambda<NameT, KernelType, Dims>(std::move(KernelFunc));
     MCGType = detail::CG::KERNEL;
 #endif
   }
 
-  // parallel_for version with a kernel represented as a functor + nd_range that
-  // specifies global, local sizes and offset. Simply redirect to the
-  // lambda-based form of invocation, setting kernel name type to the functor
-  // type.
-  template <typename KernelType, int Dims>
-  void parallel_for(nd_range<Dims> ExecutionRange, KernelType KernelFunc) {
-    parallel_for<KernelType, KernelType, Dims>(ExecutionRange, KernelFunc);
-  }
-
   // template <typename KernelName, typename WorkgroupFunctionType, int
   // dimensions>
   // void parallel_for_work_group(range<dimensions> numWorkGroups,
@@ -773,111 +728,82 @@ class handler {
   // single_task version which takes two "kernels". One is a lambda which is
   // used if device, queue is bound to, is host device. Second is a sycl::kernel
   // which is used otherwise.
-  template <typename KernelName, typename KernelType>
+  template <typename KernelName = csd::auto_name, typename KernelType>
   void single_task(kernel SyclKernel, KernelType KernelFunc) {
+    using NameT = typename csd::get_kernel_name_t<KernelName, KernelType>::name;
 #ifdef __SYCL_DEVICE_ONLY__
-    kernel_single_task<KernelName>(KernelFunc);
+    kernel_single_task<NameT>(KernelFunc);
 #else
     MNDRDesc.set(range<1>{1});
     MSyclKernel = detail::getSyclObjImpl(std::move(SyclKernel));
     MCGType = detail::CG::KERNEL;
-    if (!MIsHost && !lambdaAndKernelHaveEqualName<KernelName>())
+    if (!MIsHost && !lambdaAndKernelHaveEqualName<NameT>())
       extractArgsAndReqs();
     else
-      StoreLambda<KernelName, KernelType, /*Dims*/ 0, void>(
-          std::move(KernelFunc));
+      StoreLambda<NameT, KernelType, /*Dims*/ 0, void>(std::move(KernelFunc));
 #endif
   }
 
-  // single_task version which takes two "kernels". One is a functor which is
-  // used if device, queue is bound to, is host device. Second is a sycl::kernel
-  // which is used otherwise. Simply redirect to the lambda-based form of
-  // invocation, setting kernel name type to the functor type.
-  template <typename KernelType>
-  void single_task(kernel SyclKernel, KernelType KernelFunc) {
-    single_task<KernelType, KernelType>(SyclKernel, KernelFunc);
-  }
-
   // parallel_for version which takes two "kernels". One is a lambda which is
   // used if device, queue is bound to, is host device. Second is a sycl::kernel
   // which is used otherwise. range argument specifies global size.
-  template <typename KernelName, typename KernelType, int Dims>
-  void parallel_for(range<Dims> NumWorkItems, kernel SyclKernel,
+  template <typename KernelName = csd::auto_name, typename KernelType, int Dims>
+  void parallel_for(kernel SyclKernel, range<Dims> NumWorkItems,
                     KernelType KernelFunc) {
+    using NameT = typename csd::get_kernel_name_t<KernelName, KernelType>::name;
 #ifdef __SYCL_DEVICE_ONLY__
-    kernel_parallel_for<KernelName, KernelType, Dims>(KernelFunc);
+    kernel_parallel_for<NameT, KernelType, Dims>(KernelFunc);
 #else
     MNDRDesc.set(std::move(NumWorkItems));
     MSyclKernel = detail::getSyclObjImpl(std::move(SyclKernel));
     MCGType = detail::CG::KERNEL;
-    if (!MIsHost && !lambdaAndKernelHaveEqualName<KernelName>())
+    if (!MIsHost && !lambdaAndKernelHaveEqualName<NameT>())
       extractArgsAndReqs();
     else
-      StoreLambda<KernelName, KernelType, Dims>(std::move(KernelFunc));
+      StoreLambda<NameT, KernelType, Dims>(std::move(KernelFunc));
 #endif
   }
 
-  // parallel_for version which takes two "kernels". One is a functor which is
-  // used if device, queue is bound to, is host device. Second is a sycl::kernel
-  // which is used otherwise. range argument specifies global size. Simply
-  // redirect to the lambda-based form of invocation, setting kernel name type
-  // to the functor type.
-  template <typename KernelType, int Dims>
-  void parallel_for(range<Dims> NumWorkItems, kernel SyclKernel,
-                    KernelType KernelFunc) {
-    parallel_for<KernelType, KernelType, Dims>(NumWorkItems, SyclKernel,
-                                               KernelFunc);
-  }
-
   // parallel_for version which takes two "kernels". One is a lambda which is
   // used if device, queue is bound to, is host device. Second is a sycl::kernel
   // which is used otherwise. range and id specify global size and offset.
-  template <typename KernelName, typename KernelType, int Dims>
-  void parallel_for(range<Dims> NumWorkItems, id<Dims> WorkItemOffset,
-                    kernel SyclKernel, KernelType KernelFunc) {
+  template <typename KernelName = csd::auto_name, typename KernelType, int Dims>
+  void parallel_for(kernel SyclKernel, range<Dims> NumWorkItems,
+                    id<Dims> WorkItemOffset, KernelType KernelFunc) {
+    using NameT = typename csd::get_kernel_name_t<KernelName, KernelType>::name;
 #ifdef __SYCL_DEVICE_ONLY__
-    kernel_parallel_for<KernelName, KernelType, Dims>(KernelFunc);
+    kernel_parallel_for<NameT, KernelType, Dims>(KernelFunc);
 #else
     MNDRDesc.set(std::move(NumWorkItems), std::move(WorkItemOffset));
     MSyclKernel = detail::getSyclObjImpl(std::move(SyclKernel));
     MCGType = detail::CG::KERNEL;
-    if (!MIsHost && !lambdaAndKernelHaveEqualName<KernelName>())
+    if (!MIsHost && !lambdaAndKernelHaveEqualName<NameT>())
       extractArgsAndReqs();
     else
-      StoreLambda<KernelName, KernelType, Dims>(std::move(KernelFunc));
+      StoreLambda<NameT, KernelType, Dims>(std::move(KernelFunc));
 #endif
   }
 
   // parallel_for version which takes two "kernels". One is a lambda which is
   // used if device, queue is bound to, is host device. Second is a sycl::kernel
   // which is used otherwise. nd_range specifies global, local size and offset.
-  template <typename KernelName, typename KernelType, int Dims>
-  void parallel_for(nd_range<Dims> NDRange, kernel SyclKernel,
+  template <typename KernelName = csd::auto_name, typename KernelType, int Dims>
+  void parallel_for(kernel SyclKernel, nd_range<Dims> NDRange,
                     KernelType KernelFunc) {
+    using NameT = typename csd::get_kernel_name_t<KernelName, KernelType>::name;
 #ifdef __SYCL_DEVICE_ONLY__
-    kernel_parallel_for<KernelName, KernelType, Dims>(KernelFunc);
+    kernel_parallel_for<NameT, KernelType, Dims>(KernelFunc);
 #else
     MNDRDesc.set(std::move(NDRange));
     MSyclKernel = detail::getSyclObjImpl(std::move(SyclKernel));
     MCGType = detail::CG::KERNEL;
-    if (!MIsHost && !lambdaAndKernelHaveEqualName<KernelName>())
+    if (!MIsHost && !lambdaAndKernelHaveEqualName<NameT>())
       extractArgsAndReqs();
     else
-      StoreLambda<KernelName, KernelType, Dims>(std::move(KernelFunc));
+      StoreLambda<NameT, KernelType, Dims>(std::move(KernelFunc));
 #endif
   }
 
-  // parallel_for version which takes two "kernels". One is a functor which is
-  // used if device, queue is bound to, is host device. Second is a sycl::kernel
-  // which is used otherwise. nd_range specifies global, local size and offset.
-  // Simply redirects to the lambda-based form of invocation, setting kernel
-  // name type to the functor type.
-  template <typename KernelType, int Dims>
-  void parallel_for(nd_range<Dims> NDRange, kernel SyclKernel,
-                    KernelType KernelFunc) {
-    parallel_for<KernelType, KernelType, Dims>(NDRange, SyclKernel, KernelFunc);
-  }
-
   // template <typename KernelName, typename WorkgroupFunctionType, int
   // dimensions>
   // void parallel_for_work_group(range<dimensions> num_work_groups, kernel
diff --git a/sycl/test/kernel-and-program/kernel-and-program.cpp b/sycl/test/kernel-and-program/kernel-and-program.cpp
index e2deb4f95f23d..f2dd18c18866e 100644
--- a/sycl/test/kernel-and-program/kernel-and-program.cpp
+++ b/sycl/test/kernel-and-program/kernel-and-program.cpp
@@ -149,7 +149,7 @@ int main() {
         q.submit([&](cl::sycl::handler &cgh) {
           auto acc = buf.get_access<cl::sycl::access::mode::read_write>(cgh);
           cgh.parallel_for<class ParallelFor>(
-              numOfItems, krn,
+              krn, numOfItems,
               [=](cl::sycl::id<1> wiID) { acc[wiID] = acc[wiID] + 1; });
         });
       }
@@ -233,7 +233,7 @@ int main() {
               localAcc(localRange, cgh);
 
           cgh.parallel_for<class ParallelForND>(
-              cl::sycl::nd_range<1>(numOfItems, localRange), krn,
+              krn, cl::sycl::nd_range<1>(numOfItems, localRange),
               [=](cl::sycl::nd_item<1> item) {
                 size_t idx = item.get_global_linear_id();
                 int pos = idx & 1;