From bc06c6669f26795819a5a4c9df9222946de8b2c5 Mon Sep 17 00:00:00 2001 From: Vijay Dhanraj Date: Fri, 21 Jan 2022 17:44:20 -0800 Subject: [PATCH] [Pal,LibOS] Refactor sysfs to improve sanitization This commit creates a clean representation of topology data on the trusted<->untrusted boundary, so that we can easily verify its correctness. The following are done: 1. Refactor sysfs code to convert CPU/NUMA information from Linux-formatted strings to integers in untrusted PAL. 2. Copy and sanitize CPU/NUMA information based on untrusted integers to trusted PAL. 3. Convert the trusted CPU/NUMA information from integers back to Linux-formatted strings in LibOS. Signed-off-by: Vijay Dhanraj --- LibOS/shim/include/shim_fs_pseudo.h | 13 + LibOS/shim/src/fs/proc/info.c | 8 +- LibOS/shim/src/fs/sys/cache_info.c | 35 +- LibOS/shim/src/fs/sys/cpu_info.c | 38 +- LibOS/shim/src/fs/sys/fs.c | 102 ++- LibOS/shim/src/fs/sys/node_info.c | 36 +- LibOS/shim/src/sys/shim_sched.c | 2 +- Pal/include/arch/x86_64/pal_topology.h | 112 +-- Pal/include/host/Linux-common/topo_info.h | 10 +- Pal/regression/Bootstrap.c | 2 +- Pal/src/host/Linux-SGX/db_main.c | 796 +++++++++++++--------- Pal/src/host/Linux-SGX/enclave_ocalls.c | 2 +- Pal/src/host/Linux-common/topo_info.c | 475 ++++++++----- common/include/api.h | 1 + 14 files changed, 1045 insertions(+), 587 deletions(-) diff --git a/LibOS/shim/include/shim_fs_pseudo.h b/LibOS/shim/include/shim_fs_pseudo.h index 75785626b4..2ca5f23b21 100644 --- a/LibOS/shim/include/shim_fs_pseudo.h +++ b/LibOS/shim/include/shim_fs_pseudo.h @@ -210,4 +210,17 @@ int sys_cache_load(struct shim_dentry* dent, char** out_data, size_t* out_size); bool sys_cpu_online_name_exists(struct shim_dentry* parent, const char* name); int sys_cpu_online_list_names(struct shim_dentry* parent, readdir_callback_t callback, void* arg); +/* Converts struct pal_res_range_info to a string representation. + * Example output when sep == ',': "10-63,68,70-127". 
+ * Note: This function adds a newline at the end of the string. */ +int sys_convert_ranges_to_str(const struct pal_res_range_info* resource_range_info, const char* sep, + char* str, size_t str_size); + +/* Converts struct pal_res_range_info to a sysfs CPU bitmask representation with bitmask size based + * on the possible cores count in the system. + * Example output for 64 cores in total and ranges 0-15,48-55: "00ff0000,0000ffff". + * Note: This function adds a newline at the end of the string. */ +int sys_convert_ranges_to_cpu_bitmap_str(const struct pal_res_range_info* resource_range_info, + char* str, size_t str_size); + #endif /* SHIM_FS_PSEUDO_H_ */ diff --git a/LibOS/shim/src/fs/proc/info.c b/LibOS/shim/src/fs/proc/info.c index 38bc6cf1aa..eb68ead212 100644 --- a/LibOS/shim/src/fs/proc/info.c +++ b/LibOS/shim/src/fs/proc/info.c @@ -142,7 +142,7 @@ int proc_cpuinfo_load(struct shim_dentry* dent, char** out_data, size_t* out_siz const struct pal_topo_info* ti = &g_pal_public_state->topo_info; const struct pal_cpu_info* ci = &g_pal_public_state->cpu_info; - for (size_t i = 0; i < ti->online_logical_cores_cnt; i++) { + for (size_t i = 0; i < ti->online_logical_cores.resource_cnt; i++) { /* Below strings must match exactly the strings retrieved from /proc/cpuinfo * (see Linux's arch/x86/kernel/cpu/proc.c) */ ADD_INFO("processor\t: %lu\n", i); @@ -151,7 +151,9 @@ int proc_cpuinfo_load(struct shim_dentry* dent, char** out_data, size_t* out_siz ADD_INFO("model\t\t: %lu\n", ci->cpu_model); ADD_INFO("model name\t: %s\n", ci->cpu_brand); ADD_INFO("stepping\t: %lu\n", ci->cpu_stepping); - ADD_INFO("physical id\t: %zu\n", ti->cpu_to_socket_arr[i]); + if (g_pal_public_state->enable_sysfs_topology) { + ADD_INFO("physical id\t: %zu\n", ti->core_topo_arr[i].socket_id); + } ADD_INFO("core id\t\t: %lu\n", i); ADD_INFO("cpu cores\t: %zu\n", ti->physical_cores_per_socket); double bogomips = ci->cpu_bogomips; @@ -191,7 +193,7 @@ int proc_stat_load(struct shim_dentry* dent, char** 
out_data, size_t* out_size) * (see Linux's fs/proc/stat.c) */ ADD_INFO("cpu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", user, nice, system, idle, iowait, irq, softirq, steal, guest, guest_nice); - for (size_t n = 0; n < g_pal_public_state->topo_info.online_logical_cores_cnt; n++) { + for (size_t n = 0; n < g_pal_public_state->topo_info.online_logical_cores.resource_cnt; n++) { ADD_INFO("cpu%lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", n, user, nice, system, idle, iowait, irq, softirq, steal, guest, guest_nice); } diff --git a/LibOS/shim/src/fs/sys/cache_info.c b/LibOS/shim/src/fs/sys/cache_info.c index cd15df35b2..3cd2aab852 100644 --- a/LibOS/shim/src/fs/sys/cache_info.c +++ b/LibOS/shim/src/fs/sys/cache_info.c @@ -27,26 +27,41 @@ int sys_cache_load(struct shim_dentry* dent, char** out_data, size_t* out_size) const char* name = dent->name; struct pal_core_cache_info* cache_info = - &g_pal_public_state->topo_info.core_topology_arr[cpu_num].cache_info_arr[cache_num]; - const char* str; + &g_pal_public_state->topo_info.core_topo_arr[cpu_num].cache_info_arr[cache_num]; + char str[PAL_SYSFS_MAP_FILESZ] = {'\0'}; if (strcmp(name, "shared_cpu_map") == 0) { - str = cache_info->shared_cpu_map; + ret = sys_convert_ranges_to_cpu_bitmap_str(&cache_info->shared_cpu_map, str, sizeof(str)); } else if (strcmp(name, "level") == 0) { - str = cache_info->level; + ret = snprintf(str, sizeof(str), "%zu\n", cache_info->level); } else if (strcmp(name, "type") == 0) { - str = cache_info->type; + switch (cache_info->type) { + case CACHE_TYPE_DATA: + ret = snprintf(str, sizeof(str), "Data\n"); + break; + case CACHE_TYPE_INSTRUCTION: + ret = snprintf(str, sizeof(str), "Instruction\n"); + break; + case CACHE_TYPE_UNIFIED: + ret = snprintf(str, sizeof(str), "Unified\n"); + break; + default: + ret = -ENOENT; + } } else if (strcmp(name, "size") == 0) { - str = cache_info->size; + ret = snprintf(str, sizeof(str), "%zuK\n", cache_info->size >> 10); } else if (strcmp(name, "coherency_line_size") 
== 0) { - str = cache_info->coherency_line_size; + ret = snprintf(str, sizeof(str), "%zu\n", cache_info->coherency_line_size); } else if (strcmp(name, "number_of_sets") == 0) { - str = cache_info->number_of_sets; + ret = snprintf(str, sizeof(str), "%zu\n", cache_info->number_of_sets); } else if (strcmp(name, "physical_line_partition") == 0) { - str = cache_info->physical_line_partition; + snprintf(str, sizeof(str), "%zu\n", cache_info->physical_line_partition); } else { log_debug("unrecognized file: %s", name); - return -ENOENT; + ret = -ENOENT; } + if (ret < 0) + return ret; + return sys_load(str, out_data, out_size); } diff --git a/LibOS/shim/src/fs/sys/cpu_info.c b/LibOS/shim/src/fs/sys/cpu_info.c index 97f98d96e6..477ba92676 100644 --- a/LibOS/shim/src/fs/sys/cpu_info.c +++ b/LibOS/shim/src/fs/sys/cpu_info.c @@ -13,18 +13,24 @@ #include "shim_fs_pseudo.h" int sys_cpu_general_load(struct shim_dentry* dent, char** out_data, size_t* out_size) { + int ret; const char* name = dent->name; - const char* str; + char str[PAL_SYSFS_BUF_FILESZ] = {'\0'}; if (strcmp(name, "online") == 0) { - str = g_pal_public_state->topo_info.online_logical_cores; + ret = sys_convert_ranges_to_str(&g_pal_public_state->topo_info.online_logical_cores, ",", + str, sizeof(str)); } else if (strcmp(name, "possible") == 0) { - str = g_pal_public_state->topo_info.possible_logical_cores; + ret = sys_convert_ranges_to_str(&g_pal_public_state->topo_info.possible_logical_cores, ",", + str, sizeof(str)); } else { log_debug("unrecognized file: %s", name); - return -ENOENT; + ret = -ENOENT; } + if (ret < 0) + return ret; + return sys_load(str, out_data, out_size); } @@ -37,29 +43,30 @@ int sys_cpu_load(struct shim_dentry* dent, char** out_data, size_t* out_size) { const char* name = dent->name; struct pal_core_topo_info* core_topology = - &g_pal_public_state->topo_info.core_topology_arr[cpu_num]; - const char* str; - char buf[12]; + &g_pal_public_state->topo_info.core_topo_arr[cpu_num]; + char 
str[PAL_SYSFS_MAP_FILESZ] = {'\0'}; if (strcmp(name, "online") == 0) { /* `cpu/cpuX/online` is not present for cpu0 */ if (cpu_num == 0) return -ENOENT; - str = core_topology->is_logical_core_online; + ret = snprintf(str, sizeof(str), "%d\n", core_topology->is_logical_core_online); } else if (strcmp(name, "core_id") == 0) { - str = core_topology->core_id; + ret = snprintf(str, sizeof(str), "%zu\n", core_topology->core_id); } else if (strcmp(name, "physical_package_id") == 0) { - snprintf(buf, sizeof(buf), "%zu\n", - g_pal_public_state->topo_info.cpu_to_socket_arr[cpu_num]); - str = buf; + ret = snprintf(str, sizeof(str), "%zu\n", core_topology->socket_id); } else if (strcmp(name, "core_siblings") == 0) { - str = core_topology->core_siblings; + ret = sys_convert_ranges_to_cpu_bitmap_str(&core_topology->core_siblings, str, sizeof(str)); } else if (strcmp(name, "thread_siblings") == 0) { - str = core_topology->thread_siblings; + ret = sys_convert_ranges_to_cpu_bitmap_str(&core_topology->thread_siblings, str, + sizeof(str)); } else { log_debug("unrecognized file: %s", name); - return -ENOENT; + ret = -ENOENT; } + if (ret < 0) + return ret; + return sys_load(str, out_data, out_size); } @@ -88,5 +95,6 @@ int sys_cpu_online_list_names(struct shim_dentry* parent, readdir_callback_t cal if (ret < 0) return ret; } + return 0; } diff --git a/LibOS/shim/src/fs/sys/fs.c b/LibOS/shim/src/fs/sys/fs.c index a224f328ea..a56b5511e9 100644 --- a/LibOS/shim/src/fs/sys/fs.c +++ b/LibOS/shim/src/fs/sys/fs.c @@ -14,6 +14,104 @@ #include "shim_fs_pseudo.h" #include "stat.h" +int sys_convert_ranges_to_str(const struct pal_res_range_info* resource_range_info, const char* sep, + char* str, size_t str_size) { + size_t ranges_cnt = resource_range_info->ranges_cnt; + if (!ranges_cnt) + return -EINVAL; + + str[0] = '\0'; + size_t offset = 0; + for (size_t i = 0; i < ranges_cnt; i++) { + if (offset >= str_size) + return -ENOMEM; + + int ret; + if (resource_range_info->ranges_arr[i].end == 
resource_range_info->ranges_arr[i].start) { + ret = snprintf(str + offset, str_size - offset, "%zu%s", + resource_range_info->ranges_arr[i].start, + (i + 1 == ranges_cnt) ? "\n" : sep); + } else { + ret = snprintf(str + offset, str_size - offset, "%zu-%zu%s", + resource_range_info->ranges_arr[i].start, + resource_range_info->ranges_arr[i].end, + (i + 1 == ranges_cnt) ? "\n" : sep); + } + + if (ret < 0) + return ret; + + /* Truncation has occurred */ + if ((size_t)ret >= str_size - offset) + return -EOVERFLOW; + + offset += ret; + } + return 0; +} + +int sys_convert_ranges_to_cpu_bitmap_str(const struct pal_res_range_info* resource_range_info, + char* str, size_t str_size) { + int ret; + + /* Extract cpumask from the ranges */ + size_t possible_logical_cores_cnt = + g_pal_public_state->topo_info.possible_logical_cores.resource_cnt; + size_t cpumask_cnt = BITS_TO_UINT32S(possible_logical_cores_cnt); + assert(cpumask_cnt > 0); + + uint32_t* bitmap = calloc(cpumask_cnt, sizeof(*bitmap)); + if (!bitmap) + return -ENOMEM; + + for (size_t i = 0; i < resource_range_info->ranges_cnt; i++) { + size_t start = resource_range_info->ranges_arr[i].start; + size_t end = resource_range_info->ranges_arr[i].end; + + for (size_t j = start; j <= end; j++) { + size_t index = j / BITS_IN_TYPE(uint32_t); + assert(index < cpumask_cnt); + + bitmap[index] |= 1U << (j % BITS_IN_TYPE(uint32_t)); + } + } + + /* Convert cpumask to strings */ + size_t offset = 0; + for (size_t j = cpumask_cnt; j > 0; j--) { + if (offset >= str_size) { + ret = -ENOMEM; + goto out; + } + + /* Linux doesn't print leading zeroes for systems with less than 32 cores, e.g. "fff" for + * 12 cores; we mimic this behavior. */ + if (possible_logical_cores_cnt >= 32) { + ret = snprintf(str + offset, str_size - offset, "%08x%s", bitmap[j-1], + (j-1 == 0) ? "\n" : ","); + } else { + ret = snprintf(str + offset, str_size - offset, "%x%s", bitmap[j-1], + (j-1 == 0) ? 
"\n" : ","); + } + + if (ret < 0) + goto out; + + /* Truncation has occurred */ + if ((size_t)ret >= str_size - offset) { + ret = -EOVERFLOW; + goto out; + } + + offset += ret; + } + ret = 0; + +out: + free(bitmap); + return ret; +} + static int sys_resource(struct shim_dentry* parent, const char* name, unsigned int* out_num, readdir_callback_t callback, void* arg) { const char* parent_name = parent->name; @@ -21,10 +119,10 @@ static int sys_resource(struct shim_dentry* parent, const char* name, unsigned i const char* prefix; if (strcmp(parent_name, "node") == 0) { - total = g_pal_public_state->topo_info.online_nodes_cnt; + total = g_pal_public_state->topo_info.online_nodes.resource_cnt; prefix = "node"; } else if (strcmp(parent_name, "cpu") == 0) { - total = g_pal_public_state->topo_info.online_logical_cores_cnt; + total = g_pal_public_state->topo_info.online_logical_cores.resource_cnt; prefix = "cpu"; } else if (strcmp(parent_name, "cache") == 0) { total = g_pal_public_state->topo_info.cache_indices_cnt; diff --git a/LibOS/shim/src/fs/sys/node_info.c b/LibOS/shim/src/fs/sys/node_info.c index b90d66bd59..e6ef76ce36 100644 --- a/LibOS/shim/src/fs/sys/node_info.c +++ b/LibOS/shim/src/fs/sys/node_info.c @@ -12,15 +12,19 @@ #include "shim_fs_pseudo.h" int sys_node_general_load(struct shim_dentry* dent, char** out_data, size_t* out_size) { + int ret; const char* name = dent->name; - const char* str; - if (strcmp(name, "online") == 0) { - str = g_pal_public_state->topo_info.online_nodes; - } else { + if (strcmp(name, "online") != 0) { log_debug("unrecognized file: %s", name); return -ENOENT; } + char str[PAL_SYSFS_BUF_FILESZ] = {'\0'}; + ret = sys_convert_ranges_to_str(&g_pal_public_state->topo_info.online_nodes, ",", str, + sizeof(str)); + if (ret < 0) + return ret; + return sys_load(str, out_data, out_size); } @@ -32,25 +36,29 @@ int sys_node_load(struct shim_dentry* dent, char** out_data, size_t* out_size) { return ret; const char* name = dent->name; - struct 
pal_numa_topo_info* numa_topology = - &g_pal_public_state->topo_info.numa_topology_arr[node_num]; - const char* str = NULL; + struct pal_numa_topo_info* numa_topo = &g_pal_public_state->topo_info.numa_topo_arr[node_num]; + char str[PAL_SYSFS_MAP_FILESZ] = {'\0'}; if (strcmp(name, "cpumap" ) == 0) { - str = numa_topology->cpumap; + ret = sys_convert_ranges_to_cpu_bitmap_str(&numa_topo->cpumap, str, sizeof(str)); } else if (strcmp(name, "distance") == 0) { - str = numa_topology->distance; + ret = sys_convert_ranges_to_str(&numa_topo->distance, " ", str, sizeof(str)); } else if (strcmp(name, "nr_hugepages") == 0) { const char* parent_name = dent->parent->name; if (strcmp(parent_name, "hugepages-2048kB") == 0) { - str = numa_topology->hugepages[HUGEPAGES_2M].nr_hugepages; + ret = snprintf(str, sizeof(str), "%zu\n", numa_topo->nr_hugepages[HUGEPAGES_2M]); } else if (strcmp(parent_name, "hugepages-1048576kB") == 0) { - str = numa_topology->hugepages[HUGEPAGES_1G].nr_hugepages; + ret = snprintf(str, sizeof(str), "%zu\n", numa_topo->nr_hugepages[HUGEPAGES_1G]); + } else { + log_debug("unrecognized hugepage file: %s", parent_name); + ret = -ENOENT; } - } - if (!str) { + } else { log_debug("unrecognized file: %s", name); - return -ENOENT; + ret = -ENOENT; } + if (ret < 0) + return ret; + return sys_load(str, out_data, out_size); } diff --git a/LibOS/shim/src/sys/shim_sched.c b/LibOS/shim/src/sys/shim_sched.c index 17eaf31e77..019ed0c022 100644 --- a/LibOS/shim/src/sys/shim_sched.c +++ b/LibOS/shim/src/sys/shim_sched.c @@ -175,7 +175,7 @@ long shim_do_sched_setaffinity(pid_t pid, unsigned int cpumask_size, unsigned lo long shim_do_sched_getaffinity(pid_t pid, unsigned int cpumask_size, unsigned long* user_mask_ptr) { int ret; - size_t cpu_cnt = g_pal_public_state->topo_info.online_logical_cores_cnt; + size_t cpu_cnt = g_pal_public_state->topo_info.online_logical_cores.resource_cnt; /* Check if user_mask_ptr is valid */ if (!is_user_memory_writable(user_mask_ptr, 
cpumask_size)) diff --git a/Pal/include/arch/x86_64/pal_topology.h b/Pal/include/arch/x86_64/pal_topology.h index e44888fbaa..684660c820 100644 --- a/Pal/include/arch/x86_64/pal_topology.h +++ b/Pal/include/arch/x86_64/pal_topology.h @@ -1,83 +1,105 @@ /* SPDX-License-Identifier: LGPL-3.0-or-later */ /* Copyright (C) 2022 Intel Corporation * Michał Kowalczyk + * Vijay Dhanraj */ #ifndef PAL_TOPOLOGY_H #define PAL_TOPOLOGY_H -/* Used to represent plain integers (only numeric values) */ -#define PAL_SYSFS_INT_FILESZ 16 -/* Used to represent buffers having numeric values with text. E.g "1024576K" */ +#include + +/* Used to represent buffers having numeric values and unit suffixes if present, e.g. "1024576K". + * NOTE: Used to allocate on stack; increase with caution or use malloc instead. */ #define PAL_SYSFS_BUF_FILESZ 64 -/* Used to represent cpumaps like "00000000,ffffffff,00000000,ffffffff" */ +/* Used to represent cpumaps like "00000000,ffffffff,00000000,ffffffff". + * NOTE: Used to allocate on stack; increase with caution or use malloc instead. */ #define PAL_SYSFS_MAP_FILESZ 256 +/* Used to represent length of file/directory paths. + * NOTE: Used to allocate on stack; increase with caution or use malloc instead. */ +#define PAL_SYSFS_PATH_SIZE 128 + +#define MAX_HYPERTHREADS_PER_CORE 4 +#define MAX_CACHE_LEVELS 3 + enum { HUGEPAGES_2M = 0, HUGEPAGES_1G, HUGEPAGES_MAX, }; +enum cache_type { + CACHE_TYPE_DATA, + CACHE_TYPE_INSTRUCTION, + CACHE_TYPE_UNIFIED, +}; + +/* `start` and `end` are inclusive */ +struct pal_range_info { + size_t start; + size_t end; +}; + +struct pal_res_range_info { + /* Total number of resources present. E.g. if output of `/sys/devices/system/cpu/online` was + * 0-15,21,32-63 then `resource_cnt` will be 49 */ + size_t resource_cnt; + + /* Total number of ranges present. E.g.
if output of `/sys/devices/system/cpu/online` was + * 0-15,21,32-63 then `ranges_cnt` will be 3 */ + size_t ranges_cnt; + + /* Array of ranges, with `ranges_cnt` items. E.g. if output of `/sys/devices/system/cpu/online` + * was 0-12,16-30,31 then `ranges_arr` will be [{0, 12}, {16, 30}, {31, 31}]. + * Note: The ranges should not overlap */ + struct pal_range_info* ranges_arr; +}; + struct pal_core_cache_info { - char shared_cpu_map[PAL_SYSFS_MAP_FILESZ]; - char level[PAL_SYSFS_INT_FILESZ]; - char type[PAL_SYSFS_BUF_FILESZ]; - char size[PAL_SYSFS_BUF_FILESZ]; - char coherency_line_size[PAL_SYSFS_INT_FILESZ]; - char number_of_sets[PAL_SYSFS_INT_FILESZ]; - char physical_line_partition[PAL_SYSFS_INT_FILESZ]; + struct pal_res_range_info shared_cpu_map; + size_t level; + enum cache_type type; + size_t size; + size_t coherency_line_size; + size_t number_of_sets; + size_t physical_line_partition; }; struct pal_core_topo_info { - /* [0] element is uninitialized because core 0 is always online */ - char is_logical_core_online[PAL_SYSFS_INT_FILESZ]; - char core_id[PAL_SYSFS_INT_FILESZ]; - char core_siblings[PAL_SYSFS_MAP_FILESZ]; - char thread_siblings[PAL_SYSFS_MAP_FILESZ]; + bool is_logical_core_online; + size_t core_id; + /* Socket (physical package) where the core is present */ + size_t socket_id; + struct pal_res_range_info core_siblings; + struct pal_res_range_info thread_siblings; /* Array of size cache_indices_cnt, owned by this struct */ struct pal_core_cache_info* cache_info_arr; }; -struct pal_numa_hugepage_info { - char nr_hugepages[PAL_SYSFS_INT_FILESZ]; -}; - struct pal_numa_topo_info { - char cpumap[PAL_SYSFS_MAP_FILESZ]; - char distance[PAL_SYSFS_BUF_FILESZ]; - struct pal_numa_hugepage_info hugepages[HUGEPAGES_MAX]; + struct pal_res_range_info cpumap; + struct pal_res_range_info distance; + size_t nr_hugepages[HUGEPAGES_MAX]; }; struct pal_topo_info { - /* Number of logical cores available on the host. 
*/ - size_t online_logical_cores_cnt; - - char online_logical_cores[PAL_SYSFS_BUF_FILESZ]; - - /* Array of "logical core -> socket" mappings; has online_logical_cores_cnt elements. */ - size_t* cpu_to_socket_arr; + struct pal_res_range_info possible_logical_cores; - /* Array of logical core topology info, owned by this struct. Has online_logical_cores_cnt - * elements. */ - struct pal_core_topo_info* core_topology_arr; + struct pal_res_range_info online_logical_cores; + /* Array of logical core topology info, owned by this struct. + * Has online_logical_cores.resource_cnt elements. */ + struct pal_core_topo_info* core_topo_arr; - /* Max number of logical cores available on the host. */ - size_t possible_logical_cores_cnt; - - char possible_logical_cores[PAL_SYSFS_BUF_FILESZ]; + struct pal_res_range_info online_nodes; + /* Array of numa topology info, owned by this struct. Has online_nodes.resource_cnt elements. */ + struct pal_numa_topo_info* numa_topo_arr; + /* Number of physical packages in the system */ + size_t sockets_cnt; /* Number of physical cores in a socket (physical package). */ size_t physical_cores_per_socket; - /* Number of nodes available on the host. */ - size_t online_nodes_cnt; - - char online_nodes[PAL_SYSFS_BUF_FILESZ]; - - /* Array of numa topology info, owned by this struct. Has online_nodes_cnt elements. */ - struct pal_numa_topo_info* numa_topology_arr; - /* Number of cache levels (such as L2 or L3) available on the host. */ size_t cache_indices_cnt; }; diff --git a/Pal/include/host/Linux-common/topo_info.h b/Pal/include/host/Linux-common/topo_info.h index 6368b3d0ab..c56ed9d7b8 100644 --- a/Pal/include/host/Linux-common/topo_info.h +++ b/Pal/include/host/Linux-common/topo_info.h @@ -8,19 +8,11 @@ #include "pal.h" -/* Opens a pseudo-file describing HW resources such as online CPUs and counts the number of - * HW resources present in the file (if count == true) or simply reads the integer stored in the - * file (if count == false). 
For example on a single-core machine, calling this function on - * `/sys/devices/system/cpu/online` with count == true will return 1 and 0 with count == false. - * Returns UNIX error code on failure. - * N.B: Understands complex formats like "1,3-5,6" when called with count == true. - */ -int get_hw_resource(const char* filename, bool count); - /* Reads up to count bytes from the file into the buf passed. * Returns 0 or number of bytes read on success and UNIX error code on failure. */ ssize_t read_file_buffer(const char* filename, char* buf, size_t count); + /* Fills topo_info with CPU and NUMA topology from the host */ int get_topology_info(struct pal_topo_info* topo_info); diff --git a/Pal/regression/Bootstrap.c b/Pal/regression/Bootstrap.c index a3926d7432..712a0d9d6c 100644 --- a/Pal/regression/Bootstrap.c +++ b/Pal/regression/Bootstrap.c @@ -36,7 +36,7 @@ int main(int argc, char** argv, char** envp) { pal_printf("User Address Range OK\n"); const struct pal_cpu_info* ci = &pal_public_state->cpu_info; - pal_printf("CPU num: %zu\n", pal_public_state->topo_info.online_logical_cores_cnt); + pal_printf("CPU num: %zu\n", pal_public_state->topo_info.online_logical_cores.resource_cnt); pal_printf("CPU vendor: %s\n", ci->cpu_vendor); pal_printf("CPU brand: %s\n", ci->cpu_brand); pal_printf("CPU family: %ld\n", ci->cpu_family); diff --git a/Pal/src/host/Linux-SGX/db_main.c b/Pal/src/host/Linux-SGX/db_main.c index 98497a2a1d..bdfb30391f 100644 --- a/Pal/src/host/Linux-SGX/db_main.c +++ b/Pal/src/host/Linux-SGX/db_main.c @@ -70,6 +70,7 @@ void _DkGetAvailableUserAddressRange(PAL_PTR* start, PAL_PTR* end) { * to free it (For argv and envp we rely on auto free on termination in * practice). */ +/* This function doesn't clean up resources on failure as we terminate the process anyway. 
*/ static const char** make_argv_list(void* uptr_src, size_t src_size) { const char** argv; @@ -123,243 +124,491 @@ static const char** make_argv_list(void* uptr_src, size_t src_size) { return NULL; } -/* This function extracts first positive integer present in the buffer. For example 31 will be - * returned when input "31" is provided. If buffer contains valid size indicators such as "48K", - * then just numeric value (48 in this case) is returned. Returns negative unix error code if the - * buffer is malformed E.g., "20abc" or "3,4,5" or "xyz123" or "512H". - * Use case: To extract integer from /sys/devices/system/cpu/cpuX/cache/index0/size path. */ -static int extract_size_t_from_buffer(const char* buf, size_t* out_value) { - const char* end = NULL; - unsigned long intval; +/* This function doesn't clean up resources on failure as we terminate the process anyway. */ +static int copy_resource_range_to_enclave(struct pal_res_range_info* src, + struct pal_res_range_info* dest) { + size_t ranges_arr_size; + if (__builtin_mul_overflow(src->ranges_cnt, sizeof(struct pal_range_info), &ranges_arr_size)) { + log_error("Overflow detected with size of ranges_arr memory allocation request"); + return -1; + } + + struct pal_range_info* ranges_arr = malloc(ranges_arr_size); + if (!ranges_arr) { + log_error("Range allocation failed"); + return -1; + } + + /* Even though `src` points to a safe in-enclave object, the `src->ranges_arr` pointer is + * untrusted and may maliciously point inside the enclave; thus need to use + * `sgx_copy_to_enclave()` function */ + if (!sgx_copy_to_enclave(ranges_arr, ranges_arr_size, + src->ranges_arr, src->ranges_cnt * sizeof(*src->ranges_arr))) { + log_error("Copying ranges into the enclave failed"); + return -1; + } - while (*buf == ' ' || *buf == '\t') - buf++; + dest->ranges_arr = ranges_arr; + dest->ranges_cnt = src->ranges_cnt; + dest->resource_cnt = src->resource_cnt; + return 0; +} - /* Intentionally using unsigned long to adapt for 
variable bitness. */ - if (str_to_ulong(buf, 10, &intval, &end) < 0) - return -EINVAL; +/* This function doesn't clean up resources on failure as we terminate the process anyway. */ +static int sgx_copy_core_topo_to_enclave(struct pal_core_topo_info* uptr_src, + size_t online_logical_cores_cnt, + size_t cache_indices_cnt, + struct pal_core_topo_info** out_core_topo_arr) { + assert(out_core_topo_arr); + + struct pal_core_topo_info* temp_core_topo_arr = + malloc(online_logical_cores_cnt * sizeof(*temp_core_topo_arr)); + if (!temp_core_topo_arr) { + log_error("Allocation for shallow copy of core_topo_arr failed"); + return -1; + } - if (end[0] != '\0') { - if (end[0] != '\n' && end[0] != 'K' && end[0] != 'M' && end[0] != 'G') - return -EINVAL; + /* Shallow copy contents of core_topo_arr (uptr_src) into enclave */ + if (!sgx_copy_to_enclave(temp_core_topo_arr, + online_logical_cores_cnt * sizeof(*temp_core_topo_arr), uptr_src, + online_logical_cores_cnt * sizeof(*uptr_src))) { + log_error("Shallow copy of core_topo_arr into the enclave failed"); + return -1; + } - end += 1; - if (end[0] != '\0' && end[0] != '\n' && end[1] != '\0') - return -EINVAL; + /* Allocate enclave memory to store core topo info */ + struct pal_core_topo_info* core_topo_arr = + malloc(online_logical_cores_cnt * sizeof(*core_topo_arr)); + if (!core_topo_arr) { + log_error("Allocation for core topology array failed"); + return -1; } - *out_value = (size_t)intval; + + for (size_t idx = 0; idx < online_logical_cores_cnt; idx++) { + core_topo_arr[idx].is_logical_core_online = + temp_core_topo_arr[idx].is_logical_core_online; + core_topo_arr[idx].core_id = temp_core_topo_arr[idx].core_id; + core_topo_arr[idx].socket_id = temp_core_topo_arr[idx].socket_id; + + int ret = copy_resource_range_to_enclave(&temp_core_topo_arr[idx].core_siblings, + &core_topo_arr[idx].core_siblings); + if (ret < 0) { + log_error("Copying core_topo_arr[%zu].core_siblings failed", idx); + return -1; + } + + ret = 
copy_resource_range_to_enclave(&temp_core_topo_arr[idx].thread_siblings, + &core_topo_arr[idx].thread_siblings); + if (ret < 0) { + log_error("Copying core_topo_arr[%zu].thread_siblings failed", idx); + return -1; + } + + /* Shallow copy contents of this core's cache_info_arr (untrusted pointer) into enclave */ + struct pal_core_cache_info* temp_cache_info_arr = + malloc(cache_indices_cnt * sizeof(*temp_cache_info_arr)); + if (!temp_cache_info_arr) { + log_error("Allocation for shallow copy of cache_info_arr failed"); + return -1; + } + + if (!sgx_copy_to_enclave(temp_cache_info_arr, + cache_indices_cnt * sizeof(*temp_cache_info_arr), + temp_core_topo_arr[idx].cache_info_arr, + cache_indices_cnt * + sizeof(*temp_core_topo_arr[idx].cache_info_arr))) { + log_error("Shallow copy of cache_info_arr into the enclave failed"); + return -1; + } + + /* Allocate enclave memory to store cache info */ + struct pal_core_cache_info* cache_info_arr = + malloc(cache_indices_cnt * sizeof(*cache_info_arr)); + if (!cache_info_arr) { + log_error("Allocation for cache_info_arr failed"); + return -1; + } + + for (size_t lvl = 0; lvl < cache_indices_cnt; lvl++) { + cache_info_arr[lvl].level = temp_cache_info_arr[lvl].level; + cache_info_arr[lvl].type = temp_cache_info_arr[lvl].type; + cache_info_arr[lvl].size = temp_cache_info_arr[lvl].size; + cache_info_arr[lvl].coherency_line_size = temp_cache_info_arr[lvl].coherency_line_size; + cache_info_arr[lvl].number_of_sets = temp_cache_info_arr[lvl].number_of_sets; + cache_info_arr[lvl].physical_line_partition = + temp_cache_info_arr[lvl].physical_line_partition; + + ret = copy_resource_range_to_enclave(&temp_cache_info_arr[lvl].shared_cpu_map, + &cache_info_arr[lvl].shared_cpu_map); + if (ret < 0) { + log_error("Copying cache_info_arr[%zu].shared_cpu_map failed", lvl); + return -1; + } + } + + core_topo_arr[idx].cache_info_arr = cache_info_arr; + free(temp_cache_info_arr); + } + + *out_core_topo_arr = core_topo_arr; + + free(temp_core_topo_arr); return 0; } -/*
This function counts bits set in buffer. For example 2 will be returned when input buffer - * "00000000,80000000,00000000,80000000" is provided. Returns negative UNIX error code on error and - * actual count on success. - * Use case: To count bits set in /sys/devices/system/cpu/cpu95/topology/core_siblings bitmaps. */ -static int count_bits_set_from_resource_map(const char* buf, size_t* out_bits_set) { - size_t count = 0; - unsigned long bitmap; - while (*buf) { - while (*buf == ' ' || *buf == '\t' || *buf == ',' || *buf == '\n') - buf++; +/* This function doesn't clean up resources on failure as we terminate the process anyway. */ +static int sgx_copy_numa_topo_to_enclave(struct pal_numa_topo_info* uptr_src, + size_t online_nodes_cnt, + struct pal_numa_topo_info** out_numa_topo_arr) { + assert(out_numa_topo_arr); - if (*buf == '\0') - break; + struct pal_numa_topo_info* temp_numa_topo_arr = + malloc(online_nodes_cnt * sizeof(*temp_numa_topo_arr)); + if (!temp_numa_topo_arr) { + log_error("Allocation for shallow copy of numa_topo_arr failed"); + return -1; + } - const char* end = NULL; - /* Linux uses different bitmap size depending on the host arch. We intentionally use - * unsigned long to adapt for this variable bitness. 
*/ - if (str_to_ulong(buf, 16, &bitmap, &end) < 0) - return -EINVAL; + /* Shallow copy contents of numa_topo_arr (uptr_src) into enclave */ + if (!sgx_copy_to_enclave(temp_numa_topo_arr, + online_nodes_cnt * sizeof(*temp_numa_topo_arr), uptr_src, + online_nodes_cnt * sizeof(*uptr_src))) { + log_error("Shallow copy of numa_topo_arr into the enclave failed"); + return -1; + } - if (*end != '\0' && *end != ',' && *end != '\n') - return -EINVAL; + struct pal_numa_topo_info* numa_topo_arr = malloc(online_nodes_cnt * sizeof(*numa_topo_arr)); + if (!numa_topo_arr) { + log_error("Allocation for numa_topo_arr failed"); + return -1; + } - if (__builtin_add_overflow(count, count_ulong_bits_set(bitmap), &count)) - return -EINVAL; + for (size_t idx = 0; idx < online_nodes_cnt; idx++) { + numa_topo_arr[idx].nr_hugepages[HUGEPAGES_2M] = + temp_numa_topo_arr[idx].nr_hugepages[HUGEPAGES_2M]; + numa_topo_arr[idx].nr_hugepages[HUGEPAGES_1G] = + temp_numa_topo_arr[idx].nr_hugepages[HUGEPAGES_1G]; + + int ret = copy_resource_range_to_enclave(&temp_numa_topo_arr[idx].cpumap, + &numa_topo_arr[idx].cpumap); + if (ret < 0) { + log_error("Copying numa_topo_arr[%zu].cpumap failed", idx); + return -1; + } - buf = end; + ret = copy_resource_range_to_enclave(&temp_numa_topo_arr[idx].distance, + &numa_topo_arr[idx].distance); + if (ret < 0) { + log_error("Copying numa_topo_arr[%zu].distance failed", idx); + return -1; + } } - *out_bits_set = count; + + *out_numa_topo_arr = numa_topo_arr; + + free(temp_numa_topo_arr); return 0; } -/* This function counts number of hw resources present in buffer. There are 2 options available, - * 1) ordered == true, which ensures that buffer doesn't have overlapping range like "1-5,3-4" or - * malformed like "1-5,7-1". - * 2) ordered == false which simply counts the range of numbers. For example "1-5, 3-4, 7-1" will - * return 14 as count. 
- * Returns negative unix error if buf is empty or contains invalid data and number of hw resources - * present in the buffer on success. */ -static long sanitize_hw_resource_count(const char* buf, bool ordered) { - bool init_done = false; - unsigned long current_maxint = 0; - unsigned long resource_cnt = 0; - while (*buf) { - while (*buf == ' ' || *buf == '\t' || *buf == ',' || *buf == '\n') - buf++; - - if (*buf == '\0') - break; - - const char* end = NULL; - unsigned long firstint; - /* Intentionally using unsigned long to adapt for variable bitness. */ - if (str_to_ulong(buf, 10, &firstint, &end) < 0 || firstint > LONG_MAX) - return -EINVAL; - - if (ordered) { - if (init_done && firstint <= current_maxint) - return -EINVAL; - current_maxint = firstint; - init_done = true; +/* This function does the following 3 sanitizations for a given resource range: + * 1. Ensures that the resource count and the range count are within their limits. + * 2. Ensures that ranges don't overlap like "1-5, 3-4". + * 3. Ensures the ranges aren't malformed like "1-5, 7-1". + * Returns -1 on failure and 0 on success. 
+ */ +static int sanitize_hw_resource_range(struct pal_res_range_info* res_info, size_t res_min_limit, + size_t res_max_limit, size_t range_min_limit, + size_t range_max_limit) { + size_t resource_cnt = res_info->resource_cnt; + if (!IS_IN_RANGE_INCL(resource_cnt, res_min_limit, res_max_limit)) { + log_error("Invalid resource count: %zu", resource_cnt); + return -1; + } + + size_t ranges_cnt = res_info->ranges_cnt; + if (!IS_IN_RANGE_INCL(ranges_cnt, 1, 1 << 7)) { + log_error("Invalid range count: %zu", ranges_cnt); + return -1; + } + + if (!res_info->ranges_arr) + return -1; + + bool check_for_overlaps = false; + size_t previous_end = 0; + size_t resource_cnt_from_ranges = 0; + for (size_t i = 0; i < ranges_cnt; i++) { + + size_t start = res_info->ranges_arr[i].start; + size_t end = res_info->ranges_arr[i].end; + + /* Ensure start and end fall within range limits */ + if (!IS_IN_RANGE_INCL(start, range_min_limit, range_max_limit)) { + log_error("Invalid start of range: %zu", start); + return -1; } - /* count the number of HW resources */ - if (*end == '\0' || *end == ',' || *end == '\n' || *end == ' ') { - /* single HW resource index, count as one more */ - resource_cnt++; - } else if (*end == '-') { - /* HW resource range, count how many HW resources are in range */ - buf = end + 1; - unsigned long secondint; - if (str_to_ulong(buf, 10, &secondint, &end) < 0 || secondint > LONG_MAX) - return -EINVAL; - - unsigned long diff; - if (secondint > firstint) { - if (ordered) - current_maxint = secondint; - - diff = secondint - firstint; - if (diff >= LONG_MAX || resource_cnt + diff + 1 > LONG_MAX) - return -EINVAL; - resource_cnt += diff + 1; /* inclusive (e.g. 
0-7) */ - } else { - diff = firstint - secondint; - if (ordered || diff >= LONG_MAX || resource_cnt + diff + 1 > LONG_MAX) - return -EINVAL; - resource_cnt += diff + 1; - } + if ((start != end) && !IS_IN_RANGE_INCL(end, start + 1, range_max_limit)) { + log_error("Invalid end of range: %zu", end); + return -1; + } + + resource_cnt_from_ranges += end - start + 1; + + /* Check for overlaps like "1-5, 3-4". Note: we skip this check for the first range as + * `previous_end` is not yet initialized. */ + if (check_for_overlaps && previous_end >= start) { + log_error("Overlapping ranges: previous_end = %zu, current start = %zu", previous_end, + start); + return -1; } - buf = end; + previous_end = end; + + /* Start checking for overlaps after the first range */ + check_for_overlaps = true; + } + + if (resource_cnt_from_ranges != resource_cnt) { + log_error("Mismatch between resource_cnt and resource_cnt_from_ranges"); + return -1; } - return (long)resource_cnt ?: -EINVAL; + + return 0; } static int sanitize_cache_topology_info(struct pal_core_cache_info* cache_info_arr, - size_t cache_indices_cnt, size_t cores_cnt) { - int ret; - for (size_t idx = 0; idx < cache_indices_cnt; idx++) { - size_t shared_cpu_map; - ret = count_bits_set_from_resource_map(cache_info_arr[idx].shared_cpu_map, &shared_cpu_map); - if (ret < 0 || !IS_IN_RANGE_INCL(shared_cpu_map, 1, cores_cnt)) - return -EINVAL; - - size_t level; - ret = extract_size_t_from_buffer(cache_info_arr[idx].level, &level); - if (ret < 0 || !IS_IN_RANGE_INCL(level, 1, 3)) /* x86 processors have max of 3 cache levels */ - return -EINVAL; - - char* type = cache_info_arr[idx].type; - if (!strstartswith(type, "Data") && !strstartswith(type, "Instruction") && - !strstartswith(type, "Unified")) { - return -EINVAL; + size_t online_logical_cores_cnt, size_t cache_indices_cnt) { + for (size_t lvl = 0; lvl < cache_indices_cnt; lvl++) { + if (cache_info_arr[lvl].type != CACHE_TYPE_DATA && + cache_info_arr[lvl].type != CACHE_TYPE_INSTRUCTION && 
+ cache_info_arr[lvl].type != CACHE_TYPE_UNIFIED) { + return -1; } - size_t size; - ret = extract_size_t_from_buffer(cache_info_arr[idx].size, &size); - if (ret < 0 || !IS_IN_RANGE_INCL(size, 1, 1 << 30)) - return -EINVAL; + size_t max_limit; + if (cache_info_arr[lvl].type == CACHE_TYPE_DATA || + cache_info_arr[lvl].type == CACHE_TYPE_INSTRUCTION) { + /* Taking HT into account */ + max_limit = MAX_HYPERTHREADS_PER_CORE; + } else { + /* if unified cache then it can range up to total number of cores. */ + max_limit = online_logical_cores_cnt; + } - size_t coherency_line_size; - ret = extract_size_t_from_buffer(cache_info_arr[idx].coherency_line_size, &coherency_line_size); - if (ret < 0 || !IS_IN_RANGE_INCL(coherency_line_size, 1, 1 << 16)) - return -EINVAL; + /* Recall that `shared_cpu_map` shows this core + its siblings (if HT is enabled), for + * example: /sys/devices/system/cpu/cpu1/cache/index1/shared_cpu_map: 00000000,00000002 */ + int ret = sanitize_hw_resource_range(&cache_info_arr[lvl].shared_cpu_map, 1, max_limit, 0, + online_logical_cores_cnt); + if (ret < 0) { + log_error("Invalid cache[%zu].shared_cpu_map", lvl); + return -1; + } + + if (!IS_IN_RANGE_INCL(cache_info_arr[lvl].level, 1, MAX_CACHE_LEVELS)) + return -1; - size_t number_of_sets; - ret = extract_size_t_from_buffer(cache_info_arr[idx].number_of_sets, &number_of_sets); - if (ret < 0 || !IS_IN_RANGE_INCL(number_of_sets, 1, 1 << 30)) - return -EINVAL; + if (!IS_IN_RANGE_INCL(cache_info_arr[lvl].size, 1, 1 << 30)) + return -1; - size_t physical_line_partition; - ret = extract_size_t_from_buffer(cache_info_arr[idx].physical_line_partition, - &physical_line_partition); - if (ret < 0 || !IS_IN_RANGE_INCL(physical_line_partition, 1, 1 << 16)) - return -EINVAL; + if (!IS_IN_RANGE_INCL(cache_info_arr[lvl].coherency_line_size, 1, 1 << 16)) + return -1; + + if (!IS_IN_RANGE_INCL(cache_info_arr[lvl].number_of_sets, 1, 1 << 30)) + return -1; + + if 
(!IS_IN_RANGE_INCL(cache_info_arr[lvl].physical_line_partition, 1, 1 << 16)) + return -1; } return 0; } -static int sanitize_core_topology_info(struct pal_core_topo_info* core_topology_arr, - size_t cores_cnt, size_t cache_indices_cnt) { - int ret; - if (cores_cnt == 0 || cache_indices_cnt == 0) - return -ENOENT; - - for (size_t idx = 0; idx < cores_cnt; idx++) { - if (idx != 0) { /* core 0 is always online */ - size_t is_core_online; - ret = extract_size_t_from_buffer(core_topology_arr[idx].is_logical_core_online, - &is_core_online); - if (ret < 0 || (is_core_online != 0 && is_core_online != 1)) - return -EINVAL; +/* For each socket, cross-verify that its set of cores is the same as the core topology's + * core-siblings: + * - Pick the first core in the socket. + * - Find its core-siblings in the core topology. + * - Verify that the "cores in the socket info" array is exactly the same as "core-siblings + * present in core topology" array. + */ +static int sanitize_socket_info(struct pal_core_topo_info* core_topo_arr, + struct pal_res_range_info* socket_info_arr, size_t sockets_cnt) { + for (size_t idx = 0; idx < sockets_cnt; idx++) { + if (!socket_info_arr[idx].ranges_cnt || !socket_info_arr[idx].ranges_arr) { + return -1; } - size_t core_id; - ret = extract_size_t_from_buffer(core_topology_arr[idx].core_id, &core_id); - if (ret < 0 || core_id >= cores_cnt) - return -EINVAL; - - size_t core_siblings_cnt; - ret = count_bits_set_from_resource_map(core_topology_arr[idx].core_siblings, - &core_siblings_cnt); - if (ret < 0 || !IS_IN_RANGE_INCL(core_siblings_cnt, 1, cores_cnt)) - return -EINVAL; + size_t core_in_socket = socket_info_arr[idx].ranges_arr[0].start; + struct pal_res_range_info* core_siblings = &core_topo_arr[core_in_socket].core_siblings; - size_t thread_siblings_cnt; - ret = count_bits_set_from_resource_map(core_topology_arr[idx].thread_siblings, - &thread_siblings_cnt); - /* x86 processors have max of 4 SMT siblings */ - if (ret < 0 || 
!IS_IN_RANGE_INCL(thread_siblings_cnt, 1, 4)) - return -EINVAL; + if (core_siblings->ranges_cnt != socket_info_arr[idx].ranges_cnt) { + return -1; + } - if (sanitize_cache_topology_info(core_topology_arr[idx].cache_info_arr, cache_indices_cnt, - cores_cnt) < 0) - return -EINVAL; + for (size_t j = 0; j < core_siblings->ranges_cnt; j++) { + if (socket_info_arr[idx].ranges_arr[j].start != core_siblings->ranges_arr[j].start || + socket_info_arr[idx].ranges_arr[j].end != core_siblings->ranges_arr[j].end) { + return -1; + } + } } + return 0; } -static int sanitize_socket_info(size_t* cpu_to_socket_arr, size_t cores_cnt) { - if (cores_cnt == 0) - return -ENOENT; +/* Sanitizes the core topology received from the host; temporary socket ranges are always freed. */ +static int sanitize_core_topology_info(struct pal_core_topo_info* core_topo_arr, + size_t online_logical_cores_cnt, size_t cache_indices_cnt, + size_t sockets_cnt) { + int ret; + + struct pal_res_range_info* socket_info_arr = calloc(sockets_cnt, sizeof(*socket_info_arr)); + if (!socket_info_arr) + return -1; + + for (size_t idx = 0; idx < online_logical_cores_cnt; idx++) { + if (core_topo_arr[idx].core_id > online_logical_cores_cnt - 1) { + ret = -1; + goto out; + } + + ret = sanitize_hw_resource_range(&core_topo_arr[idx].core_siblings, 1, + online_logical_cores_cnt, 0, online_logical_cores_cnt); + if (ret < 0) { + log_error("Invalid core_topo_arr[%zu].core_siblings", idx); + goto out; + } + + /* Max. 
SMT siblings currently supported on x86 processors is 4 */ + ret = sanitize_hw_resource_range(&core_topo_arr[idx].thread_siblings, 1, + MAX_HYPERTHREADS_PER_CORE, 0, online_logical_cores_cnt); + if (ret < 0) { + log_error("Invalid core_topo_arr[%zu].thread_siblings", idx); + goto out; + } + + ret = sanitize_cache_topology_info(core_topo_arr[idx].cache_info_arr, + online_logical_cores_cnt, cache_indices_cnt); + if (ret < 0) { + log_error("Invalid core_topo_arr[%zu].cache_info_arr", idx); + goto out; + } + + /* To sanitize the socket, there are 2 steps: + * #1. From the socket_id of each core, create a range of cores present in each socket. + * #2. Compare array of cores in each socket against the array of core-siblings from + * the core topology. + */ + size_t socket_id = core_topo_arr[idx].socket_id; + if (socket_id > sockets_cnt - 1) { + ret = -1; + goto out; + } + + /* Step #1 */ + static size_t prev_socket_id = UINT32_MAX; + if (socket_id != prev_socket_id) { + socket_info_arr[socket_id].ranges_cnt++; + size_t new_size = sizeof(struct pal_range_info) * socket_info_arr[socket_id].ranges_cnt; + size_t old_size = new_size - sizeof(struct pal_range_info); + /* TODO: Optimize realloc by doing some overestimation and trimming later once the + * range count is known */ + struct pal_range_info* tmp = malloc(new_size); + if (!tmp) { + ret = -1; + goto out; + } + + if (socket_info_arr[socket_id].ranges_arr) { + memcpy(tmp, socket_info_arr[socket_id].ranges_arr, old_size); + free(socket_info_arr[socket_id].ranges_arr); + } + socket_info_arr[socket_id].ranges_arr = tmp; + + size_t range_idx = socket_info_arr[socket_id].ranges_cnt - 1; + socket_info_arr[socket_id].ranges_arr[range_idx].start = idx; + socket_info_arr[socket_id].ranges_arr[range_idx].end = idx; + prev_socket_id = socket_id; + } else { + size_t range_idx = socket_info_arr[socket_id].ranges_cnt - 1; + socket_info_arr[socket_id].ranges_arr[range_idx].end = idx; + } + } + + /* Step #2 */ + ret = 
sanitize_socket_info(core_topo_arr, socket_info_arr, sockets_cnt); + if (ret < 0) + goto out; - for (size_t idx = 0; idx < cores_cnt; idx++) { - /* Virtual environments such as QEMU may assign each core to a separate socket/package with - * one or more NUMA nodes. So we check against the number of online logical cores. */ - if (cpu_to_socket_arr[idx] >= cores_cnt) - return -EINVAL; + ret = 0; +out: + for (size_t i = 0; i < sockets_cnt; i++) { + if (socket_info_arr[i].ranges_arr) + free(socket_info_arr[i].ranges_arr); } - return 0; + free(socket_info_arr); + return ret; } -static int sanitize_numa_topology_info(struct pal_numa_topo_info* numa_topology_arr, - size_t online_nodes_cnt, size_t cores_cnt) { +/* Sanitizes the NUMA topology received from the host; the scratch bitmap is freed on every path. */ +static int sanitize_numa_topology_info(struct pal_numa_topo_info* numa_topo_arr, + size_t online_nodes_cnt, size_t online_logical_cores_cnt, + size_t possible_logical_cores_cnt) { int ret; - if (online_nodes_cnt == 0 || cores_cnt == 0) - return -ENOENT; + size_t cpumask_cnt = BITS_TO_UINT32S(possible_logical_cores_cnt); + uint32_t* bitmap = calloc(cpumask_cnt, sizeof(*bitmap)); + if (!bitmap) + return -1; + + size_t total_cores_in_numa = 0; for (size_t idx = 0; idx < online_nodes_cnt; idx++) { - size_t cpumap_cnt; - ret = count_bits_set_from_resource_map(numa_topology_arr[idx].cpumap, &cpumap_cnt); - if (ret < 0 || !IS_IN_RANGE_INCL(cpumap_cnt, 1, cores_cnt)) - return -EINVAL; + ret = sanitize_hw_resource_range(&numa_topo_arr[idx].cpumap, 1, + online_logical_cores_cnt, 0, online_logical_cores_cnt); + if (ret < 0) { + log_error("Invalid numa_topo_arr[%zu].cpumap", idx); + goto out; + } + + /* Ensure that each NUMA node has unique cores */ + for (size_t i = 0; i < numa_topo_arr[idx].cpumap.ranges_cnt; i++) { + size_t start = numa_topo_arr[idx].cpumap.ranges_arr[i].start; + size_t end = numa_topo_arr[idx].cpumap.ranges_arr[i].end; + for (size_t j = start; j <= end; j++) { + size_t 
index = j / BITS_IN_TYPE(uint32_t); + if (index >= cpumask_cnt) { + log_error("Invalid numa topology: Core %zu is beyond CPU mask limit", j); + ret = -1; + goto out; + } + + if (bitmap[index] & (1U << (j % BITS_IN_TYPE(uint32_t)))) { + log_error("Invalid numa_topology: Core %zu found in multiple numa nodes", j); + ret = -1; + goto out; + } + bitmap[index] |= 1U << (j % BITS_IN_TYPE(uint32_t)); + total_cores_in_numa++; + } + } - long cnt = sanitize_hw_resource_count(numa_topology_arr[idx].distance, /*ordered=*/false); - if (cnt < 0 || online_nodes_cnt != (size_t)cnt) - return -EINVAL; + size_t distances = numa_topo_arr[idx].distance.resource_cnt; + if (distances != online_nodes_cnt) { + log_error("Distance count is not same as the NUMA nodes count"); + ret = -1; + goto out; + } } - return 0; + + if (total_cores_in_numa != online_logical_cores_cnt) { + log_error("Invalid numa_topology: Mismatch between NUMA cores and online cores count"); + ret = -1; + goto out; + } + + ret = 0; + +out: + free(bitmap); + return ret; } extern void* g_enclave_base; @@ -482,11 +731,9 @@ static int print_warnings_on_insecure_configs(PAL_HANDLE parent_process) { return ret; } -/* This function doesn't clean up resources on failure: we terminate the process anyway. */ static int copy_and_sanitize_topo_info(struct pal_topo_info* uptr_topo_info, bool enable_sysfs_topology) { int ret; - int64_t cnt; /* Extract topology information from untrusted pointer. 
Note this is only a shallow copy * and we use this temp variable to do deep copy into `topo_info` struct part of @@ -498,170 +745,111 @@ static int copy_and_sanitize_topo_info(struct pal_topo_info* uptr_topo_info, return -1; } - size_t online_logical_cores_cnt = temp_topo_info.online_logical_cores_cnt; - if (!IS_IN_RANGE_INCL(online_logical_cores_cnt, 1, 1 << 16)) { - log_error("Invalid online_logical_cores_cnt: %zu", online_logical_cores_cnt); - return -1; - } + struct pal_topo_info* topo_info = &g_pal_public_state.topo_info; - size_t possible_logical_cores_cnt = temp_topo_info.possible_logical_cores_cnt; - if (!IS_IN_RANGE_INCL(possible_logical_cores_cnt, 1, 1 << 16)) { - log_error("Invalid possible_logical_cores_cnt: %zu", possible_logical_cores_cnt); + ret = copy_resource_range_to_enclave(&temp_topo_info.possible_logical_cores, + &topo_info->possible_logical_cores); + if (ret < 0) { + log_error("Copying possible_logical_cores failed"); return -1; } - - if (online_logical_cores_cnt > possible_logical_cores_cnt) { - log_error("Impossible configuration: more online cores (%zu) than possible cores (%zu)", - online_logical_cores_cnt, possible_logical_cores_cnt); + ret = sanitize_hw_resource_range(&topo_info->possible_logical_cores, 1, 1 << 16, 0, 1 << 16); + if (ret < 0) { + log_error("Invalid possible_logical_cores received from the host"); return -1; } - size_t physical_cores_per_socket = temp_topo_info.physical_cores_per_socket; - if (!IS_IN_RANGE_INCL(physical_cores_per_socket, 1, 1 << 13)) { - log_error("Invalid physical_cores_per_socket: %zu", physical_cores_per_socket); + ret = copy_resource_range_to_enclave(&temp_topo_info.online_logical_cores, + &topo_info->online_logical_cores); + if (ret < 0) { + log_error("Copying online_logical_cores failed"); return -1; } - - /* Allocate enclave memory to store "logical core -> socket" mappings */ - size_t* cpu_to_socket_arr = malloc(online_logical_cores_cnt * sizeof(*cpu_to_socket_arr)); - if (!cpu_to_socket_arr) { - 
log_error("Allocation for logical core -> socket mappings failed"); + ret = sanitize_hw_resource_range(&topo_info->online_logical_cores, 1, 1 << 16, 0, 1 << 16); + if (ret < 0) { + log_error("Invalid online_logical_cores received from the host"); return -1; } - if (!sgx_copy_to_enclave(cpu_to_socket_arr, - online_logical_cores_cnt * sizeof(*cpu_to_socket_arr), temp_topo_info.cpu_to_socket_arr, - online_logical_cores_cnt * sizeof(*temp_topo_info.cpu_to_socket_arr))) { - log_error("Copying cpu_to_socket_arr into the enclave failed"); + size_t online_logical_cores_cnt = topo_info->online_logical_cores.resource_cnt; + size_t possible_logical_cores_cnt = topo_info->possible_logical_cores.resource_cnt; + if (online_logical_cores_cnt > possible_logical_cores_cnt) { + log_error("Impossible configuration: more online cores (%zu) than possible cores (%zu)", + online_logical_cores_cnt, possible_logical_cores_cnt); return -1; } - /* Sanitize logical core -> socket mappings */ - ret = sanitize_socket_info(cpu_to_socket_arr, online_logical_cores_cnt); - if (ret < 0) { - log_error("Sanitization of logical core -> socket mappings failed"); + topo_info->physical_cores_per_socket = temp_topo_info.physical_cores_per_socket; + if (!IS_IN_RANGE_INCL(topo_info->physical_cores_per_socket, 1, 1 << 13)) { + log_error("Invalid physical_cores_per_socket: %zu received from the host", + topo_info->physical_cores_per_socket); return -1; } - /* TODO: Move to the end of this function along with other trusted copy operations once - * enable_sysfs_topology flag is removed */ - memset(&g_pal_public_state.topo_info, 0, sizeof(g_pal_public_state.topo_info)); - g_pal_public_state.topo_info.online_logical_cores_cnt = online_logical_cores_cnt; - g_pal_public_state.topo_info.possible_logical_cores_cnt = possible_logical_cores_cnt; - g_pal_public_state.topo_info.physical_cores_per_socket = physical_cores_per_socket; - g_pal_public_state.topo_info.cpu_to_socket_arr = cpu_to_socket_arr; - /* Advanced host 
topology information */ if (!enable_sysfs_topology) { /* TODO: temporary measure, remove it once sysfs topology is thoroughly validated */ return 0; } - cnt = sanitize_hw_resource_count(temp_topo_info.online_logical_cores, /*ordered=*/true); - if (cnt < 0 || (size_t)cnt != online_logical_cores_cnt) { - log_error("Malformed \"online\" cores string received from the host"); - return -1; - } - - cnt = sanitize_hw_resource_count(temp_topo_info.possible_logical_cores, /*ordered=*/true); - if (cnt < 0 || (size_t)cnt != possible_logical_cores_cnt) { - log_error("Malformed \"possible\" cores string received from the host"); - return -1; - } - - size_t online_nodes_cnt = temp_topo_info.online_nodes_cnt; - if (!IS_IN_RANGE_INCL(online_nodes_cnt, 1, 1 << 8)) { - log_error("Invalid online_nodes_cnt: %zu", online_nodes_cnt); - return -1; - } - - cnt = sanitize_hw_resource_count(temp_topo_info.online_nodes, /*ordered=*/true); - if (cnt < 0 || (size_t)cnt != online_nodes_cnt) { - log_error("Malformed \"online\" nodes string received from the host"); + topo_info->sockets_cnt = temp_topo_info.sockets_cnt; + /* Virtual environments such as QEMU may assign each core to a separate socket/package with + * one or more NUMA nodes. So we check against the number of online logical cores. 
*/ + if (!IS_IN_RANGE_INCL(topo_info->sockets_cnt, 1, online_logical_cores_cnt)) { + log_error("Invalid sockets_cnt: %zu received from the host", topo_info->sockets_cnt); return -1; } - struct pal_core_topo_info* core_topology_arr = - malloc(online_logical_cores_cnt * sizeof(*core_topology_arr)); - if (!core_topology_arr) { - log_error("Allocation for core topology failed"); + topo_info->cache_indices_cnt = temp_topo_info.cache_indices_cnt; + if (!IS_IN_RANGE_INCL(topo_info->cache_indices_cnt, 1, 1 << 4)) { + log_error("Invalid cache_indices_cnt: %zu received from the host", + topo_info->cache_indices_cnt); return -1; } - if (!sgx_copy_to_enclave(core_topology_arr, - online_logical_cores_cnt * sizeof(*core_topology_arr), temp_topo_info.core_topology_arr, - online_logical_cores_cnt * sizeof(*temp_topo_info.core_topology_arr))) { - log_error("Copying core_topology_arr into the enclave failed"); + /* Allocate enclave memory to store core topology info */ + ret = sgx_copy_core_topo_to_enclave(temp_topo_info.core_topo_arr, online_logical_cores_cnt, + topo_info->cache_indices_cnt, + &topo_info->core_topo_arr); + if (ret < 0) { + log_error("Copying core_topo_arr into the enclave failed"); return -1; } - size_t cache_indices_cnt = temp_topo_info.cache_indices_cnt; - if (!IS_IN_RANGE_INCL(cache_indices_cnt, 1, 1 << 4)) { - log_error("Invalid cache_indices_cnt: %zu", cache_indices_cnt); + ret = sanitize_core_topology_info(topo_info->core_topo_arr, online_logical_cores_cnt, + topo_info->cache_indices_cnt, topo_info->sockets_cnt); + if (ret < 0) { + log_error("Sanitization of core_topology failed"); return -1; } - /* Copy cache info for each online core */ - for (size_t i = 0; i < online_logical_cores_cnt; i++) { - struct pal_core_cache_info* cache_info_arr = - malloc(cache_indices_cnt * sizeof(*cache_info_arr)); - if (!cache_info_arr) { - log_error("Allocation for cache_info_arr failed"); - return -1; - } - - /* Contents of shallow pointer 
`core_topology_arr[i].cache_info_arr` are copied - * into `cache_info_arr` */ - if (!sgx_copy_to_enclave(cache_info_arr, cache_indices_cnt * sizeof(*cache_info_arr), - core_topology_arr[i].cache_info_arr, - cache_indices_cnt * sizeof(*core_topology_arr[i].cache_info_arr))) { - log_error("Copying cache_info_arr into the enclave failed"); - return -1; - } - - /* Update the shallow pointer `core_topology_arr[i].cache_info_arr` with the one allocated - * inside the enclave */ - core_topology_arr[i].cache_info_arr = cache_info_arr; - } - - ret = sanitize_core_topology_info(core_topology_arr, online_logical_cores_cnt, cache_indices_cnt); + ret = copy_resource_range_to_enclave(&temp_topo_info.online_nodes, &topo_info->online_nodes); if (ret < 0) { - log_error("Sanitization of core_topology_arr failed"); + log_error("Copying online_nodes into the enclave failed"); return -1; } - struct pal_numa_topo_info* numa_topology_arr = - malloc(online_nodes_cnt * sizeof(*numa_topology_arr)); - if (!numa_topology_arr) { - log_error("Allocation for numa_topology_arr failed"); + ret = sanitize_hw_resource_range(&topo_info->online_nodes, 1, 1 << 16, 0, 1 << 16); + if (ret < 0) { + log_error("Invalid online_nodes received from the host"); return -1; } - if (!sgx_copy_to_enclave(numa_topology_arr, online_nodes_cnt * sizeof(*numa_topology_arr), - temp_topo_info.numa_topology_arr, - online_nodes_cnt * sizeof(*temp_topo_info.numa_topology_arr))) { - log_error("Copying numa_topology_arr into the enclave failed"); + size_t online_nodes_cnt = topo_info->online_nodes.resource_cnt; + ret = sgx_copy_numa_topo_to_enclave(temp_topo_info.numa_topo_arr, online_nodes_cnt, + &topo_info->numa_topo_arr); + if (ret < 0) { + log_error("Copying numa_topo_arr into the enclave failed"); return -1; } - ret = sanitize_numa_topology_info(numa_topology_arr, online_nodes_cnt, online_logical_cores_cnt); + ret = sanitize_numa_topology_info(topo_info->numa_topo_arr, online_nodes_cnt, + online_logical_cores_cnt, 
possible_logical_cores_cnt); if (ret < 0) { - log_error("Sanitization of numa_topology_arr failed"); + log_error("Sanitization of numa_topo_arr failed"); return -1; } - g_pal_public_state.topo_info.online_nodes_cnt = online_nodes_cnt; - g_pal_public_state.topo_info.cache_indices_cnt = cache_indices_cnt; - - COPY_ARRAY(g_pal_public_state.topo_info.online_logical_cores, - temp_topo_info.online_logical_cores); - COPY_ARRAY(g_pal_public_state.topo_info.possible_logical_cores, - temp_topo_info.possible_logical_cores); - COPY_ARRAY(g_pal_public_state.topo_info.online_nodes, temp_topo_info.online_nodes); - - g_pal_public_state.topo_info.core_topology_arr = core_topology_arr; - g_pal_public_state.topo_info.numa_topology_arr = numa_topology_arr; - return 0; } diff --git a/Pal/src/host/Linux-SGX/enclave_ocalls.c b/Pal/src/host/Linux-SGX/enclave_ocalls.c index 570ec9df19..f3661ec4f8 100644 --- a/Pal/src/host/Linux-SGX/enclave_ocalls.c +++ b/Pal/src/host/Linux-SGX/enclave_ocalls.c @@ -1883,7 +1883,7 @@ int ocall_sched_setaffinity(void* tcs, size_t cpumask_size, void* cpu_mask) { static bool is_cpumask_valid(void* cpu_mask, size_t cpumask_size) { size_t max_cpumask_bits = cpumask_size * BITS_IN_BYTE; - size_t valid_cpumask_bits = g_pal_public_state.topo_info.online_logical_cores_cnt; + size_t valid_cpumask_bits = g_pal_public_state.topo_info.online_logical_cores.resource_cnt; size_t invalid_bits = max_cpumask_bits - valid_cpumask_bits; if (invalid_bits == 0) diff --git a/Pal/src/host/Linux-common/topo_info.c b/Pal/src/host/Linux-common/topo_info.c index 09743edb0d..18e5505db3 100644 --- a/Pal/src/host/Linux-common/topo_info.c +++ b/Pal/src/host/Linux-common/topo_info.c @@ -16,68 +16,143 @@ #include "syscall.h" #include "topo_info.h" -/* TODO: This shouldn't mix error code and returned value in one variable. 
*/ -int get_hw_resource(const char* filename, bool count) { - int fd = DO_SYSCALL(open, filename, O_RDONLY | O_CLOEXEC, 0); - if (fd < 0) - return fd; +/* Opens a pseudo-file describing HW resources and simply reads the value stored in the file. + * Returns UNIX error code on failure and 0 on success. */ +static int get_hw_resource_value(const char* filename, size_t* out_value) { + assert(out_value); - char buf[64]; - int ret = DO_SYSCALL(read, fd, buf, sizeof(buf) - 1); - DO_SYSCALL(close, fd); + char str[PAL_SYSFS_BUF_FILESZ]; + int ret = read_file_buffer(filename, str, sizeof(str) - 1); if (ret < 0) return ret; - buf[ret] = '\0'; /* ensure null-terminated buf even in partial read */ + str[ret] = '\0'; /* ensure null-terminated buf even in partial read */ + + char* end; + long val = strtol(str, &end, 10); + if (val < 0) + return -EINVAL; + + if (*end != '\n' && *end != '\0' && *end != 'K') { + /* Illegal character found */ + return -EINVAL; + } + + if (*end == 'K') { + if (__builtin_mul_overflow(val, 1024, &val)) + return -EOVERFLOW; + } - char* ptr = buf; - int resource_cnt = 0; - int retval = -ENOENT; + *out_value = val; + return 0; +} + +/* Opens a pseudo-file describing HW resources such as online CPUs and counts the number of + * HW resources and their ranges present in the file. The result is stored in `out_info`. + * Returns UNIX error code on failure and 0 on success. + * N.B: Understands complex formats like "1,3-5,7". 
*/ +static int get_hw_resource_range(const char* filename, struct pal_res_range_info* out_info) { + assert(out_info); + + /* Clear user supplied buffer */ + out_info->resource_cnt = 0; + out_info->ranges_cnt = 0; + out_info->ranges_arr = NULL; + + char str[PAL_SYSFS_BUF_FILESZ]; + int ret = read_file_buffer(filename, str, sizeof(str) - 1); + if (ret < 0) + return ret; + + str[ret] = '\0'; /* ensure null-terminated buf even in partial read */ + + char* ptr = str; while (*ptr) { - while (*ptr == ' ' || *ptr == '\t' || *ptr == ',') + while (*ptr == ' ' || *ptr == ',') ptr++; char* end; - long firstint = strtol(ptr, &end, 10); - if (firstint < 0 || firstint > INT_MAX) - return -ENOENT; + long start_val = strtol(ptr, &end, 10); + if (start_val < 0) { + ret = -ENOENT; + goto fail; + } if (ptr == end) break; - /* caller wants to read an int stored in the file */ - if (!count) { - if (*end == '\n' || *end == '\0') - retval = (int)firstint; - return retval; - } + size_t range_start; + size_t range_end; - /* caller wants to count the number of HW resources */ - if (*end == '\0' || *end == ',' || *end == '\n') { - /* single HW resource index, count as one more */ - resource_cnt++; + if (*end == '\0' || *end == ',' || *end == '\n' || *end == ' ') { + range_start = start_val; + range_end = start_val; + + if (__builtin_add_overflow(out_info->resource_cnt, 1, &out_info->resource_cnt)) { + ret = -EOVERFLOW; + goto fail; + } } else if (*end == '-') { - /* HW resource range, count how many HW resources are in range */ ptr = end + 1; - long secondint = strtol(ptr, &end, 10); - if (secondint < 0 || secondint > INT_MAX) - return -EINVAL; - - if (secondint > firstint) { - long diff = secondint - firstint; - long total_cnt; - if (__builtin_add_overflow(resource_cnt, diff, &total_cnt) || total_cnt >= INT_MAX) - return -EINVAL; - resource_cnt += (int)secondint - (int)firstint + 1; //inclusive (e.g., 0-7, or 8-16) + long end_val = strtol(ptr, &end, 10); + if (end_val < 0 || end_val < 
start_val) { + ret = -EINVAL; + goto fail; + } + + range_start = start_val; + range_end = end_val; + + size_t diff = end_val - start_val + 1; /* +1 because of inclusive range */ + if (__builtin_add_overflow(out_info->resource_cnt, diff, &out_info->resource_cnt)) { + ret = -EOVERFLOW; + goto fail; } + } else { + /* Illegal character found */ + ret = -EINVAL; + goto fail; + } + + /* Update range info */ + out_info->ranges_cnt++; + + /* Realloc the array of ranges (expand by one range) */ + size_t new_size = sizeof(struct pal_range_info) * out_info->ranges_cnt; + size_t old_size = new_size - sizeof(struct pal_range_info); + /* TODO: Optimize realloc by doing some overestimation and trimming later once the + * range count is known */ + struct pal_range_info* tmp = malloc(new_size); + if (!tmp) { + ret = -ENOMEM; + goto fail; } + + if (out_info->ranges_arr) { + memcpy(tmp, out_info->ranges_arr, old_size); + free(out_info->ranges_arr); + } + out_info->ranges_arr = tmp; + out_info->ranges_arr[out_info->ranges_cnt - 1].start = range_start; + out_info->ranges_arr[out_info->ranges_cnt - 1].end = range_end; + ptr = end; } - if (count && resource_cnt > 0) - retval = resource_cnt; + if (!out_info->resource_cnt || !out_info->ranges_cnt) { + ret = -EINVAL; + goto fail; + } + + return 0; + +fail: + free(out_info->ranges_arr); + out_info->resource_cnt = 0; + out_info->ranges_cnt = 0; + out_info->ranges_arr = NULL; - return retval; + return ret; } ssize_t read_file_buffer(const char* filename, char* buf, size_t count) { @@ -101,12 +176,15 @@ ssize_t read_file_buffer(const char* filename, char* buf, size_t count) { buf[ret] = '\0'; \ }) -/* Returns number of cache levels present on this system by counting "indexX" dir entries under - * `/sys/devices/system/cpu/cpuX/cache` on success and negative UNIX error code on failure. */ -/* TODO: This shouldn't mix error code and returned value in one variable. 
*/ -static int get_cache_levels_cnt(const char* path) { +/* This function stores the number of cache levels present on the system by counting "indexX" dir + * entries under `/sys/devices/system/cpu/cpuX/cache` in `out_cache_indices_cnt`. Returns 0 on + * success and negative UNIX error code on failure. */ +static int get_cache_levels_cnt(const char* path, size_t* out_cache_indices_cnt) { + assert(out_cache_indices_cnt); + char buf[1024]; - int dirs_cnt = 0; + int ret; + size_t dirs_cnt = 0; int fd = DO_SYSCALL(open, path, O_RDONLY | O_DIRECTORY); if (fd < 0) @@ -115,7 +193,7 @@ static int get_cache_levels_cnt(const char* path) { while (true) { int nread = DO_SYSCALL(getdents64, fd, buf, 1024); if (nread < 0) { - dirs_cnt = nread; + ret = nread; goto out; } @@ -130,213 +208,246 @@ static int get_cache_levels_cnt(const char* path) { } } + if (!dirs_cnt) { + ret = -ENOENT; + goto out; + } + + *out_cache_indices_cnt = dirs_cnt; + ret = 0; + out: DO_SYSCALL(close, fd); - return dirs_cnt ?: -ENOENT; + return ret; } static int get_cache_topo_info(size_t cache_indices_cnt, size_t core_idx, struct pal_core_cache_info** out_cache_info_arr) { int ret; - char filename[128]; + struct pal_core_cache_info* cache_info_arr = malloc(cache_indices_cnt * sizeof(*cache_info_arr)); if (!cache_info_arr) { return -ENOMEM; } + char dirname[PAL_SYSFS_PATH_SIZE]; + char filename[PAL_SYSFS_PATH_SIZE]; for (size_t cache_idx = 0; cache_idx < cache_indices_cnt; cache_idx++) { - struct pal_core_cache_info* cache_info = &cache_info_arr[cache_idx]; - snprintf(filename, sizeof(filename), - "/sys/devices/system/cpu/cpu%zu/cache/index%zu/shared_cpu_map", core_idx, - cache_idx); - READ_FILE_BUFFER(filename, cache_info->shared_cpu_map, /*failure_label=*/out_cache); - - snprintf(filename, sizeof(filename), - "/sys/devices/system/cpu/cpu%zu/cache/index%zu/level", core_idx, cache_idx); - READ_FILE_BUFFER(filename, cache_info->level, /*failure_label=*/out_cache); - - snprintf(filename, sizeof(filename), - 
"/sys/devices/system/cpu/cpu%zu/cache/index%zu/type", core_idx, cache_idx); - READ_FILE_BUFFER(filename, cache_info->type, /*failure_label=*/out_cache); - - snprintf(filename, sizeof(filename), - "/sys/devices/system/cpu/cpu%zu/cache/index%zu/size", core_idx, cache_idx); - READ_FILE_BUFFER(filename, cache_info->size, /*failure_label=*/out_cache); - - snprintf(filename, sizeof(filename), - "/sys/devices/system/cpu/cpu%zu/cache/index%zu/coherency_line_size", core_idx, - cache_idx); - READ_FILE_BUFFER(filename, cache_info->coherency_line_size, /*failure_label=*/out_cache); - - snprintf(filename, sizeof(filename), - "/sys/devices/system/cpu/cpu%zu/cache/index%zu/number_of_sets", core_idx, - cache_idx); - READ_FILE_BUFFER(filename, cache_info->number_of_sets, /*failure_label=*/out_cache); - - snprintf(filename, sizeof(filename), - "/sys/devices/system/cpu/cpu%zu/cache/index%zu/physical_line_partition", core_idx, - cache_idx); - READ_FILE_BUFFER(filename, cache_info->physical_line_partition, - /*failure_label=*/out_cache); + ret = snprintf(dirname, sizeof(dirname), "/sys/devices/system/cpu/cpu%zu/cache/index%zu", + core_idx, cache_idx); + if (ret < 0) + goto fail; + + ret = snprintf(filename, sizeof(filename), "%s/shared_cpu_list", dirname); + if (ret < 0) + goto fail; + ret = get_hw_resource_range(filename, &cache_info_arr[cache_idx].shared_cpu_map); + if (ret < 0) + goto fail; + + ret = snprintf(filename, sizeof(filename), "%s/level", dirname); + if (ret < 0) + goto fail; + ret = get_hw_resource_value(filename, &cache_info_arr[cache_idx].level); + if (ret < 0) + goto fail; + + char type[PAL_SYSFS_BUF_FILESZ] = {'\0'}; + ret = snprintf(filename, sizeof(filename), "%s/type", dirname); + if (ret < 0) + goto fail; + ret = read_file_buffer(filename, type, sizeof(type) - 1); + if (ret < 0) + goto fail; + type[ret] = '\0'; + + if (!strcmp(type, "Unified\n")) { + cache_info_arr[cache_idx].type = CACHE_TYPE_UNIFIED; + } else if (!strcmp(type, "Instruction\n")) { + 
cache_info_arr[cache_idx].type = CACHE_TYPE_INSTRUCTION; + } else if (!strcmp(type, "Data\n")) { + cache_info_arr[cache_idx].type = CACHE_TYPE_DATA; + } else { + ret = -EINVAL; + goto fail; + } + + ret = snprintf(filename, sizeof(filename), "%s/size", dirname); + if (ret < 0) + goto fail; + ret = get_hw_resource_value(filename, &cache_info_arr[cache_idx].size); + if (ret < 0) + goto fail; + + ret = snprintf(filename, sizeof(filename), "%s/coherency_line_size", dirname); + if (ret < 0) + goto fail; + ret = get_hw_resource_value(filename, &cache_info_arr[cache_idx].coherency_line_size); + if (ret < 0) + goto fail; + + ret = snprintf(filename, sizeof(filename), "%s/number_of_sets", dirname); + if (ret < 0) + goto fail; + ret = get_hw_resource_value(filename, &cache_info_arr[cache_idx].number_of_sets); + if (ret < 0) + goto fail; + + ret = snprintf(filename, sizeof(filename), "%s/physical_line_partition", dirname); + if (ret < 0) + goto fail; + ret = get_hw_resource_value(filename, &cache_info_arr[cache_idx].physical_line_partition); + if (ret < 0) + goto fail; } *out_cache_info_arr = cache_info_arr; return 0; -out_cache: +fail: free(cache_info_arr); return ret; } /* Get core topology-related info */ static int get_core_topo_info(struct pal_topo_info* topo_info) { - int ret; - - /* we cannot use CPUID(0xb) because it counts even disabled-by-BIOS cores (e.g. HT cores); - * instead we extract info on total number of logical cores, number of physical cores, - * SMT support etc. 
by parsing sysfs pseudo-files */ - - READ_FILE_BUFFER("/sys/devices/system/cpu/online", topo_info->online_logical_cores, - /*failure_label=*/out); - - READ_FILE_BUFFER("/sys/devices/system/cpu/possible", topo_info->possible_logical_cores, - /*failure_label=*/out); - - ret = get_hw_resource("/sys/devices/system/cpu/online", /*count=*/true); + int ret = get_hw_resource_range("/sys/devices/system/cpu/online", + &topo_info->online_logical_cores); if (ret < 0) return ret; - size_t online_logical_cores_cnt = (size_t)ret; - topo_info->online_logical_cores_cnt = online_logical_cores_cnt; - ret = get_cache_levels_cnt("/sys/devices/system/cpu/cpu0/cache"); + ret = get_hw_resource_range("/sys/devices/system/cpu/possible", + &topo_info->possible_logical_cores); if (ret < 0) return ret; - size_t cache_indices_cnt = (size_t)ret; - topo_info->cache_indices_cnt = cache_indices_cnt; - ret = get_hw_resource("/sys/devices/system/cpu/possible", /*count=*/true); - if (ret < 0) { - return ret; - } - size_t possible_logical_cores_cnt = (size_t)ret; - topo_info->possible_logical_cores_cnt = possible_logical_cores_cnt; + size_t online_logical_cores_cnt = topo_info->online_logical_cores.resource_cnt; + + size_t possible_logical_cores_cnt = topo_info->possible_logical_cores.resource_cnt; /* TODO: correctly support offline cores */ if (possible_logical_cores_cnt > online_logical_cores_cnt) { - log_warning("some CPUs seem to be offline; Gramine doesn't take this into account which " - "may lead to subpar performance"); + log_error("Some CPUs seem to be offline; Gramine currently doesn't support core offlining"); + return -EINVAL; } - ret = get_hw_resource("/sys/devices/system/cpu/cpu0/topology/core_siblings_list", - /*count=*/true); - if (ret < 0) { - return ret; - } - size_t core_siblings_cnt = (size_t)ret; - ret = get_hw_resource("/sys/devices/system/cpu/cpu0/topology/thread_siblings_list", - /*count=*/true); - if (ret < 0) { + ret = get_cache_levels_cnt("/sys/devices/system/cpu/cpu0/cache", 
&topo_info->cache_indices_cnt); + if (ret < 0) return ret; - } - size_t smt_siblings_cnt = (size_t)ret; - topo_info->physical_cores_per_socket = core_siblings_cnt / smt_siblings_cnt; - /* array of "logical core -> socket" mappings */ - size_t* cpu_to_socket_arr = malloc(online_logical_cores_cnt * sizeof(*cpu_to_socket_arr)); - if (!cpu_to_socket_arr) { + struct pal_core_topo_info* core_topo_arr = + malloc(online_logical_cores_cnt * sizeof(*core_topo_arr)); + if (!core_topo_arr) return -ENOMEM; - } - char filename[128]; + size_t current_max_socket = 0; + char dirname[PAL_SYSFS_PATH_SIZE]; + char filename[PAL_SYSFS_PATH_SIZE]; for (size_t idx = 0; idx < online_logical_cores_cnt; idx++) { - snprintf(filename, sizeof(filename), - "/sys/devices/system/cpu/cpu%zu/topology/physical_package_id", idx); - ret = get_hw_resource(filename, /*count=*/false); - if (ret < 0) { - log_warning("Cannot read %s", filename); - goto out_cpu_to_socket; - } - cpu_to_socket_arr[idx] = (size_t)ret; - } - topo_info->cpu_to_socket_arr = cpu_to_socket_arr; - - struct pal_core_topo_info* core_topology_arr = - malloc(online_logical_cores_cnt * sizeof(*core_topology_arr)); - if (!core_topology_arr) - return -ENOMEM; + ret = snprintf(dirname, sizeof(dirname), "/sys/devices/system/cpu/cpu%zu", idx); + if (ret < 0) + goto out; - for (size_t idx = 0; idx < online_logical_cores_cnt; idx++) { - /* cpu0 is always online and thus the "online" file is not present. 
*/ if (idx != 0) { - snprintf(filename, sizeof(filename), "/sys/devices/system/cpu/cpu%zu/online", idx); - READ_FILE_BUFFER(filename, core_topology_arr[idx].is_logical_core_online, - /*failure_label=*/out_topology); + ret = snprintf(filename, sizeof(filename), "%s/online", dirname); + if (ret < 0) + goto out; + + size_t is_logical_core_online = 0; + ret = get_hw_resource_value(filename, &is_logical_core_online); + if (ret < 0) + goto out; + core_topo_arr[idx].is_logical_core_online = (bool)is_logical_core_online; + } else { + /* cpu0 is always online and thus the "online" file is not present. */ + core_topo_arr[idx].is_logical_core_online = true; } - snprintf(filename, sizeof(filename), "/sys/devices/system/cpu/cpu%zu/topology/core_id", idx); - READ_FILE_BUFFER(filename, core_topology_arr[idx].core_id, /*failure_label=*/out_topology); + ret = snprintf(filename, sizeof(filename), "%s/topology/core_id", dirname); + if (ret < 0) + goto out; + ret = get_hw_resource_value(filename, &core_topo_arr[idx].core_id); + if (ret < 0) + goto out; + + ret = snprintf(filename, sizeof(filename), "%s/topology/core_siblings_list", dirname); + if (ret < 0) + goto out; + ret = get_hw_resource_range(filename, &core_topo_arr[idx].core_siblings); + if (ret < 0) + goto out; - snprintf(filename, sizeof(filename), - "/sys/devices/system/cpu/cpu%zu/topology/core_siblings", idx); - READ_FILE_BUFFER(filename, core_topology_arr[idx].core_siblings, - /*failure_label=*/out_topology); + ret = snprintf(filename, sizeof(filename), "%s/topology/thread_siblings_list", dirname); + if (ret < 0) + goto out; + ret = get_hw_resource_range(filename, &core_topo_arr[idx].thread_siblings); + if (ret < 0) + goto out; + + ret = snprintf(filename, sizeof(filename), "%s/topology/physical_package_id", dirname); + if (ret < 0) + goto out; + ret = get_hw_resource_value(filename, &core_topo_arr[idx].socket_id); + if (ret < 0) + goto out; - snprintf(filename, sizeof(filename), - 
"/sys/devices/system/cpu/cpu%zu/topology/thread_siblings", idx); - READ_FILE_BUFFER(filename, core_topology_arr[idx].thread_siblings, - /*failure_label=*/out_topology); + if (core_topo_arr[idx].socket_id > current_max_socket) + current_max_socket = core_topo_arr[idx].socket_id; - ret = get_cache_topo_info(cache_indices_cnt, idx, &core_topology_arr[idx].cache_info_arr); + ret = get_cache_topo_info(topo_info->cache_indices_cnt, idx, + &core_topo_arr[idx].cache_info_arr); if (ret < 0) - goto out_topology; + goto out; } - topo_info->core_topology_arr = core_topology_arr; + + topo_info->core_topo_arr = core_topo_arr; + topo_info->sockets_cnt = current_max_socket + 1; + topo_info->physical_cores_per_socket = core_topo_arr[0].core_siblings.resource_cnt / + core_topo_arr[0].thread_siblings.resource_cnt; return 0; -out_topology: - free(core_topology_arr); -out_cpu_to_socket: - free(cpu_to_socket_arr); out: + free(core_topo_arr); return ret; } /* Get NUMA topology-related info */ static int get_numa_topo_info(struct pal_topo_info* topo_info) { - int ret; - READ_FILE_BUFFER("/sys/devices/system/node/online", topo_info->online_nodes, - /*failure_label=*/out); - - ret = get_hw_resource("/sys/devices/system/node/online", /*count=*/true); + int ret = get_hw_resource_range("/sys/devices/system/node/online", &topo_info->online_nodes); if (ret < 0) return ret; - size_t online_nodes_cnt = (size_t)ret; - topo_info->online_nodes_cnt = online_nodes_cnt; + size_t online_nodes_cnt = topo_info->online_nodes.resource_cnt; - struct pal_numa_topo_info* numa_topology_arr = - malloc(online_nodes_cnt * sizeof(*numa_topology_arr)); - if (!numa_topology_arr) + struct pal_numa_topo_info* numa_topo_arr = malloc(online_nodes_cnt * sizeof(*numa_topo_arr)); + if (!numa_topo_arr) return -ENOMEM; - char filename[128]; + char filename[PAL_SYSFS_PATH_SIZE]; for (size_t idx = 0; idx < online_nodes_cnt; idx++) { - snprintf(filename, sizeof(filename), "/sys/devices/system/node/node%zu/cpumap", idx); - 
READ_FILE_BUFFER(filename, numa_topology_arr[idx].cpumap, /*failure_label=*/out_topology); + ret = snprintf(filename, sizeof(filename), "/sys/devices/system/node/node%zu/cpulist", idx); + if (ret < 0) + goto out; + ret = get_hw_resource_range(filename, &numa_topo_arr[idx].cpumap); + if (ret < 0) + goto out; - snprintf(filename, sizeof(filename), "/sys/devices/system/node/node%zu/distance", idx); - READ_FILE_BUFFER(filename, numa_topology_arr[idx].distance, /*failure_label=*/out_topology); + ret = snprintf(filename, sizeof(filename), "/sys/devices/system/node/node%zu/distance", idx); + if (ret < 0) + goto out; + ret = get_hw_resource_range(filename, &numa_topo_arr[idx].distance); + if (ret < 0) + goto out; /* Since our /sys fs doesn't support writes, set persistent hugepages to their default value * of zero */ - memcpy(numa_topology_arr[idx].hugepages[HUGEPAGES_2M].nr_hugepages, "0\n", 3); - memcpy(numa_topology_arr[idx].hugepages[HUGEPAGES_1G].nr_hugepages, "0\n", 3); + numa_topo_arr[idx].nr_hugepages[HUGEPAGES_2M] = 0; + numa_topo_arr[idx].nr_hugepages[HUGEPAGES_1G] = 0; } - topo_info->numa_topology_arr = numa_topology_arr; + topo_info->numa_topo_arr = numa_topo_arr; return 0; -out_topology: - free(numa_topology_arr); out: + free(numa_topo_arr); return ret; } diff --git a/common/include/api.h b/common/include/api.h index eb1798a3c4..8cff75afe9 100644 --- a/common/include/api.h +++ b/common/include/api.h @@ -81,6 +81,7 @@ typedef ptrdiff_t ssize_t; #define BITS_IN_BYTE 8 #define BITS_IN_TYPE(type) (sizeof(type) * BITS_IN_BYTE) +#define BITS_TO_UINT32S(nr) DIV_ROUND_UP(nr, BITS_IN_TYPE(uint32_t)) #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_IN_TYPE(long)) /* Note: This macro is not intended for use when nbits == BITS_IN_TYPE(type) */ #define SET_HIGHEST_N_BITS(type, nbits) (~(((uint64_t)1 << (BITS_IN_TYPE(type) - (nbits))) - 1))