diff --git a/LibOS/shim/include/shim_fs_pseudo.h b/LibOS/shim/include/shim_fs_pseudo.h index cfcb9e533e..0870957a2f 100644 --- a/LibOS/shim/include/shim_fs_pseudo.h +++ b/LibOS/shim/include/shim_fs_pseudo.h @@ -209,4 +209,23 @@ int sys_cache_load(struct shim_dentry* dent, char** out_data, size_t* out_size); bool sys_cpu_online_name_exists(struct shim_dentry* parent, const char* name); int sys_cpu_online_list_names(struct shim_dentry* parent, readdir_callback_t callback, void* arg); +/* Converts an integer to a string, optionally appending a given single-letter unit suffix + * (see enum size_multiplier for possible values). + * Note: This function adds a newline at end of the string. */ +int sys_convert_int_to_sizestr(uint64_t val, enum size_multiplier size_mult, char* str, + size_t buf_size); + +/* Converts PAL_RES_RANGE_INFO to a string representation. + * Example output when sep == ',': "10-63,68,70-127". + * Note: This function adds a newline at end of the string. */ +int sys_convert_ranges_to_str(const PAL_RES_RANGE_INFO* resource_range_info, char* str, + size_t buf_size, const char* sep); + +/* Converts PAL_RES_RANGE_INFO to a sysfs CPU bitmask representation with bitmask size based on the + * possible cores count in the system. + * Example output for 64 cores in total and ranges 0-15,48-55: "00ff0000,0000ffff". + * Note: This function adds a newline at end of the string. 
*/ +int sys_convert_ranges_to_cpu_bitmap_str(const PAL_RES_RANGE_INFO* resource_range_info, char* str, + size_t buf_size); + #endif /* SHIM_FS_PSEUDO_H_ */ diff --git a/LibOS/shim/src/fs/proc/info.c b/LibOS/shim/src/fs/proc/info.c index eff439ec53..65acc2c628 100644 --- a/LibOS/shim/src/fs/proc/info.c +++ b/LibOS/shim/src/fs/proc/info.c @@ -113,7 +113,8 @@ int proc_cpuinfo_load(struct shim_dentry* dent, char** out_data, size_t* out_siz size += ret; \ } while (0) - for (size_t n = 0; n < g_pal_control->cpu_info.online_logical_cores; n++) { + uint64_t online_logical_cores = g_pal_control->topo_info.online_logical_cores.resource_count; + for (uint64_t n = 0; n < online_logical_cores; n++) { /* Below strings must match exactly the strings retrieved from /proc/cpuinfo * (see Linux's arch/x86/kernel/cpu/proc.c) */ ADD_INFO("processor\t: %lu\n", n); @@ -122,9 +123,9 @@ int proc_cpuinfo_load(struct shim_dentry* dent, char** out_data, size_t* out_siz ADD_INFO("model\t\t: %lu\n", g_pal_control->cpu_info.cpu_model); ADD_INFO("model name\t: %s\n", g_pal_control->cpu_info.cpu_brand); ADD_INFO("stepping\t: %lu\n", g_pal_control->cpu_info.cpu_stepping); - ADD_INFO("physical id\t: %d\n", g_pal_control->cpu_info.cpu_socket[n]); + ADD_INFO("physical id\t: %d\n", g_pal_control->topo_info.core_topology[n].cpu_socket); ADD_INFO("core id\t\t: %lu\n", n); - ADD_INFO("cpu cores\t: %lu\n", g_pal_control->cpu_info.physical_cores_per_socket); + ADD_INFO("cpu cores\t: %lu\n", g_pal_control->topo_info.physical_cores_per_socket); double bogomips = g_pal_control->cpu_info.cpu_bogomips; // Apparently Gramine snprintf cannot into floats. 
ADD_INFO("bogomips\t: %lu.%02lu\n", (unsigned long)bogomips, diff --git a/LibOS/shim/src/fs/sys/cache_info.c b/LibOS/shim/src/fs/sys/cache_info.c index a75e06c34d..07f3728e40 100644 --- a/LibOS/shim/src/fs/sys/cache_info.c +++ b/LibOS/shim/src/fs/sys/cache_info.c @@ -27,25 +27,40 @@ int sys_cache_load(struct shim_dentry* dent, char** out_data, size_t* out_size) const char* name = dent->name; PAL_CORE_CACHE_INFO* cache = &g_pal_control->topo_info.core_topology[cpu_num].cache[cache_num]; - const char* str; + char str[PAL_SYSFS_MAP_FILESZ] = {'\0'}; if (strcmp(name, "shared_cpu_map") == 0) { - str = cache->shared_cpu_map; + ret = sys_convert_ranges_to_cpu_bitmap_str(&cache->shared_cpu_map, str, sizeof(str)); } else if (strcmp(name, "level") == 0) { - str = cache->level; + ret = sys_convert_int_to_sizestr(cache->level, MULTIPLIER_NONE, str, sizeof(str)); } else if (strcmp(name, "type") == 0) { - str = cache->type; + switch (cache->type) { + case CACHE_TYPE_DATA: + ret = snprintf(str, sizeof(str), "%s", "Data\n"); + break; + case CACHE_TYPE_INSTRUCTION: + ret = snprintf(str, sizeof(str), "%s", "Instruction\n"); + break; + case CACHE_TYPE_UNIFIED: + ret = snprintf(str, sizeof(str), "%s", "Unified\n"); + break; + default: + ret = -ENOENT; + } } else if (strcmp(name, "size") == 0) { - str = cache->size; + ret = sys_convert_int_to_sizestr(cache->size, cache->size_multiplier, str, sizeof(str)); } else if (strcmp(name, "coherency_line_size") == 0) { - str = cache->coherency_line_size; + ret = sys_convert_int_to_sizestr(cache->coherency_line_size, MULTIPLIER_NONE, str, sizeof(str)); } else if (strcmp(name, "number_of_sets") == 0) { - str = cache->number_of_sets; + ret = sys_convert_int_to_sizestr(cache->number_of_sets, MULTIPLIER_NONE, str, sizeof(str)); } else if (strcmp(name, "physical_line_partition") == 0) { - str = cache->physical_line_partition; + ret = sys_convert_int_to_sizestr(cache->physical_line_partition, MULTIPLIER_NONE, str, + sizeof(str)); } else { 
log_debug("unrecognized file: %s", name); - return -ENOENT; + ret = -ENOENT; } + if (ret < 0) + return ret; return sys_load(str, out_data, out_size); } diff --git a/LibOS/shim/src/fs/sys/cpu_info.c b/LibOS/shim/src/fs/sys/cpu_info.c index 7d25240a11..174a024f19 100644 --- a/LibOS/shim/src/fs/sys/cpu_info.c +++ b/LibOS/shim/src/fs/sys/cpu_info.c @@ -13,18 +13,23 @@ #include "shim_fs_pseudo.h" int sys_cpu_general_load(struct shim_dentry* dent, char** out_data, size_t* out_size) { + int ret; const char* name = dent->name; - const char* str; + char str[PAL_SYSFS_BUF_FILESZ] = {'\0'}; if (strcmp(name, "online") == 0) { - str = g_pal_control->topo_info.online_logical_cores; + ret = sys_convert_ranges_to_str(&g_pal_control->topo_info.online_logical_cores, str, + sizeof(str), ","); } else if (strcmp(name, "possible") == 0) { - str = g_pal_control->topo_info.possible_logical_cores; + ret = sys_convert_ranges_to_str(&g_pal_control->topo_info.possible_logical_cores, str, + sizeof(str), ","); } else { log_debug("unrecognized file: %s", name); - return -ENOENT; + ret = -ENOENT; } + if (ret < 0) + return ret; return sys_load(str, out_data, out_size); } @@ -37,27 +42,30 @@ int sys_cpu_load(struct shim_dentry* dent, char** out_data, size_t* out_size) { const char* name = dent->name; PAL_CORE_TOPO_INFO* core_topology = &g_pal_control->topo_info.core_topology[cpu_num]; - const char* str; - char buf[12]; + char str[PAL_SYSFS_MAP_FILESZ] = {'\0'}; if (strcmp(name, "online") == 0) { /* `cpu/cpuX/online` is not present for cpu0 */ if (cpu_num == 0) return -ENOENT; - str = core_topology->is_logical_core_online; + ret = sys_convert_int_to_sizestr(core_topology->is_logical_core_online, MULTIPLIER_NONE, + str, sizeof(str)); } else if (strcmp(name, "core_id") == 0) { - str = core_topology->core_id; + ret = sys_convert_int_to_sizestr(core_topology->core_id, MULTIPLIER_NONE, str, sizeof(str)); } else if (strcmp(name, "physical_package_id") == 0) { - snprintf(buf, sizeof(buf), "%d\n", 
g_pal_control->cpu_info.cpu_socket[cpu_num]); - str = buf; + ret = sys_convert_int_to_sizestr(core_topology->cpu_socket, MULTIPLIER_NONE, str, + sizeof(str)); } else if (strcmp(name, "core_siblings") == 0) { - str = core_topology->core_siblings; + ret = sys_convert_ranges_to_cpu_bitmap_str(&core_topology->core_siblings, str, sizeof(str)); } else if (strcmp(name, "thread_siblings") == 0) { - str = core_topology->thread_siblings; + ret = sys_convert_ranges_to_cpu_bitmap_str(&core_topology->thread_siblings, str, + sizeof(str)); } else { log_debug("unrecognized file: %s", name); - return -ENOENT; + ret = -ENOENT; } + if (ret < 0) + return ret; return sys_load(str, out_data, out_size); } diff --git a/LibOS/shim/src/fs/sys/fs.c b/LibOS/shim/src/fs/sys/fs.c index f7907ce820..11ff9bd24d 100644 --- a/LibOS/shim/src/fs/sys/fs.c +++ b/LibOS/shim/src/fs/sys/fs.c @@ -14,6 +14,119 @@ #include "shim_fs_pseudo.h" #include "stat.h" +int sys_convert_int_to_sizestr(uint64_t val, enum size_multiplier size_mult, char* str, + size_t buf_size) { + int ret = 0; + + switch (size_mult) { + case MULTIPLIER_KB: + ret = snprintf(str, buf_size, "%luK\n", val); + break; + case MULTIPLIER_MB: + ret = snprintf(str, buf_size, "%luM\n", val); + break; + case MULTIPLIER_GB: + ret = snprintf(str, buf_size, "%luG\n", val); + break; + default: + ret = snprintf(str, buf_size, "%lu\n", val); + break; + } + + if (ret >= (int)buf_size) + ret = -EOVERFLOW; + return ret; +} + +int sys_convert_ranges_to_str(const PAL_RES_RANGE_INFO* resource_range_info, char* str, + size_t buf_size, const char* sep) { + uint64_t range_cnt = resource_range_info->range_count; + size_t offset = 0; + for (uint64_t i = 0; i < range_cnt; i++) { + if (offset > buf_size) + return -ENOMEM; + + int ret; + if (resource_range_info->ranges[i].end == resource_range_info->ranges[i].start) { + ret = snprintf(str + offset, buf_size - offset, "%lu%s", + resource_range_info->ranges[i].start, (i + 1 == range_cnt) ? 
"\n" : sep); + } else { + ret = snprintf(str + offset, buf_size - offset, "%lu-%lu%s", + resource_range_info->ranges[i].start, resource_range_info->ranges[i].end, + (i + 1 == range_cnt) ? "\n" : sep); + } + + if (ret < 0) + return ret; + + /* Truncation has occurred */ + if ((size_t)ret >= buf_size) + return -EOVERFLOW; + + offset += ret; + } + return 0; +} + +int sys_convert_ranges_to_cpu_bitmap_str(const PAL_RES_RANGE_INFO* resource_range_info, char* str, + size_t buf_size) { + int ret; + + /* Extract cpumask from the ranges */ + uint64_t possible_cores = g_pal_control->topo_info.possible_logical_cores.resource_count; + uint64_t num_cpumask = BITS_TO_INTS(possible_cores); + uint32_t* bitmap = (uint32_t*)calloc(num_cpumask, sizeof(uint32_t)); + if (!bitmap) + return -ENOMEM; + + for (uint64_t i = 0; i < resource_range_info->range_count; i++) { + uint64_t start = resource_range_info->ranges[i].start; + uint64_t end = resource_range_info->ranges[i].end; + + for (uint64_t j = start; j <= end; j++) { + uint64_t index = j / BITS_IN_TYPE(int); + assert(index < num_cpumask); + + bitmap[index] |= 1U << (j % BITS_IN_TYPE(int)); + } + } + + /* Convert cpumask to strings */ + size_t offset = 0; + for (uint64_t j = num_cpumask; j > 0; j--) { + if (offset > buf_size) { + ret = -ENOMEM; + goto out; + } + + /* Linux doesn't print leading zeroes for systems with less than 32 cores, e.g. "fff" for + * 12 cores; we mimic this behavior. */ + if (possible_cores >= 32) { + ret = snprintf(str + offset, buf_size - offset, "%08x%s", bitmap[j-1], + (j-1 == 0) ? "\n" : ","); + } else { + ret = snprintf(str + offset, buf_size - offset, "%x%s", bitmap[j-1], + (j-1 == 0) ? 
"\n" : ","); + } + + if (ret < 0) + goto out; + + /* Truncation has occurred */ + if ((size_t)ret >= buf_size) { + ret = -EOVERFLOW; + goto out; + } + + offset += ret; + } + ret = 0; + +out: + free(bitmap); + return ret; +} + static int sys_resource(struct shim_dentry* parent, const char* name, unsigned int* out_num, readdir_callback_t callback, void* arg) { const char* parent_name = parent->name; @@ -22,10 +135,10 @@ static int sys_resource(struct shim_dentry* parent, const char* name, unsigned i const char* prefix; if (strcmp(parent_name, "node") == 0) { - pal_total = g_pal_control->topo_info.num_online_nodes; + pal_total = g_pal_control->topo_info.nodes.resource_count; prefix = "node"; } else if (strcmp(parent_name, "cpu") == 0) { - pal_total = g_pal_control->cpu_info.online_logical_cores; + pal_total = g_pal_control->topo_info.online_logical_cores.resource_count; prefix = "cpu"; } else if (strcmp(parent_name, "cache") == 0) { pal_total = g_pal_control->topo_info.num_cache_index; @@ -130,6 +243,7 @@ static void init_cpu_dir(struct pseudo_node* cpu) { pseudo_add_str(indexX, "type", &sys_cache_load); pseudo_add_str(indexX, "size", &sys_cache_load); pseudo_add_str(indexX, "coherency_line_size", &sys_cache_load); + pseudo_add_str(indexX, "number_of_sets", &sys_cache_load); pseudo_add_str(indexX, "physical_line_partition", &sys_cache_load); } diff --git a/LibOS/shim/src/fs/sys/node_info.c b/LibOS/shim/src/fs/sys/node_info.c index 32e596a44b..f80a0fa9c8 100644 --- a/LibOS/shim/src/fs/sys/node_info.c +++ b/LibOS/shim/src/fs/sys/node_info.c @@ -12,15 +12,20 @@ #include "shim_fs_pseudo.h" int sys_node_general_load(struct shim_dentry* dent, char** out_data, size_t* out_size) { + int ret; + const char* name = dent->name; - const char* str; - if (strcmp(name, "online") == 0) { - str = g_pal_control->topo_info.online_nodes; - } else { + char str[PAL_SYSFS_BUF_FILESZ] = {'\0'}; + + if (strcmp(name, "online") != 0) { log_debug("unrecognized file: %s", name); return -ENOENT; } 
+ ret = sys_convert_ranges_to_str(&g_pal_control->topo_info.nodes, str, sizeof(str), ","); + if (ret < 0) + return ret; + return sys_load(str, out_data, out_size); } @@ -33,23 +38,29 @@ int sys_node_load(struct shim_dentry* dent, char** out_data, size_t* out_size) { const char* name = dent->name; PAL_NUMA_TOPO_INFO* numa_topology = &g_pal_control->topo_info.numa_topology[node_num]; - const char* str = NULL; + char str[PAL_SYSFS_MAP_FILESZ] = {'\0'}; if (strcmp(name, "cpumap" ) == 0) { - str = numa_topology->cpumap; + ret = sys_convert_ranges_to_cpu_bitmap_str(&numa_topology->cpumap, str, sizeof(str)); } else if (strcmp(name, "distance") == 0) { - str = numa_topology->distance; + ret = sys_convert_ranges_to_str(&numa_topology->distance, str, sizeof(str), " "); } else if (strcmp(name, "nr_hugepages") == 0) { const char* parent_name = dent->parent->name; if (strcmp(parent_name, "hugepages-2048kB") == 0) { - str = numa_topology->hugepages[HUGEPAGES_2M].nr_hugepages; + ret = sys_convert_int_to_sizestr(numa_topology->nr_hugepages[HUGEPAGES_2M], + MULTIPLIER_NONE, str, sizeof(str)); } else if (strcmp(parent_name, "hugepages-1048576kB") == 0) { - str = numa_topology->hugepages[HUGEPAGES_1G].nr_hugepages; + ret = sys_convert_int_to_sizestr(numa_topology->nr_hugepages[HUGEPAGES_1G], + MULTIPLIER_NONE, str, sizeof(str)); + } else { + log_debug("unrecognized hugepage file: %s", parent_name); + ret = -ENOENT; } - } - if (!str) { + } else { log_debug("unrecognized file: %s", name); - return -ENOENT; + ret = -ENOENT; } + if (ret < 0) + return ret; return sys_load(str, out_data, out_size); } diff --git a/LibOS/shim/src/sys/shim_sched.c b/LibOS/shim/src/sys/shim_sched.c index a673d84fb0..e53af9c899 100644 --- a/LibOS/shim/src/sys/shim_sched.c +++ b/LibOS/shim/src/sys/shim_sched.c @@ -175,7 +175,7 @@ long shim_do_sched_setaffinity(pid_t pid, unsigned int cpumask_size, unsigned lo long shim_do_sched_getaffinity(pid_t pid, unsigned int cpumask_size, unsigned long* user_mask_ptr) { int 
ret; - size_t cpu_cnt = g_pal_control->cpu_info.online_logical_cores; + size_t cpu_cnt = g_pal_control->topo_info.online_logical_cores.resource_count; /* Check if user_mask_ptr is valid */ if (!is_user_memory_writable(user_mask_ptr, cpumask_size)) diff --git a/LibOS/shim/test/regression/test_libos.py b/LibOS/shim/test/regression/test_libos.py index 03e6d22e00..732dee380a 100644 --- a/LibOS/shim/test/regression/test_libos.py +++ b/LibOS/shim/test/regression/test_libos.py @@ -886,6 +886,7 @@ def test_040_sysfs(self): self.assertIn(f'{cache}/type: file', lines) self.assertIn(f'{cache}/size: file', lines) self.assertIn(f'{cache}/coherency_line_size: file', lines) + self.assertIn(f'{cache}/number_of_sets: file', lines) self.assertIn(f'{cache}/physical_line_partition: file', lines) self.assertIn('/sys/devices/system/node: directory', lines) diff --git a/Pal/include/arch/x86_64/pal-arch.h b/Pal/include/arch/x86_64/pal-arch.h index f0f0ab64e0..062ff597d3 100644 --- a/Pal/include/arch/x86_64/pal-arch.h +++ b/Pal/include/arch/x86_64/pal-arch.h @@ -30,11 +30,11 @@ typedef struct pal_tcb PAL_TCB; #define STACK_PROTECTOR_CANARY_DEFAULT 0xbadbadbadbadUL -/* Used to represent plain integers (only numeric values) */ -#define PAL_SYSFS_INT_FILESZ 16 -/* Used to represent buffers having numeric values with text. E.g "1024576K" */ +/* Used to represent buffers having numeric values with text. E.g "1024576K". + * NOTE: Used to allocate on stack; increase with caution or use malloc instead. */ #define PAL_SYSFS_BUF_FILESZ 64 -/* Used to represent cpumaps like "00000000,ffffffff,00000000,ffffffff" */ +/* Used to represent cpumaps like "00000000,ffffffff,00000000,ffffffff". + * NOTE: Used to allocate on stack; increase with caution or use malloc instead. 
*/ #define PAL_SYSFS_MAP_FILESZ 256 typedef struct pal_tcb { @@ -278,16 +278,21 @@ enum { HUGEPAGES_MAX, }; +enum size_multiplier { + MULTIPLIER_NONE, + MULTIPLIER_KB, + MULTIPLIER_MB, + MULTIPLIER_GB, +}; + +enum cache_type { + CACHE_TYPE_DATA, + CACHE_TYPE_INSTRUCTION, + CACHE_TYPE_UNIFIED, +}; + /* PAL_CPU_INFO holds /proc/cpuinfo data */ typedef struct PAL_CPU_INFO_ { - /* Number of logical cores available in the host */ - PAL_NUM online_logical_cores; - /* Max number of logical cores available in the host */ - PAL_NUM possible_logical_cores; - /* Number of physical cores in a socket (physical package) */ - PAL_NUM physical_cores_per_socket; - /* array of "logical core -> socket" mappings; has online_logical_cores elements */ - int* cpu_socket; PAL_STR cpu_vendor; PAL_STR cpu_brand; PAL_NUM cpu_family; @@ -297,44 +302,54 @@ typedef struct PAL_CPU_INFO_ { PAL_STR cpu_flags; } PAL_CPU_INFO; +typedef struct PAL_RANGE_INFO_ { + PAL_NUM start; + PAL_NUM end; +} PAL_RANGE_INFO; + +typedef struct PAL_RES_RANGE_INFO_ { + /* Count of total number of resources present. Eg. 0-63 will result in this count being 64 */ + PAL_NUM resource_count; + /* Count of total number of ranges present. Eg. 
0-31,32-63 will result in this count being 2 */ + PAL_NUM range_count; + PAL_RANGE_INFO* ranges; +} PAL_RES_RANGE_INFO; + typedef struct PAL_CORE_CACHE_INFO_ { - char shared_cpu_map[PAL_SYSFS_MAP_FILESZ]; - char level[PAL_SYSFS_INT_FILESZ]; - char type[PAL_SYSFS_BUF_FILESZ]; - char size[PAL_SYSFS_BUF_FILESZ]; - char coherency_line_size[PAL_SYSFS_INT_FILESZ]; - char number_of_sets[PAL_SYSFS_INT_FILESZ]; - char physical_line_partition[PAL_SYSFS_INT_FILESZ]; + PAL_RES_RANGE_INFO shared_cpu_map; + PAL_NUM level; + PAL_NUM type; + PAL_NUM size; + enum size_multiplier size_multiplier; + PAL_NUM coherency_line_size; + PAL_NUM number_of_sets; + PAL_NUM physical_line_partition; } PAL_CORE_CACHE_INFO; typedef struct PAL_CORE_TOPO_INFO_ { /* [0] element is uninitialized because core 0 is always online */ - char is_logical_core_online[PAL_SYSFS_INT_FILESZ]; - char core_id[PAL_SYSFS_INT_FILESZ]; - char core_siblings[PAL_SYSFS_MAP_FILESZ]; - char thread_siblings[PAL_SYSFS_MAP_FILESZ]; + PAL_NUM is_logical_core_online; + PAL_NUM core_id; + PAL_RES_RANGE_INFO core_siblings; + PAL_RES_RANGE_INFO thread_siblings; PAL_CORE_CACHE_INFO* cache; /* Array of size num_cache_index, owned by this struct */ + PAL_NUM cpu_socket; /* Socket (physical package) ID of this logical core */ } PAL_CORE_TOPO_INFO; -typedef struct PAL_NUMA_HUGEPAGE_INFO_ { - char nr_hugepages[PAL_SYSFS_INT_FILESZ]; -} PAL_NUMA_HUGEPAGE_INFO; - typedef struct PAL_NUMA_TOPO_INFO_ { - char cpumap[PAL_SYSFS_MAP_FILESZ]; - char distance[PAL_SYSFS_BUF_FILESZ]; - PAL_NUMA_HUGEPAGE_INFO hugepages[HUGEPAGES_MAX]; + PAL_RES_RANGE_INFO cpumap; + PAL_RES_RANGE_INFO distance; + PAL_NUM nr_hugepages[HUGEPAGES_MAX]; } PAL_NUMA_TOPO_INFO; -/* This struct takes ~1.6KB. On a single socket, 4 logical core system, with 3 cache levels - * it would take ~8KB in memory. 
*/ typedef struct PAL_TOPO_INFO_ { - char online_logical_cores[PAL_SYSFS_BUF_FILESZ]; - char possible_logical_cores[PAL_SYSFS_BUF_FILESZ]; - char online_nodes[PAL_SYSFS_BUF_FILESZ]; - /* Number of nodes available in the host */ - PAL_NUM num_online_nodes; - /* cache index corresponds to number of cache levels (such as L2 or L3) available on the host */ + PAL_RES_RANGE_INFO online_logical_cores; + PAL_RES_RANGE_INFO possible_logical_cores; + PAL_NUM num_sockets; + /* Number of physical cores in a socket (physical package) */ + PAL_NUM physical_cores_per_socket; + PAL_RES_RANGE_INFO nodes; + /* Index corresponds to number of cache levels (such as L2 or L3) available on the host */ PAL_NUM num_cache_index; PAL_CORE_TOPO_INFO* core_topology; /* Array of logical core topology info, owned by this struct */ PAL_NUMA_TOPO_INFO* numa_topology; /* Array of numa topology info, owned by this struct */ diff --git a/Pal/include/host/Linux-common/topo_info.h b/Pal/include/host/Linux-common/topo_info.h index 8440bff826..611ea514c0 100644 --- a/Pal/include/host/Linux-common/topo_info.h +++ b/Pal/include/host/Linux-common/topo_info.h @@ -9,13 +9,15 @@ #include "pal.h" /* Opens a pseudo-file describing HW resources such as online CPUs and counts the number of - * HW resources present in the file (if count == true) or simply reads the integer stored in the - * file (if count == false). For example on a single-core machine, calling this function on - * `/sys/devices/system/cpu/online` with count == true will return 1 and 0 with count == false. - * Returns UNIX error code on failure. + * HW resources present in the file (if count == true) and stores the result in `PAL_RES_RANGE_INFO` + * struct if provided or simply reads the integer stored in the file (if count == false). If + * `size_mult` is passed, then size qualifier is stored while reading the integer. 
For example on a + * single-core machine, calling this function on `/sys/devices/system/cpu/online` with count == true + * will return 1 and 0 with count == false. Returns UNIX error code on failure. * N.B: Understands complex formats like "1,3-5,6" when called with count == true. */ -int get_hw_resource(const char* filename, bool count); +int get_hw_resource(const char* filename, bool count, PAL_RES_RANGE_INFO* res_info, + enum size_multiplier* size_mult); /* Reads up to count bytes from the file into the buf passed. * Returns 0 or number of bytes read on success and UNIX error code on failure. diff --git a/Pal/regression/Bootstrap.c b/Pal/regression/Bootstrap.c index dd313d731d..63ec347a24 100644 --- a/Pal/regression/Bootstrap.c +++ b/Pal/regression/Bootstrap.c @@ -34,7 +34,7 @@ int main(int argc, char** argv, char** envp) { pal_control.user_address.start < pal_control.user_address.end) pal_printf("User Address Range OK\n"); - pal_printf("CPU num: %ld\n", pal_control.cpu_info.online_logical_cores); + pal_printf("CPU num: %ld\n", pal_control.topo_info.online_logical_cores.resource_count); pal_printf("CPU vendor: %s\n", pal_control.cpu_info.cpu_vendor); pal_printf("CPU brand: %s\n", pal_control.cpu_info.cpu_brand); pal_printf("CPU family: %ld\n", pal_control.cpu_info.cpu_family); diff --git a/Pal/src/host/Linux-SGX/db_main.c b/Pal/src/host/Linux-SGX/db_main.c index 6651fb420a..c17b0ffd3d 100644 --- a/Pal/src/host/Linux-SGX/db_main.c +++ b/Pal/src/host/Linux-SGX/db_main.c @@ -133,317 +133,457 @@ static const char** make_argv_list(void* uptr_src, size_t src_size) { return NULL; } -/* This function extracts first positive integer present in the buffer. For example 31 will be - * returned when input "31" is provided. If buffer contains valid size indicators such as "48K", - * then just numeric value (48 in this case) is returned. Returns negative unix error code if the - * buffer is malformed E.g., "20abc" or "3,4,5" or "xyz123" or "512H". 
- * Use case: To extract integer from /sys/devices/system/cpu/cpuX/cache/index0/size path. */ -static long extract_long_from_buffer(const char* buf) { - const char* end = NULL; - unsigned long intval; - - while (*buf == ' ' || *buf == '\t') - buf++; - - /* Intentionally using unsigned long to adapt for variable bitness. */ - if (str_to_ulong(buf, 10, &intval, &end) < 0 || intval > LONG_MAX) - return -EINVAL; - - if (end[0] != '\0') { - if (end[0] != '\n' && end[0] != 'K' && end[0] != 'M' && end[0] != 'G') - return -EINVAL; - - end += 1; - if (end[0] != '\0' && end[0] != '\n' && end[1] != '\0') - return -EINVAL; - } - return (long)intval; +/* All sanitization/helper functions below do not free memory. We simply exit on failure. */ + +static int copy_hw_resource_range(PAL_RES_RANGE_INFO* src, PAL_RES_RANGE_INFO* dest) { + uint64_t range_cnt = src->range_count; + PAL_RANGE_INFO* ranges = (PAL_RANGE_INFO*)malloc(range_cnt * sizeof(PAL_RANGE_INFO)); + if (!ranges) { + log_error("Range allocation failed"); + return -1; + } + + if (!sgx_copy_to_enclave(ranges, range_cnt * sizeof(PAL_RANGE_INFO), src->ranges, + range_cnt * sizeof(PAL_RANGE_INFO))) { + log_error("Copying ranges into the enclave failed"); + return -1; + } + + dest->ranges = ranges; + dest->range_count = range_cnt; + dest->resource_count = src->resource_count; + return 0; } -/* This function counts bits set in buffer. For example 2 will be returned when input buffer - * "00000000,80000000,00000000,80000000" is provided. Returns negative UNIX error code on error and - * actual count on success. - * Use case: To count bits set in /sys/devices/system/cpu/cpu95/topology/core_siblings bitmaps. */ -static long count_bits_set_from_resource_map(const char* buf) { - unsigned long count = 0; - unsigned long bitmap; - while (*buf) { - while (*buf == ' ' || *buf == '\t' || *buf == ',' || *buf == '\n') - buf++; +/* This function does the following 3 sanitizations for a given resource range: + * 1. 
Ensures the resource as well as range count doesn't exceed limits. + * 2. Ensures that ranges don't overlap like "1-5, 3-4". + * 3. Ensures the ranges aren't malformed like "1-5, 7-1". + * Returns -1 error on failure and 0 on success. + */ +static int sanitize_hw_resource_range(PAL_RES_RANGE_INFO* res_info, uint64_t res_min_limit, + uint64_t res_max_limit, uint64_t range_min_limit, + uint64_t range_max_limit) { + uint64_t resource_count = res_info->resource_count; + if (!IS_IN_RANGE_INCL(resource_count, res_min_limit, res_max_limit)) { + log_error("Invalid resource count: %lu", resource_count); + return -1; + } - if (*buf == '\0') - break; + uint64_t range_count = res_info->range_count; + if (!IS_IN_RANGE_INCL(range_count, 1, 1 << 7)) { + log_error("Invalid range count: %lu", range_count); + return -1; + } - const char* end = NULL; - /* Linux uses different bitmap size depending on the host arch. We intentionally use - * unsigned long to adapt for this variable bitness. */ - if (str_to_ulong(buf, 16, &bitmap, &end) < 0) - return -EINVAL; + if (!res_info->ranges) + return -1; - if (*end != '\0' && *end != ',' && *end != '\n') - return -EINVAL; + bool check_for_overlaps = false; + uint64_t previous_end = 0; + for (uint64_t i = 0; i < range_count; i++) { - count += count_ulong_bits_set(bitmap); - if (count > LONG_MAX) - return -EINVAL; + uint64_t start = res_info->ranges[i].start; + uint64_t end = res_info->ranges[i].end; - buf = end; - } - return (long)count; -} + /* Ensure start and end fall within range limits */ + if (!IS_IN_RANGE_INCL(start, range_min_limit, range_max_limit)) { + log_error("Invalid start range: %lu", start); + return -1; + } -/* This function counts number of hw resources present in buffer. There are 2 options available, - * 1) ordered == true, which ensures that buffer doesn't have overlapping range like "1-5,3-4" or - * malformed like "1-5,7-1". - * 2) ordered == false which simply counts the range of numbers. 
For example "1-5, 3-4, 7-1" will - return 14 as count. - * Returns negative unix error if buf is empty or contains invalid data and number of hw resources - * present in the buffer on success. */ -static long sanitize_hw_resource_count(const char* buf, bool ordered) { - bool init_done = false; - unsigned long current_maxint = 0; - unsigned long resource_cnt = 0; - while (*buf) { - while (*buf == ' ' || *buf == '\t' || *buf == ',' || *buf == '\n') - buf++; - - if (*buf == '\0') - break; - - const char* end = NULL; - unsigned long firstint; - /* Intentionally using unsigned long to adapt for variable bitness. */ - if (str_to_ulong(buf, 10, &firstint, &end) < 0 || firstint > LONG_MAX) - return -EINVAL; - - if (ordered) { - if (init_done && firstint <= current_maxint) - return -EINVAL; - current_maxint = firstint; - init_done = true; + if ((start != end) && !IS_IN_RANGE_INCL(end, start + 1, range_max_limit)) { + log_error("Invalid end range: %lu", end); + return -1; } - /* count the number of HW resources */ - if (*end == '\0' || *end == ',' || *end == '\n' || *end == ' ') { - /* single HW resource index, count as one more */ - resource_cnt++; - } else if (*end == '-') { - /* HW resource range, count how many HW resources are in range */ - buf = end + 1; - unsigned long secondint; - if (str_to_ulong(buf, 10, &secondint, &end) < 0 || secondint > LONG_MAX) - return -EINVAL; - - unsigned long diff; - if (secondint > firstint) { - if (ordered) - current_maxint = secondint; - - diff = secondint - firstint; - if (diff >= LONG_MAX || resource_cnt + diff + 1 > LONG_MAX) - return -EINVAL; - resource_cnt += diff + 1; /* inclusive (e.g. 0-7) */ - } else { - diff = firstint - secondint; - if (ordered || diff >= LONG_MAX || resource_cnt + diff + 1 > LONG_MAX) - return -EINVAL; - resource_cnt += diff + 1; - } + /* check for overlaps like "1-5, 3-4". Note: we skip this check for the first time as + * `previous_end` is not yet initialized. 
*/ + if (check_for_overlaps && previous_end >= start) { + log_error("Malformed range: previous_end = %lu, current start = %lu", previous_end, + start); + return -1; } - buf = end; + previous_end = end; + + /* Start checking for overlaps after the first range */ + check_for_overlaps = true; } - return (long)resource_cnt ?: -EINVAL; + + return 0; } -static int sanitize_cache_topology_info(PAL_CORE_CACHE_INFO* cache, int64_t cache_lvls, - int64_t num_cores) { - for (int64_t lvl = 0; lvl < cache_lvls; lvl++) { - int64_t shared_cpu_map = count_bits_set_from_resource_map(cache[lvl].shared_cpu_map); - if (!IS_IN_RANGE_INCL(shared_cpu_map, 1, num_cores)) - return -EINVAL; +static int sanitize_cache_topology_info(PAL_CORE_CACHE_INFO* cache, uint64_t cache_lvls, + uint64_t num_cores) { + for (uint64_t lvl = 0; lvl < cache_lvls; lvl++) { + if (cache[lvl].type != CACHE_TYPE_DATA && cache[lvl].type != CACHE_TYPE_INSTRUCTION && + cache[lvl].type != CACHE_TYPE_UNIFIED) { + return -1; + } - int64_t level = extract_long_from_buffer(cache[lvl].level); + uint64_t max_limit; + if (cache[lvl].type == CACHE_TYPE_DATA || cache[lvl].type == CACHE_TYPE_INSTRUCTION) { + max_limit = 2; /* Taking HT into account */ + } else { + /* if unified cache then it can range up to total number of cores. 
*/ + max_limit = num_cores; + } + + int ret = sanitize_hw_resource_range(&cache[lvl].shared_cpu_map, 1, max_limit, 0, + num_cores); + if (ret < 0) { + log_error("Invalid cache[%lu].shared_cpu_map", lvl); + return -1; + } + + uint64_t level = cache[lvl].level; if (!IS_IN_RANGE_INCL(level, 1, 3)) /* x86 processors have max of 3 cache levels */ - return -EINVAL; + return -1; - char* type = cache[lvl].type; - if (!strstartswith(type, "Data") && !strstartswith(type, "Instruction") && - !strstartswith(type, "Unified")) { - return -EINVAL; + if (cache[lvl].size_multiplier != MULTIPLIER_KB && + cache[lvl].size_multiplier != MULTIPLIER_MB && + cache[lvl].size_multiplier != MULTIPLIER_GB && + cache[lvl].size_multiplier != MULTIPLIER_NONE) { + return -1; } - int64_t size = extract_long_from_buffer(cache[lvl].size); - if (!IS_IN_RANGE_INCL(size, 1, 1 << 30)) - return -EINVAL; + uint64_t multiplier = 1; + if (cache[lvl].size_multiplier == MULTIPLIER_KB) + multiplier = 1024; + else if (cache[lvl].size_multiplier == MULTIPLIER_MB) + multiplier = 1024 * 1024; + else if (cache[lvl].size_multiplier == MULTIPLIER_GB) + multiplier = 1024 * 1024 * 1024; + + uint64_t cache_size; + if (__builtin_mul_overflow(cache[lvl].size, multiplier, &cache_size)) + return -1; - int64_t coherency_line_size = extract_long_from_buffer(cache[lvl].coherency_line_size); + if (!IS_IN_RANGE_INCL(cache_size, 1, 1 << 30)) + return -1; + + uint64_t coherency_line_size = cache[lvl].coherency_line_size; if (!IS_IN_RANGE_INCL(coherency_line_size, 1, 1 << 16)) - return -EINVAL; + return -1; - int64_t number_of_sets = extract_long_from_buffer(cache[lvl].number_of_sets); + uint64_t number_of_sets = cache[lvl].number_of_sets; if (!IS_IN_RANGE_INCL(number_of_sets, 1, 1 << 30)) - return -EINVAL; + return -1; - int64_t physical_line_partition = - extract_long_from_buffer(cache[lvl].physical_line_partition); + uint64_t physical_line_partition = cache[lvl].physical_line_partition; if 
(!IS_IN_RANGE_INCL(physical_line_partition, 1, 1 << 16)) - return -EINVAL; + return -1; } return 0; } -static int sanitize_core_topology_info(PAL_CORE_TOPO_INFO* core_topology, int64_t num_cores, - int64_t cache_lvls) { +static int sanitize_core_topology_info(PAL_CORE_TOPO_INFO* core_topology, uint64_t num_cores, + uint64_t cache_lvls) { + int ret; if (num_cores == 0 || cache_lvls == 0) - return -ENOENT; + return -1; - for (int64_t idx = 0; idx < num_cores; idx++) { + for (uint64_t idx = 0; idx < num_cores; idx++) { if (idx != 0) { /* core 0 is always online */ - int64_t is_core_online = - extract_long_from_buffer(core_topology[idx].is_logical_core_online); + uint64_t is_core_online = core_topology[idx].is_logical_core_online; if (is_core_online != 0 && is_core_online != 1) - return -EINVAL; + return -1; } - int64_t core_id = extract_long_from_buffer(core_topology[idx].core_id); - if (!IS_IN_RANGE_INCL(core_id, 0, num_cores - 1)) - return -EINVAL; + uint64_t core_id = core_topology[idx].core_id; + if (core_id > num_cores - 1) + return -1; - int64_t core_siblings = count_bits_set_from_resource_map(core_topology[idx].core_siblings); - if (!IS_IN_RANGE_INCL(core_siblings, 1, num_cores)) - return -EINVAL; + ret = sanitize_hw_resource_range(&core_topology[idx].core_siblings, 1, num_cores, 0, + num_cores); + if (ret < 0) { + log_error("Invalid core_topology[%lu].core_siblings", idx); + return -1; + } - int64_t thread_siblings = - count_bits_set_from_resource_map(core_topology[idx].thread_siblings); - if (!IS_IN_RANGE_INCL(thread_siblings, 1, 4)) /* x86 processors have max of 4 SMT siblings */ - return -EINVAL; + /* Max. 
SMT siblings currently supported on x86 processors is 4 */ + ret = sanitize_hw_resource_range(&core_topology[idx].thread_siblings, 1, 4, 0, num_cores); + if (ret < 0) { + log_error("Invalid core_topology[%lu].thread_siblings", idx); + return -1; + } if (sanitize_cache_topology_info(core_topology[idx].cache, cache_lvls, num_cores) < 0) - return -EINVAL; + return -1; } return 0; } -static int sanitize_socket_info(int* cpu_socket, int64_t num_cores) { - if (num_cores == 0) - return -ENOENT; +static int sanitize_socket_info(PAL_TOPO_INFO* topo_info) { + int ret = 0; + uint64_t prev_socket = UINT32_MAX; + + uint64_t num_sockets = topo_info->num_sockets; + PAL_RES_RANGE_INFO* socket_info = + (PAL_RES_RANGE_INFO*)calloc(num_sockets, sizeof(PAL_RES_RANGE_INFO)); + if (!socket_info) + return -1; + + uint64_t num_cores = topo_info->online_logical_cores.resource_count; + for (uint64_t idx = 0; idx < num_cores; idx++) { + uint64_t socket = (int64_t)topo_info->core_topology[idx].cpu_socket; + if (socket > num_sockets - 1) { + ret = -1; + goto out_socket; + } + + /* Extract cores that are part of each socket to validate against core_siblings. + * Note: Although a clever attacker might modify both of these values, idea here is to + * provide a consistent view of the topology. */ + if (socket != prev_socket) { + socket_info[socket].range_count++; + size_t new_sz = sizeof(PAL_RANGE_INFO) * socket_info[socket].range_count; + size_t old_sz = new_sz - sizeof(PAL_RANGE_INFO); + void* tmp = malloc(new_sz); + if (!tmp) { + ret = -1; + goto out_socket; + } - for (int64_t idx = 0; idx < num_cores; idx++) { - /* Virtual environments such as QEMU may assign each core to a separate socket/package with - * one or more NUMA nodes. So we check against the number of online logical cores. 
*/ - if (!IS_IN_RANGE_INCL(cpu_socket[idx], 0, num_cores - 1)) - return -EINVAL; + if (socket_info[socket].ranges) { + memcpy(tmp, socket_info[socket].ranges, old_sz); + free(socket_info[socket].ranges); + } + socket_info[socket].ranges = tmp; + + uint64_t range_idx = socket_info[socket].range_count - 1; + socket_info[socket].ranges[range_idx].start = idx; + socket_info[socket].ranges[range_idx].end = idx; + prev_socket = socket; + } else { + uint64_t range_idx = socket_info[socket].range_count - 1; + socket_info[socket].ranges[range_idx].end = idx; + } } - return 0; -} -static int sanitize_numa_topology_info(PAL_NUMA_TOPO_INFO* numa_topology, int64_t num_nodes, - int64_t num_cores) { - if (num_nodes == 0 || num_cores == 0) - return -ENOENT; + /* core-siblings represent all the cores that are part of a socket. We cross-verify the + * socket info against this. */ + for (uint64_t idx = 0; idx < num_sockets; idx++) { + if (!socket_info[idx].range_count || !socket_info[idx].ranges) { + ret = -1; + goto out_socket; + } + + uint64_t core_in_socket = socket_info[idx].ranges[0].start; + uint64_t core_sibling_cnt = + topo_info->core_topology[core_in_socket].core_siblings.range_count; + + if (core_sibling_cnt != socket_info[idx].range_count) { + ret = -1; + goto out_socket; + } - for (int64_t idx = 0; idx < num_nodes; idx++) { - int64_t cpumap = count_bits_set_from_resource_map(numa_topology[idx].cpumap); - if (!IS_IN_RANGE_INCL(cpumap, 1, num_cores)) - return -EINVAL; + PAL_RANGE_INFO* core_sibling_ranges = + topo_info->core_topology[core_in_socket].core_siblings.ranges; + for (uint64_t j = 0; j < core_sibling_cnt; j++) { + if (socket_info[idx].ranges[j].start != core_sibling_ranges[j].start || + socket_info[idx].ranges[j].end != core_sibling_ranges[j].end) { + ret = -1; + goto out_socket; + } + } + } - if (num_nodes != sanitize_hw_resource_count(numa_topology[idx].distance, /*ordered=*/false)) - return -EINVAL; + ret = 0; +out_socket: + for (uint64_t i = 0; i < 
num_sockets; i++) { + if (socket_info[i].ranges) + free(socket_info[i].ranges); } - return 0; + free(socket_info); + return ret; } -/* This function doesn't clean up resources on failure, assuming that we terminate right away in - * such case. */ -static int parse_host_topo_info(struct pal_sec* sec_info) { - if (sec_info->online_logical_cores > INT64_MAX) - return -1; - int64_t online_logical_cores = (int64_t)sec_info->online_logical_cores; - if (!IS_IN_RANGE_INCL(online_logical_cores, 1, 1 << 16)) { - log_error("Invalid sec_info.online_logical_cores: %ld", online_logical_cores); +static int sanitize_numa_topology_info(PAL_NUMA_TOPO_INFO* numa_topology, uint64_t num_nodes, + uint64_t num_cores, uint64_t possible_cores) { + int ret = 0; + uint64_t num_cpumask = BITS_TO_INTS(possible_cores); + unsigned int* bitmap = (unsigned int*)calloc(num_cpumask, sizeof(unsigned int)); + if (!bitmap) return -1; + + uint64_t total_cores_in_numa = 0; + for (uint64_t idx = 0; idx < num_nodes; idx++) { + ret = sanitize_hw_resource_range(&numa_topology[idx].cpumap, 1, num_cores, 0, num_cores); + if (ret < 0) { + log_error("Invalid numa_topology[%lu].cpumap", idx); + goto out_numa; + } + + /* Ensure that each NUMA has unique cores */ + for (uint64_t i = 0; i < numa_topology[idx].cpumap.range_count; i++) { + uint64_t start = numa_topology[idx].cpumap.ranges[i].start; + uint64_t end = numa_topology[idx].cpumap.ranges[i].end; + for (uint64_t j = start; j <= end; j++) { + uint64_t index = j / BITS_IN_TYPE(int); + assert(index < num_cpumask); + + if (bitmap[index] & (1U << (j % BITS_IN_TYPE(int)))) { + log_error("Invalid numa_topology: Core %lu found in multiple numa nodes", j); + ret = -1; + goto out_numa; + } + bitmap[index] |= 1U << (j % BITS_IN_TYPE(int)); + total_cores_in_numa++; + } + } + + uint64_t distances = numa_topology[idx].distance.resource_count; + if (distances != num_nodes) { + log_error("Invalid numa_topology[%lu].distance", idx); + ret = -1; + goto out_numa; + } } - 
g_pal_sec.online_logical_cores = online_logical_cores; - if (online_logical_cores != sanitize_hw_resource_count(sec_info->topo_info.online_logical_cores, - /*ordered=*/true)) { - log_error("Invalid sec_info.topo_info.online_logical_cores"); - return -1; + if (total_cores_in_numa != num_cores) { + log_error("Invalid numa_topology: more cores in NUMA than online"); + ret = -1; + goto out_numa; } - COPY_ARRAY(g_pal_sec.topo_info.online_logical_cores, sec_info->topo_info.online_logical_cores); - if (sec_info->possible_logical_cores > INT64_MAX) - return -1; - int64_t possible_logical_cores = (int64_t)sec_info->possible_logical_cores; - if (!IS_IN_RANGE_INCL(possible_logical_cores, 1, 1 << 16)) { - log_error("Invalid sec_info.possible_logical_cores: %ld", possible_logical_cores); +out_numa: + free(bitmap); + return ret; +} + +static int sgx_copy_core_topo_to_enclave(PAL_CORE_TOPO_INFO* src, uint64_t online_logical_cores, + uint64_t num_cache_index) { + PAL_CORE_TOPO_INFO* core_topo = (PAL_CORE_TOPO_INFO*)malloc(online_logical_cores * + sizeof(PAL_CORE_TOPO_INFO)); + if (!core_topo) { + log_error("Allocation for core topology failed"); return -1; } - g_pal_sec.possible_logical_cores = possible_logical_cores; - if (possible_logical_cores != - sanitize_hw_resource_count(sec_info->topo_info.possible_logical_cores, /*ordered=*/true)) { - log_error("Invalid sec_info.topo_info.possible_logical_cores"); - return -1; + for (uint64_t idx = 0; idx < online_logical_cores; idx++) { + core_topo[idx].is_logical_core_online = src[idx].is_logical_core_online; + core_topo[idx].core_id = src[idx].core_id; + core_topo[idx].cpu_socket = src[idx].cpu_socket; + + int ret = copy_hw_resource_range(&src[idx].core_siblings, &core_topo[idx].core_siblings); + if (ret < 0) { + log_error("Copying core_topo[%lu].core_siblings failed", idx); + return -1; + } + + ret = copy_hw_resource_range(&src[idx].thread_siblings, &core_topo[idx].thread_siblings); + if (ret < 0) { + log_error("Copying 
core_topo[%lu].core_siblings failed", idx); + return -1; + } + + /* Allocate enclave memory to store cache info */ + PAL_CORE_CACHE_INFO* cache_info = (PAL_CORE_CACHE_INFO*)malloc(num_cache_index * + sizeof(PAL_CORE_CACHE_INFO)); + if (!cache_info) { + log_error("Allocation for cache_info failed"); + return -1; + } + + for (uint64_t lvl = 0; lvl < num_cache_index; lvl++) { + cache_info[lvl].level = src[idx].cache[lvl].level; + cache_info[lvl].type = src[idx].cache[lvl].type; + cache_info[lvl].size = src[idx].cache[lvl].size; + cache_info[lvl].size_multiplier = src[idx].cache[lvl].size_multiplier; + cache_info[lvl].coherency_line_size = src[idx].cache[lvl].coherency_line_size; + cache_info[lvl].number_of_sets = src[idx].cache[lvl].number_of_sets; + cache_info[lvl].physical_line_partition = src[idx].cache[lvl].physical_line_partition; + + ret = copy_hw_resource_range(&src[idx].cache[lvl].shared_cpu_map, + &cache_info[lvl].shared_cpu_map); + if (ret < 0) { + log_error("Copying core_topo[%lu].cache[%lu].shared_cpu_map failed", idx, lvl); + return -1; + } + } + core_topo[idx].cache = cache_info; } - COPY_ARRAY(g_pal_sec.topo_info.possible_logical_cores, - sec_info->topo_info.possible_logical_cores); + g_pal_sec.topo_info.core_topology = core_topo; + + return 0; +} - if (!IS_IN_RANGE_INCL(sec_info->physical_cores_per_socket, 1, 1 << 13)) { - log_error("Invalid sec_info.physical_cores_per_socket: %ld", - sec_info->physical_cores_per_socket); +static int sgx_copy_numa_topo_to_enclave(PAL_NUMA_TOPO_INFO* src, uint64_t num_online_nodes) { + PAL_NUMA_TOPO_INFO* numa_topo = (PAL_NUMA_TOPO_INFO*)malloc(num_online_nodes * + sizeof(PAL_NUMA_TOPO_INFO)); + if (!numa_topo) { + log_error("Allocation for numa topology failed"); return -1; } - g_pal_sec.physical_cores_per_socket = sec_info->physical_cores_per_socket; - if (sec_info->topo_info.num_online_nodes > INT64_MAX) - return -1; - int64_t num_online_nodes = (int64_t)sec_info->topo_info.num_online_nodes; - if 
(!IS_IN_RANGE_INCL(num_online_nodes, 1, 1 << 8)) { - log_error("Invalid sec_info.topo_info.num_online_nodes: %ld", num_online_nodes); - return -1; + for (uint64_t idx = 0; idx < num_online_nodes; idx++) { + numa_topo[idx].nr_hugepages[HUGEPAGES_2M] = src[idx].nr_hugepages[HUGEPAGES_2M]; + numa_topo[idx].nr_hugepages[HUGEPAGES_1G] = src[idx].nr_hugepages[HUGEPAGES_1G]; + + int ret = copy_hw_resource_range(&src[idx].cpumap, &numa_topo[idx].cpumap); + if (ret < 0) { + log_error("Copying numa_topo[%lu].core_siblings failed", idx); + return -1; + } + + ret = copy_hw_resource_range(&src[idx].distance, &numa_topo[idx].distance); + if (ret < 0) { + log_error("Copying numa_topo[%lu].core_siblings failed", idx); + return -1; + } } - g_pal_sec.topo_info.num_online_nodes = num_online_nodes; + g_pal_sec.topo_info.numa_topology = numa_topo; - if (num_online_nodes != sanitize_hw_resource_count(sec_info->topo_info.online_nodes, - /*ordered=*/true)) { - log_error("Invalid sec_info.topo_info.online_nodes"); + return 0; +} + +static int parse_host_topo_info(PAL_TOPO_INFO* topo_info) { + int ret = sanitize_hw_resource_range(&topo_info->online_logical_cores, 1, 1 << 16, 0, 1 << 16); + if (ret < 0) { + log_error("Invalid sec_info.topo_info.online_logical_cores"); + return -1; + } + ret = copy_hw_resource_range(&topo_info->online_logical_cores, + &g_pal_sec.topo_info.online_logical_cores); + if (ret < 0) { + log_error("Copying sec_info.topo_info.online_logical_cores failed"); return -1; } - COPY_ARRAY(g_pal_sec.topo_info.online_nodes, sec_info->topo_info.online_nodes); - if (sec_info->topo_info.num_cache_index > INT64_MAX) + ret = sanitize_hw_resource_range(&topo_info->possible_logical_cores, 1, 1 << 16, 0, 1 << 16); + if (ret < 0) { + log_error("Invalid sec_info.topo_info.possible_logical_cores"); return -1; - int64_t num_cache_index = (int64_t)sec_info->topo_info.num_cache_index; - if (!IS_IN_RANGE_INCL(num_cache_index, 1, 1 << 4)) { - log_error("Invalid 
sec_info.topo_info.num_cache_index: %ld", num_cache_index); + } + ret = copy_hw_resource_range(&topo_info->possible_logical_cores, + &g_pal_sec.topo_info.possible_logical_cores); + if (ret < 0) { + log_error("Copying sec_info.topo_info.possible_logical_cores failed"); return -1; } - g_pal_sec.topo_info.num_cache_index = num_cache_index; - /* Sanitize logical core -> socket mappings */ - int ret = sanitize_socket_info(sec_info->cpu_socket, online_logical_cores); + ret = sanitize_hw_resource_range(&topo_info->nodes, 1, 1 << 16, 0, 1 << 16); if (ret < 0) { - log_error("Sanitization of logical core -> socket mappings failed"); + log_error("Invalid sec_info.topo_info.nodes"); return -1; } - - /* Allocate enclave memory to store "logical core -> socket" mappings */ - int* cpu_socket = (int*)malloc(online_logical_cores * sizeof(int)); - if (!cpu_socket) { - log_error("Allocation for logical core -> socket mappings failed"); + ret = copy_hw_resource_range(&topo_info->nodes, &g_pal_sec.topo_info.nodes); + if (ret < 0) { + log_error("Copying sec_info.topo_info.nodes failed"); return -1; } - if (!sgx_copy_to_enclave(cpu_socket, online_logical_cores * sizeof(int), sec_info->cpu_socket, - online_logical_cores * sizeof(int))) { - log_error("Copying cpu_socket into the enclave failed"); + uint64_t num_cache_index = topo_info->num_cache_index; + if (!IS_IN_RANGE_INCL(num_cache_index, 1, 1 << 4)) { + log_error("Invalid sec_info.topo_info.num_cache_index: %lu", num_cache_index); return -1; } - g_pal_sec.cpu_socket = cpu_socket; + g_pal_sec.topo_info.num_cache_index = num_cache_index; /* Sanitize core topology information */ - ret = sanitize_core_topology_info(sec_info->topo_info.core_topology, online_logical_cores, + uint64_t online_logical_cores = g_pal_sec.topo_info.online_logical_cores.resource_count; + ret = sanitize_core_topology_info(topo_info->core_topology, online_logical_cores, num_cache_index); if (ret < 0) { log_error("Sanitization of core_topology failed"); @@ -451,60 
+591,52 @@ static int parse_host_topo_info(struct pal_sec* sec_info) { } /* Allocate enclave memory to store core topology info */ - PAL_CORE_TOPO_INFO* core_topology = (PAL_CORE_TOPO_INFO*)malloc(online_logical_cores * - sizeof(PAL_CORE_TOPO_INFO)); - if (!core_topology) { - log_error("Allocation for core topology failed"); + ret = sgx_copy_core_topo_to_enclave(topo_info->core_topology, online_logical_cores, + num_cache_index); + if (ret < 0) { + log_error("Copying core_topology into the enclave failed"); return -1; } - if (!sgx_copy_to_enclave(core_topology, online_logical_cores * sizeof(PAL_CORE_TOPO_INFO), - sec_info->topo_info.core_topology, - online_logical_cores * sizeof(PAL_CORE_TOPO_INFO))) { - log_error("Copying core_topology into the enclave failed"); + if (!IS_IN_RANGE_INCL(topo_info->physical_cores_per_socket, 1, 1 << 13)) { + log_error("Invalid sec_info.physical_cores_per_socket: %lu", + topo_info->physical_cores_per_socket); return -1; } + g_pal_sec.topo_info.physical_cores_per_socket = topo_info->physical_cores_per_socket; - /* Allocate enclave memory to store cache info */ - PAL_CORE_CACHE_INFO* cache_info = (PAL_CORE_CACHE_INFO*)malloc(num_cache_index * - sizeof(PAL_CORE_CACHE_INFO)); - if (!cache_info) { - log_error("Allocation for cache_info failed"); + uint64_t num_sockets = topo_info->num_sockets; + /* Virtual environments such as QEMU may assign each core to a separate socket/package with + * one or more NUMA nodes. So we check against the number of online logical cores. 
*/ + if (!IS_IN_RANGE_INCL(num_sockets, 1, online_logical_cores)) { + log_error("Invalid sec_info.topo_info.num_cache_index: %lu", num_cache_index); return -1; } + g_pal_sec.topo_info.num_sockets = num_sockets; - if (!sgx_copy_to_enclave(cache_info, num_cache_index * sizeof(PAL_CORE_CACHE_INFO), - sec_info->topo_info.core_topology->cache, - num_cache_index * sizeof(PAL_CORE_CACHE_INFO))) { - log_error("Copying cache_info into the enclave failed"); + /* Sanitize logical core -> socket mappings */ + ret = sanitize_socket_info(topo_info); + if (ret < 0) { + log_error("Sanitization of logical core -> socket mappings failed"); return -1; } - core_topology->cache = cache_info; - g_pal_sec.topo_info.core_topology = core_topology; /* Sanitize numa topology information */ - ret = sanitize_numa_topology_info(sec_info->topo_info.numa_topology, num_online_nodes, - online_logical_cores); + uint64_t possible_cores = g_pal_sec.topo_info.possible_logical_cores.resource_count; + uint64_t num_online_nodes = g_pal_sec.topo_info.nodes.resource_count; + ret = sanitize_numa_topology_info(topo_info->numa_topology, num_online_nodes, + online_logical_cores, possible_cores); if (ret < 0) { log_error("Sanitization of numa_topology failed"); return -1; } /* Allocate enclave memory to store numa topology info */ - PAL_NUMA_TOPO_INFO* numa_topology = (PAL_NUMA_TOPO_INFO*)malloc(num_online_nodes * - sizeof(PAL_NUMA_TOPO_INFO)); - if (!numa_topology) { - log_error("Allocation for numa topology failed"); - return -1; - } - - if (!sgx_copy_to_enclave(numa_topology, num_online_nodes * sizeof(PAL_NUMA_TOPO_INFO), - sec_info->topo_info.numa_topology, - num_online_nodes * sizeof(PAL_NUMA_TOPO_INFO))) { + ret = sgx_copy_numa_topo_to_enclave(topo_info->numa_topology, num_online_nodes); + if (ret < 0) { log_error("Copying numa_topology into the enclave failed"); return -1; } - g_pal_sec.topo_info.numa_topology = numa_topology; return 0; } @@ -644,7 +776,7 @@ noreturn void pal_linux_main(char* 
uptr_libpal_uri, size_t libpal_uri_len, char* (void)get_tsc(); /* must be after `ready_for_exceptions=1` since it may generate SIGILL */ /* Now that enclave memory is set up, parse and store host topology info into g_pal_sec struct */ - ret = parse_host_topo_info(&sec_info); + ret = parse_host_topo_info(&sec_info.topo_info); if (ret < 0) ocall_exit(1, /*is_exitgroup=*/true); diff --git a/Pal/src/host/Linux-SGX/db_misc.c b/Pal/src/host/Linux-SGX/db_misc.c index 5776c4ea76..8243867593 100644 --- a/Pal/src/host/Linux-SGX/db_misc.c +++ b/Pal/src/host/Linux-SGX/db_misc.c @@ -728,10 +728,6 @@ int _DkGetCPUInfo(PAL_CPU_INFO* ci) { brand[BRAND_SIZE - 1] = '\0'; ci->cpu_brand = brand; - ci->online_logical_cores = g_pal_sec.online_logical_cores; - ci->physical_cores_per_socket = g_pal_sec.physical_cores_per_socket; - ci->cpu_socket = g_pal_sec.cpu_socket; - _DkCpuIdRetrieve(1, 0, words); ci->cpu_family = BIT_EXTRACT_LE(words[CPUID_WORD_EAX], 8, 12) + BIT_EXTRACT_LE(words[CPUID_WORD_EAX], 20, 28); @@ -788,15 +784,48 @@ int _DkGetCPUInfo(PAL_CPU_INFO* ci) { return rv; } +static int copy_resource_range(PAL_RES_RANGE_INFO* src, PAL_RES_RANGE_INFO* dest) { + uint64_t range_cnt = src->range_count; + PAL_RANGE_INFO* ranges = (PAL_RANGE_INFO*)malloc(range_cnt * sizeof(PAL_RANGE_INFO)); + if (!ranges) { + log_error("Range allocation failed"); + return -PAL_ERROR_NOMEM; + } + + memcpy(ranges, src->ranges, range_cnt * sizeof(PAL_RANGE_INFO)); + dest->ranges = ranges; + dest->range_count = range_cnt; + dest->resource_count = src->resource_count; + return 0; +} + +/* This function doesn't clean up resources on failure, as we terminate on failure. 
*/ int _DkGetTopologyInfo(PAL_TOPO_INFO* topo_info) { - topo_info->num_online_nodes = g_pal_sec.topo_info.num_online_nodes; - topo_info->num_cache_index = g_pal_sec.topo_info.num_cache_index; - topo_info->core_topology = g_pal_sec.topo_info.core_topology; - topo_info->numa_topology = g_pal_sec.topo_info.numa_topology; - COPY_ARRAY(topo_info->online_logical_cores, g_pal_sec.topo_info.online_logical_cores); - COPY_ARRAY(topo_info->possible_logical_cores, g_pal_sec.topo_info.possible_logical_cores); - COPY_ARRAY(topo_info->online_nodes, g_pal_sec.topo_info.online_nodes); + int ret = copy_resource_range(&g_pal_sec.topo_info.online_logical_cores, + &topo_info->online_logical_cores); + if (ret < 0) { + log_error("Copying g_pal_sec.topo_info.online_logical_cores failed"); + return ret; + } + + ret = copy_resource_range(&g_pal_sec.topo_info.possible_logical_cores, + &topo_info->possible_logical_cores); + if (ret < 0) { + log_error("Copying g_pal_sec.topo_info.possible_logical_cores failed"); + return ret; + } + + ret = copy_resource_range(&g_pal_sec.topo_info.nodes, &topo_info->nodes); + if (ret < 0) { + log_error("Copying g_pal_sec.topo_info.nodes failed"); + return ret; + } + topo_info->num_cache_index = g_pal_sec.topo_info.num_cache_index; + topo_info->num_sockets = g_pal_sec.topo_info.num_sockets; + topo_info->physical_cores_per_socket = g_pal_sec.topo_info.physical_cores_per_socket; + topo_info->core_topology = g_pal_sec.topo_info.core_topology; + topo_info->numa_topology = g_pal_sec.topo_info.numa_topology; return 0; } diff --git a/Pal/src/host/Linux-SGX/enclave_ocalls.c b/Pal/src/host/Linux-SGX/enclave_ocalls.c index 200af9b06b..3707c9f595 100644 --- a/Pal/src/host/Linux-SGX/enclave_ocalls.c +++ b/Pal/src/host/Linux-SGX/enclave_ocalls.c @@ -1881,7 +1881,7 @@ int ocall_sched_setaffinity(void* tcs, size_t cpumask_size, void* cpu_mask) { static bool is_cpumask_valid(void* cpu_mask, size_t cpumask_size) { size_t max_cpumask_bits = cpumask_size * BITS_IN_BYTE; - size_t 
valid_cpumask_bits = g_pal_control.cpu_info.online_logical_cores; + size_t valid_cpumask_bits = g_pal_control.topo_info.online_logical_cores.resource_count; size_t invalid_bits = max_cpumask_bits - valid_cpumask_bits; if (invalid_bits == 0) diff --git a/Pal/src/host/Linux-SGX/pal_security.h b/Pal/src/host/Linux-SGX/pal_security.h index 126e309302..92019a3604 100644 --- a/Pal/src/host/Linux-SGX/pal_security.h +++ b/Pal/src/host/Linux-SGX/pal_security.h @@ -28,11 +28,6 @@ struct pal_sec { /* Child's stream FD created and sent over by parent. * If set to `PAL_IDX_POISON`, we have no parent (this is the first process). */ PAL_IDX stream_fd; - - PAL_NUM online_logical_cores; - PAL_NUM possible_logical_cores; - PAL_NUM physical_cores_per_socket; - int* cpu_socket; PAL_TOPO_INFO topo_info; #ifdef DEBUG diff --git a/Pal/src/host/Linux-SGX/sgx_main.c b/Pal/src/host/Linux-SGX/sgx_main.c index 10e857e34d..aea214be84 100644 --- a/Pal/src/host/Linux-SGX/sgx_main.c +++ b/Pal/src/host/Linux-SGX/sgx_main.c @@ -898,56 +898,6 @@ static int load_enclave(struct pal_enclave* enclave, char* args, size_t args_siz pal_sec->uid = DO_SYSCALL(getuid); pal_sec->gid = DO_SYSCALL(getgid); - /* we cannot use CPUID(0xb) because it counts even disabled-by-BIOS cores (e.g. HT cores); - * instead extract info on total number of logical cores, number of physical cores, - * SMT support etc. 
by parsing sysfs pseudo-files */ - int online_logical_cores = get_hw_resource("/sys/devices/system/cpu/online", /*count=*/true); - if (online_logical_cores < 0) - return online_logical_cores; - pal_sec->online_logical_cores = online_logical_cores; - - int possible_logical_cores = get_hw_resource("/sys/devices/system/cpu/possible", - /*count=*/true); - if (possible_logical_cores < 0) - return possible_logical_cores; - pal_sec->possible_logical_cores = possible_logical_cores; - - /* TODO: correctly support offline cores */ - if (possible_logical_cores > 0 && possible_logical_cores > online_logical_cores) { - log_warning("some CPUs seem to be offline; Gramine doesn't take this into account " - "which may lead to subpar performance"); - } - - int core_siblings = get_hw_resource("/sys/devices/system/cpu/cpu0/topology/core_siblings_list", - /*count=*/true); - if (core_siblings < 0) - return core_siblings; - - int smt_siblings = get_hw_resource("/sys/devices/system/cpu/cpu0/topology/thread_siblings_list", - /*count=*/true); - if (smt_siblings < 0) - return smt_siblings; - pal_sec->physical_cores_per_socket = core_siblings / smt_siblings; - - /* array of "logical core -> socket" mappings */ - int* cpu_socket = (int*)malloc(online_logical_cores * sizeof(int)); - if (!cpu_socket) - return -ENOMEM; - - char filename[128]; - for (int idx = 0; idx < online_logical_cores; idx++) { - snprintf(filename, sizeof(filename), - "/sys/devices/system/cpu/cpu%d/topology/physical_package_id", idx); - cpu_socket[idx] = get_hw_resource(filename, /*count=*/false); - if (cpu_socket[idx] < 0) { - log_error("Cannot read %s", filename); - ret = cpu_socket[idx]; - free(cpu_socket); - return ret; - } - } - pal_sec->cpu_socket = cpu_socket; - ret = get_topology_info(&pal_sec->topo_info); if (ret < 0) return ret; diff --git a/Pal/src/host/Linux-common/topo_info.c b/Pal/src/host/Linux-common/topo_info.c index 5f32891d26..6332bc8a63 100644 --- a/Pal/src/host/Linux-common/topo_info.c +++ 
b/Pal/src/host/Linux-common/topo_info.c @@ -16,7 +16,17 @@ #include "syscall.h" #include "topo_info.h" -int get_hw_resource(const char* filename, bool count) { +int get_hw_resource(const char* filename, bool count, PAL_RES_RANGE_INFO* res_info, + enum size_multiplier* size_mult) { + if (res_info) { + res_info->resource_count = 0; + res_info->range_count = 0; + res_info->ranges = NULL; + } + + if (size_mult) + *size_mult = MULTIPLIER_NONE; + int fd = DO_SYSCALL(open, filename, O_RDONLY | O_CLOEXEC, 0); if (fd < 0) return fd; @@ -46,15 +56,32 @@ int get_hw_resource(const char* filename, bool count) { /* caller wants to read an int stored in the file */ if (!count) { - if (*end == '\n' || *end == '\0') + if (*end == '\n' || *end == '\0' || *end == 'K' || *end == 'M' || *end == 'G') { retval = (int)firstint; + if (size_mult) { + if (*end == 'K') { + *size_mult = MULTIPLIER_KB; + } else if (*end == 'M') { + *size_mult = MULTIPLIER_MB; + } else if (*end == 'G') { + *size_mult = MULTIPLIER_GB; + } else { + *size_mult = MULTIPLIER_NONE; + } + } + } return retval; } - /* caller wants to count the number of HW resources */ - if (*end == '\0' || *end == ',' || *end == '\n') { + uint64_t range_start; + uint64_t range_end; + /* caller wants to count the number of HW resources. If `res_info` struct is passed, range + * information is stored in it as we parse the string. 
*/ + if (*end == '\0' || *end == ',' || *end == '\n' || *end == ' ') { /* single HW resource index, count as one more */ resource_cnt++; + range_start = firstint; + range_end = firstint; } else if (*end == '-') { /* HW resource range, count how many HW resources are in range */ ptr = end + 1; @@ -69,12 +96,39 @@ int get_hw_resource(const char* filename, bool count) { return -EINVAL; resource_cnt += (int)secondint - (int)firstint + 1; //inclusive (e.g., 0-7, or 8-16) } + range_start = firstint; + range_end = secondint; + } else { + /* Illegal character found */ + return -EINVAL; + } + + if (res_info) { + res_info->range_count++; + + /* Realloc as we identify new range when parsing */ + size_t new_sz = sizeof(PAL_RANGE_INFO) * res_info->range_count; + size_t old_sz = new_sz - sizeof(PAL_RANGE_INFO); + void* tmp = malloc(new_sz); + if (!tmp) + return -ENOMEM; + + if (res_info->ranges) { + memcpy(tmp, res_info->ranges, old_sz); + free(res_info->ranges); + } + res_info->ranges = tmp; + res_info->ranges[res_info->range_count - 1].start = range_start; + res_info->ranges[res_info->range_count - 1].end = range_end; } ptr = end; } - if (count && resource_cnt > 0) + if (count && resource_cnt > 0) { retval = resource_cnt; + if (res_info) + res_info->resource_count = resource_cnt; + } return retval; } @@ -92,14 +146,6 @@ int read_file_buffer(const char* filename, char* buf, size_t count) { return ret; } -#define READ_FILE_BUFFER(filepath, buf, failure_label) \ - ({ \ - ret = read_file_buffer(filepath, buf, ARRAY_SIZE(buf)-1); \ - if (ret < 0) \ - goto failure_label; \ - buf[ret] = '\0'; \ - }) - /* Returns number of cache levels present on this system by counting "indexX" dir entries under * `/sys/devices/system/cpu/cpuX/cache` on success and negative UNIX error code on failure. 
*/ static int get_num_cache_level(const char* path) { @@ -135,44 +181,80 @@ static int get_num_cache_level(const char* path) { static int get_cache_topo_info(int num_cache_lvl, int core_idx, PAL_CORE_CACHE_INFO** cache_info) { int ret; - char filename[128]; PAL_CORE_CACHE_INFO* core_cache = (PAL_CORE_CACHE_INFO*)malloc(num_cache_lvl * sizeof(PAL_CORE_CACHE_INFO)); if (!core_cache) { return -ENOMEM; } + char filename[128]; for (int lvl = 0; lvl < num_cache_lvl; lvl++) { snprintf(filename, sizeof(filename), - "/sys/devices/system/cpu/cpu%d/cache/index%d/shared_cpu_map", core_idx, lvl); - READ_FILE_BUFFER(filename, core_cache[lvl].shared_cpu_map, /*failure_label=*/out_cache); + "/sys/devices/system/cpu/cpu%d/cache/index%d/shared_cpu_list", core_idx, lvl); + ret = get_hw_resource(filename, /*count=*/true, &core_cache[lvl].shared_cpu_map, + /*size_mult=*/NULL); + if (ret < 0) + goto out_cache; - snprintf(filename, sizeof(filename), - "/sys/devices/system/cpu/cpu%d/cache/index%d/level", core_idx, lvl); - READ_FILE_BUFFER(filename, core_cache[lvl].level, /*failure_label=*/out_cache); + snprintf(filename, sizeof(filename), "/sys/devices/system/cpu/cpu%d/cache/index%d/level", + core_idx, lvl); + int level = get_hw_resource(filename, /*count=*/false, NULL, /*size_mult=*/NULL); + if (level < 0) + goto out_cache; + core_cache[lvl].level = level; - snprintf(filename, sizeof(filename), - "/sys/devices/system/cpu/cpu%d/cache/index%d/type", core_idx, lvl); - READ_FILE_BUFFER(filename, core_cache[lvl].type, /*failure_label=*/out_cache); + char type[PAL_SYSFS_BUF_FILESZ] = {'\0'}; + snprintf(filename, sizeof(filename), "/sys/devices/system/cpu/cpu%d/cache/index%d/type", + core_idx, lvl); - snprintf(filename, sizeof(filename), - "/sys/devices/system/cpu/cpu%d/cache/index%d/size", core_idx, lvl); - READ_FILE_BUFFER(filename, core_cache[lvl].size, /*failure_label=*/out_cache); + ret = read_file_buffer(filename, type, ARRAY_SIZE(type)-1); + if (ret < 0) + goto out_cache; + type[ret] = 
'\0'; + + if (!strcmp(type, "Unified\n")) { + core_cache[lvl].type = CACHE_TYPE_UNIFIED; + } else if (!strcmp(type, "Instruction\n")) { + core_cache[lvl].type = CACHE_TYPE_INSTRUCTION; + } else if (!strcmp(type, "Data\n")) { + core_cache[lvl].type = CACHE_TYPE_DATA; + } else { + ret = -EINVAL; + goto out_cache; + } + + enum size_multiplier size_mult; + snprintf(filename, sizeof(filename), "/sys/devices/system/cpu/cpu%d/cache/index%d/size", + core_idx, lvl); + int size = get_hw_resource(filename, /*count=*/false, NULL, &size_mult); + if (size < 0) + goto out_cache; + core_cache[lvl].size = size; + core_cache[lvl].size_multiplier = size_mult; snprintf(filename, sizeof(filename), "/sys/devices/system/cpu/cpu%d/cache/index%d/coherency_line_size", core_idx, lvl); - READ_FILE_BUFFER(filename, core_cache[lvl].coherency_line_size, - /*failure_label=*/out_cache); + int coherency_line_size = get_hw_resource(filename, /*count=*/false, NULL, + /*size_mult=*/NULL); + if (coherency_line_size < 0) + goto out_cache; + core_cache[lvl].coherency_line_size = coherency_line_size; snprintf(filename, sizeof(filename), "/sys/devices/system/cpu/cpu%d/cache/index%d/number_of_sets", core_idx, lvl); - READ_FILE_BUFFER(filename, core_cache[lvl].number_of_sets, /*failure_label=*/out_cache); + int num_sets = get_hw_resource(filename, /*count=*/false, NULL, /*size_mult=*/NULL); + if (num_sets < 0) + goto out_cache; + core_cache[lvl].number_of_sets = num_sets; snprintf(filename, sizeof(filename), - "/sys/devices/system/cpu/cpu%d/cache/index%d/physical_line_partition", core_idx, - lvl); - READ_FILE_BUFFER(filename, core_cache[lvl].physical_line_partition, - /*failure_label=*/out_cache); + "/sys/devices/system/cpu/cpu%d/cache/index%d/physical_line_partition", core_idx, lvl); + int physical_line_partition = get_hw_resource(filename, /*count=*/false, NULL, + /*size_mult=*/NULL); + if (physical_line_partition < 0) + goto out_cache; + core_cache[lvl].physical_line_partition = physical_line_partition; + 
} *cache_info = core_cache; return 0; @@ -184,16 +266,29 @@ static int get_cache_topo_info(int num_cache_lvl, int core_idx, PAL_CORE_CACHE_I /* Get core topology-related info */ static int get_core_topo_info(PAL_TOPO_INFO* topo_info) { - int ret; - READ_FILE_BUFFER("/sys/devices/system/cpu/online", topo_info->online_logical_cores, - /*failure_label=*/out); + int ret = get_hw_resource("/sys/devices/system/cpu/online", /*count=*/true, + &topo_info->online_logical_cores, /*size_mult=*/NULL); + if (ret < 0) + return ret; - READ_FILE_BUFFER("/sys/devices/system/cpu/possible", topo_info->possible_logical_cores, - /*failure_label=*/out); + ret = get_hw_resource("/sys/devices/system/cpu/possible", /*count=*/true, + &topo_info->possible_logical_cores, /*size_mult=*/NULL); + if (ret < 0) + return ret; + + if (topo_info->online_logical_cores.resource_count > INT32_MAX) + return -EINVAL; + int online_logical_cores = topo_info->online_logical_cores.resource_count; + + if (topo_info->possible_logical_cores.resource_count > INT32_MAX) + return -EINVAL; + int possible_logical_cores = topo_info->possible_logical_cores.resource_count; - int online_logical_cores = get_hw_resource("/sys/devices/system/cpu/online", /*count=*/true); - if (online_logical_cores < 0) - return online_logical_cores; + /* TODO: correctly support offline cores */ + if (possible_logical_cores > online_logical_cores) { + log_warning("Some CPUs seem to be offline; Gramine doesn't take this into account which " + "may lead to subpar performance"); + } int num_cache_lvl = get_num_cache_level("/sys/devices/system/cpu/cpu0/cache"); if (num_cache_lvl < 0) @@ -205,52 +300,70 @@ static int get_core_topo_info(PAL_TOPO_INFO* topo_info) { if (!core_topology) return -ENOMEM; + int current_max_socket = -1; char filename[128]; for (int idx = 0; idx < online_logical_cores; idx++) { /* cpu0 is always online and thus the "online" file is not present. 
*/ if (idx != 0) { snprintf(filename, sizeof(filename), "/sys/devices/system/cpu/cpu%d/online", idx); - READ_FILE_BUFFER(filename, core_topology[idx].is_logical_core_online, - /*failure_label=*/out_topology); + ret = get_hw_resource(filename, /*count=*/false, NULL, /*size_mult=*/NULL); + if (ret < 0) + goto out_topology; + core_topology[idx].is_logical_core_online = ret; } snprintf(filename, sizeof(filename), "/sys/devices/system/cpu/cpu%d/topology/core_id", idx); - READ_FILE_BUFFER(filename, core_topology[idx].core_id, /*failure_label=*/out_topology); + ret = get_hw_resource(filename, /*count=*/false, NULL, /*size_mult=*/NULL); + if (ret < 0) + goto out_topology; + core_topology[idx].core_id = ret; snprintf(filename, sizeof(filename), - "/sys/devices/system/cpu/cpu%d/topology/core_siblings", idx); - READ_FILE_BUFFER(filename, core_topology[idx].core_siblings, - /*failure_label=*/out_topology); + "/sys/devices/system/cpu/cpu%d/topology/core_siblings_list", idx); + ret = get_hw_resource(filename, /*count=*/true, &core_topology[idx].core_siblings, + /*size_mult=*/NULL); + if (ret < 0) + goto out_topology; + + snprintf(filename, sizeof(filename), + "/sys/devices/system/cpu/cpu%d/topology/thread_siblings_list", idx); + ret = get_hw_resource(filename, /*count=*/true, &core_topology[idx].thread_siblings, + /*size_mult=*/NULL); + if (ret < 0) + goto out_topology; snprintf(filename, sizeof(filename), - "/sys/devices/system/cpu/cpu%d/topology/thread_siblings", idx); - READ_FILE_BUFFER(filename, core_topology[idx].thread_siblings, - /*failure_label=*/out_topology); + "/sys/devices/system/cpu/cpu%d/topology/physical_package_id", idx); + ret = get_hw_resource(filename, /*count=*/false, NULL, /*size_mult=*/NULL); + if (ret < 0) + goto out_topology; + core_topology[idx].cpu_socket = ret; + if (ret > current_max_socket) + current_max_socket = ret; ret = get_cache_topo_info(num_cache_lvl, idx, &core_topology[idx].cache); if (ret < 0) goto out_topology; } topo_info->core_topology = 
core_topology; + topo_info->num_sockets = current_max_socket + 1; + topo_info->physical_cores_per_socket = core_topology[0].core_siblings.resource_count / + core_topology[0].thread_siblings.resource_count; return 0; out_topology: free(core_topology); -out: return ret; } /* Get NUMA topology-related info */ static int get_numa_topo_info(PAL_TOPO_INFO* topo_info) { - int ret; - READ_FILE_BUFFER("/sys/devices/system/node/online", topo_info->online_nodes, - /*failure_label=*/out); - - int num_nodes = get_hw_resource("/sys/devices/system/node/online", /*count=*/true); - if (num_nodes < 0) - return num_nodes; - topo_info->num_online_nodes = num_nodes; + int ret = get_hw_resource("/sys/devices/system/node/online", /*count=*/true, + &topo_info->nodes, /*size_mult=*/NULL); + if (ret < 0) + return ret; + int num_nodes = topo_info->nodes.resource_count; PAL_NUMA_TOPO_INFO* numa_topology = (PAL_NUMA_TOPO_INFO*)malloc(num_nodes * sizeof(PAL_NUMA_TOPO_INFO)); if (!numa_topology) @@ -258,23 +371,28 @@ static int get_numa_topo_info(PAL_TOPO_INFO* topo_info) { char filename[128]; for (int idx = 0; idx < num_nodes; idx++) { - snprintf(filename, sizeof(filename), "/sys/devices/system/node/node%d/cpumap", idx); - READ_FILE_BUFFER(filename, numa_topology[idx].cpumap, /*failure_label=*/out_topology); + snprintf(filename, sizeof(filename), "/sys/devices/system/node/node%d/cpulist", idx); + ret = get_hw_resource(filename, /*count=*/true, &numa_topology[idx].cpumap, + /*size_mult=*/NULL); + if (ret < 0) + goto out_topology; snprintf(filename, sizeof(filename), "/sys/devices/system/node/node%d/distance", idx); - READ_FILE_BUFFER(filename, numa_topology[idx].distance, /*failure_label=*/out_topology); + ret = get_hw_resource(filename, /*count=*/true, &numa_topology[idx].distance, + /*size_mult=*/NULL); + if (ret < 0) + goto out_topology; /* Since our /sys fs doesn't support writes, set persistent hugepages to their default value * of zero */ - 
memcpy(numa_topology[idx].hugepages[HUGEPAGES_2M].nr_hugepages, "0\n", 3); - memcpy(numa_topology[idx].hugepages[HUGEPAGES_1G].nr_hugepages, "0\n", 3); + numa_topology[idx].nr_hugepages[HUGEPAGES_2M] = 0; + numa_topology[idx].nr_hugepages[HUGEPAGES_1G] = 0; } topo_info->numa_topology = numa_topology; return 0; out_topology: free(numa_topology); -out: return ret; } diff --git a/Pal/src/host/Linux/arch/x86_64/db_arch_misc.c b/Pal/src/host/Linux/arch/x86_64/db_arch_misc.c index 9dcc677728..8da55b22d8 100644 --- a/Pal/src/host/Linux/arch/x86_64/db_arch_misc.c +++ b/Pal/src/host/Linux/arch/x86_64/db_arch_misc.c @@ -107,65 +107,6 @@ int _DkGetCPUInfo(PAL_CPU_INFO* ci) { brand[BRAND_SIZE - 1] = '\0'; ci->cpu_brand = brand; - /* we cannot use CPUID(0xb) because it counts even disabled-by-BIOS cores (e.g. HT cores); - * instead extract info on total number of logical cores, number of physical cores, - * SMT support etc. by parsing sysfs pseudo-files */ - int online_logical_cores = get_hw_resource("/sys/devices/system/cpu/online", /*count=*/true); - if (online_logical_cores < 0) { - rv = unix_to_pal_error(online_logical_cores); - goto out_brand; - } - ci->online_logical_cores = online_logical_cores; - - int possible_logical_cores = get_hw_resource("/sys/devices/system/cpu/possible", - /*count=*/true); - if (possible_logical_cores < 0) { - rv = unix_to_pal_error(possible_logical_cores); - goto out_brand; - } - ci->possible_logical_cores = possible_logical_cores; - - /* TODO: correctly support offline cores */ - if (possible_logical_cores > 0 && possible_logical_cores > online_logical_cores) { - log_warning("some CPUs seem to be offline; Gramine doesn't take this into account which " - "may lead to subpar performance"); - } - - int core_siblings = get_hw_resource("/sys/devices/system/cpu/cpu0/topology/core_siblings_list", - /*count=*/true); - if (core_siblings < 0) { - rv = unix_to_pal_error(core_siblings); - goto out_brand; - } - - int smt_siblings = 
get_hw_resource("/sys/devices/system/cpu/cpu0/topology/thread_siblings_list", - /*count=*/true); - if (smt_siblings < 0) { - rv = unix_to_pal_error(smt_siblings); - goto out_brand; - } - ci->physical_cores_per_socket = core_siblings / smt_siblings; - - /* array of "logical core -> socket" mappings */ - int* cpu_socket = (int*)malloc(online_logical_cores * sizeof(int)); - if (!cpu_socket) { - rv = -PAL_ERROR_NOMEM; - goto out_brand; - } - - char filename[128]; - for (int idx = 0; idx < online_logical_cores; idx++) { - snprintf(filename, sizeof(filename), - "/sys/devices/system/cpu/cpu%d/topology/physical_package_id", idx); - cpu_socket[idx] = get_hw_resource(filename, /*count=*/false); - if (cpu_socket[idx] < 0) { - log_warning("Cannot read %s", filename); - rv = unix_to_pal_error(cpu_socket[idx]); - goto out_phy_id; - } - } - ci->cpu_socket = cpu_socket; - cpuid(1, 0, words); ci->cpu_family = BIT_EXTRACT_LE(words[CPUID_WORD_EAX], 8, 12); ci->cpu_model = BIT_EXTRACT_LE(words[CPUID_WORD_EAX], 4, 8); @@ -181,7 +122,7 @@ int _DkGetCPUInfo(PAL_CPU_INFO* ci) { char* flags = malloc(fmax); if (!flags) { rv = -PAL_ERROR_NOMEM; - goto out_phy_id; + goto out_brand; } for (int i = 0; i < 32; i++) { @@ -218,8 +159,6 @@ int _DkGetCPUInfo(PAL_CPU_INFO* ci) { return rv; out_flags: free(flags); -out_phy_id: - free(cpu_socket); out_brand: free(brand); out_vendor_id: diff --git a/common/include/api.h b/common/include/api.h index ef25a54886..50001b4e7c 100644 --- a/common/include/api.h +++ b/common/include/api.h @@ -81,6 +81,7 @@ typedef ptrdiff_t ssize_t; #define BITS_IN_BYTE 8 #define BITS_IN_TYPE(type) (sizeof(type) * BITS_IN_BYTE) +#define BITS_TO_INTS(nr) DIV_ROUND_UP(nr, BITS_IN_TYPE(int)) #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_IN_TYPE(long)) /* Note: This macro is not intended for use when nbits == BITS_IN_TYPE(type) */ #define SET_HIGHEST_N_BITS(type, nbits) (~(((uint64_t)1 << (BITS_IN_TYPE(type) - (nbits))) - 1))