From d5160d2744f65bd561305aca5b579ad03bd6bc3a Mon Sep 17 00:00:00 2001
From: Kajol Jain
Date: Fri, 19 Jul 2024 02:53:31 -0500
Subject: [PATCH] powerpc/vpa_pmu: Add interface to expose vpa counters via perf

The pseries SPLPAR machines can retrieve a log of dispatch and preempt
events from the hypervisor using data from the Dispatch Trace Log (DTL)
buffer. With this information, users can determine when and why each
dispatch and preempt occurred.

Add an interface to expose the Virtual Processor Area (VPA) DTL counters
via perf. The following events are available and exposed in sysfs:

vpa_dtl/dtl_cede/    - Trace voluntary (OS initiated) virtual processor waits
vpa_dtl/dtl_preempt/ - Trace time slice preempts
vpa_dtl/dtl_fault/   - Trace virtual partition memory page faults
vpa_dtl/dtl_all/     - Trace all of the above (dtl_cede/dtl_preempt/dtl_fault)

The added interface defines the supported event list and the config
fields for the event attributes, along with their corresponding bit
values, which are exported via sysfs. Users can access the events
exposed via vpa-pmu with the standard perf tool.

The VPA DTL counters do not interrupt on overflow, so the kernel needs
to poll them to avoid missing an overflow; hrtimer code is added to do
this polling. The timer interval is taken from the user-provided
sample_period field, interpreted in nanoseconds. There is one hrtimer
per CPU.
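For reference, opening one of these events from userspace could look
roughly like the sketch below (open_vpa_dtl() is a hypothetical helper
and the values are illustrative: the PMU type must be read from
/sys/bus/event_source/devices/vpa_dtl/type at runtime, the caller needs
CAP_PERFMON, and error handling is omitted):

	#include <linux/perf_event.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	/* open the vpa_dtl event on one cpu; pmu_type comes from sysfs */
	static int open_vpa_dtl(int pmu_type, int cpu)
	{
		struct perf_event_attr attr;

		memset(&attr, 0, sizeof(attr));
		attr.size = sizeof(attr);
		attr.type = pmu_type;
		attr.config = 4;		/* dtl_all */
		attr.sample_period = 1000000;	/* hrtimer poll interval, in ns */
		attr.sample_type = PERF_SAMPLE_RAW;

		/* cpu-scoped event: pid == -1, cpu >= 0 */
		return syscall(SYS_perf_event_open, &attr, -1, cpu, -1, 0);
	}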
Signed-off-by: Kajol Jain
---
 arch/powerpc/perf/Makefile  |   2 +-
 arch/powerpc/perf/vpa-pmu.c | 418 ++++++++++++++++++++++++++++++++++++
 include/linux/cpuhotplug.h  |   1 +
 3 files changed, 420 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/perf/vpa-pmu.c

diff --git a/arch/powerpc/perf/Makefile b/arch/powerpc/perf/Makefile
index 4f53d0b97539b..7118b5c0c6419 100644
--- a/arch/powerpc/perf/Makefile
+++ b/arch/powerpc/perf/Makefile
@@ -14,7 +14,7 @@ obj-$(CONFIG_PPC_POWERNV) += imc-pmu.o
 obj-$(CONFIG_FSL_EMB_PERF_EVENT) += core-fsl-emb.o
 obj-$(CONFIG_FSL_EMB_PERF_EVENT_E500) += e500-pmu.o e6500-pmu.o
 
-obj-$(CONFIG_HV_PERF_CTRS) += hv-24x7.o hv-gpci.o hv-common.o
+obj-$(CONFIG_HV_PERF_CTRS) += hv-24x7.o hv-gpci.o hv-common.o vpa-pmu.o
 
 obj-$(CONFIG_PPC_8xx) += 8xx-pmu.o
diff --git a/arch/powerpc/perf/vpa-pmu.c b/arch/powerpc/perf/vpa-pmu.c
new file mode 100644
index 0000000000000..e5a027f4313b2
--- /dev/null
+++ b/arch/powerpc/perf/vpa-pmu.c
@@ -0,0 +1,418 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Perf interface to expose Virtual Processor Area (VPA) counters.
+ *
+ * Copyright (C) 2024 Kajol Jain, IBM Corporation
+ */
+
+#define pr_fmt(fmt) "vpa-pmu: " fmt
+
+#include <linux/perf_event.h>
+#include <asm/dtl.h>
+#include <asm/plpar_wrappers.h>
+
+#define EVENT(_name, _code) enum { _name = _code }
+
+/*
+ * Dispatch Trace Log event codes.
+ */
+EVENT(DTL_CEDE, 0x1);
+EVENT(DTL_PREEMPT, 0x2);
+EVENT(DTL_FAULT, 0x3);
+EVENT(DTL_ALL, 0x4);
+
+GENERIC_EVENT_ATTR(dtl_cede, DTL_CEDE);
+GENERIC_EVENT_ATTR(dtl_preempt, DTL_PREEMPT);
+GENERIC_EVENT_ATTR(dtl_fault, DTL_FAULT);
+GENERIC_EVENT_ATTR(dtl_all, DTL_ALL);
+
+PMU_FORMAT_ATTR(event, "config:0-7");
+
+static struct attribute *events_attr[] = {
+	GENERIC_EVENT_PTR(DTL_CEDE),
+	GENERIC_EVENT_PTR(DTL_PREEMPT),
+	GENERIC_EVENT_PTR(DTL_FAULT),
+	GENERIC_EVENT_PTR(DTL_ALL),
+	NULL
+};
+
+static struct attribute_group event_group = {
+	.name = "events",
+	.attrs = events_attr,
+};
+
+static struct attribute *format_attrs[] = {
+	&format_attr_event.attr,
+	NULL,
+};
+
+static const struct attribute_group format_group = {
+	.name = "format",
+	.attrs = format_attrs,
+};
+
+static const struct attribute_group *attr_groups[] = {
+	&format_group,
+	&event_group,
+	NULL,
+};
+
+/*
+ * Dispatch trace log event enable mask, defined in asm/dtl.h:
+ *   DTL_LOG_CEDE: voluntary virtual processor waits
+ *   DTL_LOG_PREEMPT: time-slice preempts
+ *   DTL_LOG_FAULT: virtual partition memory page faults
+ *   DTL_LOG_ALL: (DTL_LOG_CEDE | DTL_LOG_PREEMPT | DTL_LOG_FAULT)
+ */
+static const u8 vpa_dtl_enable_mask[] = {
+	[DTL_CEDE] = DTL_LOG_CEDE,
+	[DTL_PREEMPT] = DTL_LOG_PREEMPT,
+	[DTL_FAULT] = DTL_LOG_FAULT,
+	[DTL_ALL] = DTL_LOG_ALL,
+};
+
+struct vpa_dtl {
+	struct dtl_entry *buf;
+	u64 last_idx;
+	bool active_lock;
+};
+
+static DEFINE_PER_CPU(struct vpa_dtl, vpa_cpu_dtl);
+
+/* reference count for the active dtl events across all cpus */
+static int dtl_global_refc;
+static spinlock_t dtl_global_lock = __SPIN_LOCK_UNLOCKED(dtl_global_lock);
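+
+/*
+ * Layout of the raw sample emitted by vpa_dtl_dump_sample_data() below.
+ * The scalar header fields are written in native endianness; the dtl
+ * entries themselves are copied verbatim from the hypervisor buffer:
+ *
+ *   int              version           - raw data format version (1)
+ *   unsigned long    tb_ticks_per_sec  - timebase ticks per second, used
+ *                                        to convert timestamps to seconds
+ *   long             n_req             - number of dtl entries that follow
+ *   struct dtl_entry entries[n_req]    - the copied dispatch trace log
+ */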
+
+/*
+ * Parse the dispatch trace log data and prepare the perf raw sample.
+ */
+static void vpa_dtl_dump_sample_data(struct perf_event *event)
+{
+	struct perf_sample_data data;
+	struct perf_raw_record raw;
+	struct pt_regs regs;
+	u64 cur_idx, last_idx, i;
+	char *buf;
+
+	/* actual number of entries read */
+	long n_read = 0, read_size = 0;
+
+	/* number of entries added to dtl buffer */
+	long n_req;
+
+	struct vpa_dtl *dtl = &per_cpu(vpa_cpu_dtl, event->cpu);
+	int version = 1;
+
+	/* Setup perf sample */
+	perf_sample_data_init(&data, 0, event->hw.last_period);
+	memset(&regs, 0, sizeof(regs));
+	memset(&raw, 0, sizeof(raw));
+
+	cur_idx = be64_to_cpu(lppaca_of(event->cpu).dtl_idx);
+	last_idx = dtl->last_idx;
+
+	if (last_idx + N_DISPATCH_LOG <= cur_idx)
+		last_idx = cur_idx - N_DISPATCH_LOG + 1;
+
+	n_req = cur_idx - last_idx;
+
+	/* no new entry added to the buffer, return */
+	if (n_req <= 0)
+		return;
+
+	dtl->last_idx = last_idx + n_req;
+
+	/*
+	 * This may run from the hrtimer handler (hardirq context),
+	 * so the allocation must not sleep.
+	 */
+	buf = kzalloc((n_req * sizeof(struct dtl_entry)) + sizeof(version) +
+			sizeof(tb_ticks_per_sec) + sizeof(n_req), GFP_ATOMIC);
+	if (!buf)
+		return;
+	raw.frag.data = buf;
+
+	/* Save current version of dtl sampling support */
+	memcpy(buf, &version, sizeof(version));
+	buf += sizeof(version);
+
+	/* Save tb_ticks_per_sec to convert timebase to sec */
+	memcpy(buf, &tb_ticks_per_sec, sizeof(tb_ticks_per_sec));
+	buf += sizeof(tb_ticks_per_sec);
+
+	/* Save total number of dtl entries added to the dtl buffer */
+	memcpy(buf, &n_req, sizeof(n_req));
+	buf += sizeof(n_req);
+
+	i = last_idx % N_DISPATCH_LOG;
+
+	/* read the tail of the buffer if we've wrapped */
+	if (i + n_req > N_DISPATCH_LOG) {
+		read_size = N_DISPATCH_LOG - i;
+		memcpy(buf, &dtl->buf[i], read_size * sizeof(struct dtl_entry));
+		i = 0;
+		n_req -= read_size;
+		n_read += read_size;
+		buf += read_size * sizeof(struct dtl_entry);
+	}
+
+	/* ... and now the head */
+	memcpy(buf, &dtl->buf[i], n_req * sizeof(struct dtl_entry));
+	n_read += n_req;
+
+	raw.frag.size = n_read * sizeof(struct dtl_entry) + sizeof(version) +
+			sizeof(tb_ticks_per_sec) + sizeof(n_req);
+
+	perf_sample_save_raw_data(&data, &raw);
+	perf_event_overflow(event, &data, &regs);
+
+	/* the raw data has been copied out to the ring buffer; free our copy */
+	kfree(raw.frag.data);
+}
+
+/*
+ * The VPA Dispatch Trace Log counters do not interrupt on overflow.
+ * Therefore, the kernel needs to poll the counters with an hrtimer to
+ * avoid missing an overflow. The timer interval is taken from the
+ * user-provided sample_period, interpreted in nanoseconds, with a
+ * floor of 10000 ns.
+ */
+static enum hrtimer_restart vpa_dtl_hrtimer_handle(struct hrtimer *hrtimer)
+{
+	struct perf_event *event;
+	u64 period;
+
+	event = container_of(hrtimer, struct perf_event, hw.hrtimer);
+
+	if (event->state != PERF_EVENT_STATE_ACTIVE)
+		return HRTIMER_NORESTART;
+
+	vpa_dtl_dump_sample_data(event);
+	period = max_t(u64, 10000, event->hw.sample_period);
+	hrtimer_forward_now(hrtimer, ns_to_ktime(period));
+
+	return HRTIMER_RESTART;
+}
+
+static void vpa_dtl_start_hrtimer(struct perf_event *event)
+{
+	u64 period;
+	struct hw_perf_event *hwc = &event->hw;
+
+	period = max_t(u64, 10000, hwc->sample_period);
+	hrtimer_start(&hwc->hrtimer, ns_to_ktime(period), HRTIMER_MODE_REL_PINNED);
+}
+
+static void vpa_dtl_stop_hrtimer(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	hrtimer_cancel(&hwc->hrtimer);
+}
+
+static void vpa_dtl_reset_global_refc(struct perf_event *event)
+{
+	spin_lock(&dtl_global_lock);
+	/* guard against dropping a reference already released by cpu offline */
+	if (dtl_global_refc > 0 && --dtl_global_refc == 0)
+		write_unlock(&dtl_access_lock);
+	spin_unlock(&dtl_global_lock);
+}
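+
+/*
+ * A single DTL consumer is allowed system-wide: the first event takes
+ * dtl_access_lock (shared with the other in-kernel dtl users) via
+ * write_trylock(), and dtl_global_refc counts the events holding it so
+ * the lock is dropped only when the last reference goes away.
+ */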
+static int vpa_dtl_event_init(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct vpa_dtl *dtl;
+
+	/* test the event attr type for PMU enumeration */
+	if (event->attr.type != event->pmu->type)
+		return -ENOENT;
+
+	if (!perfmon_capable())
+		return -EACCES;
+
+	/* only cpu-scoped events are supported, see task_ctx_nr below */
+	if (event->cpu < 0)
+		return -EINVAL;
+
+	/* Return if this is a counting event */
+	if (!is_sampling_event(event))
+		return -EOPNOTSUPP;
+
+	/* the dtl data is exposed only as a raw sample */
+	if (!(event->attr.sample_type & PERF_SAMPLE_RAW))
+		return -EOPNOTSUPP;
+
+	/* invalid event code */
+	if (!event->attr.config || event->attr.config > DTL_ALL)
+		return -EINVAL;
+
+	/* ensure there are no other conflicting dtl users */
+	spin_lock(&dtl_global_lock);
+	if (dtl_global_refc == 0 && !write_trylock(&dtl_access_lock)) {
+		spin_unlock(&dtl_global_lock);
+		return -EBUSY;
+	}
+	dtl = &per_cpu(vpa_cpu_dtl, event->cpu);
+	dtl_global_refc++;
+	dtl->active_lock = true;
+	spin_unlock(&dtl_global_lock);
+
+	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	hwc->hrtimer.function = vpa_dtl_hrtimer_handle;
+
+	event->destroy = vpa_dtl_reset_global_refc;
+	return 0;
+}
+
+static int vpa_dtl_event_add(struct perf_event *event, int flags)
+{
+	int ret, hwcpu;
+	unsigned long addr;
+	struct vpa_dtl *dtl = &per_cpu(vpa_cpu_dtl, event->cpu);
+
+	/*
+	 * Register our dtl buffer with the hypervisor. The
+	 * HV expects the buffer size to be passed in the second
+	 * word of the buffer.
+	 */
+	((u32 *)dtl->buf)[1] = cpu_to_be32(DISPATCH_LOG_BYTES);
+	dtl->last_idx = 0;
+
+	hwcpu = get_hard_smp_processor_id(event->cpu);
+	addr = __pa(dtl->buf);
+
+	ret = register_dtl(hwcpu, addr);
+	if (ret) {
+		pr_warn("DTL registration for cpu %d (hw %d) failed with %d\n",
+			event->cpu, hwcpu, ret);
+		return ret;
+	}
+
+	/* set our initial buffer indices */
+	lppaca_of(event->cpu).dtl_idx = 0;
+
+	/*
+	 * Ensure that our updates to the lppaca fields have
+	 * occurred before we actually enable the logging
+	 */
+	smp_wmb();
+
+	/* enable event logging */
+	lppaca_of(event->cpu).dtl_enable_mask = vpa_dtl_enable_mask[event->attr.config];
+
+	vpa_dtl_start_hrtimer(event);
+
+	return 0;
+}
+
+static void vpa_dtl_event_del(struct perf_event *event, int flags)
+{
+	int hwcpu = get_hard_smp_processor_id(event->cpu);
+
+	/* stop the timer first so no handler races with the teardown */
+	vpa_dtl_stop_hrtimer(event);
+
+	/* flush whatever is still pending in the dtl buffer */
+	vpa_dtl_dump_sample_data(event);
+
+	unregister_dtl(hwcpu);
+	lppaca_of(event->cpu).dtl_enable_mask = 0x0;
+}
+
+static void vpa_dtl_event_read(struct perf_event *event)
+{
+	/*
+	 * This function is intentionally empty: vpa_dtl_dump_sample_data()
+	 * parses the dispatch trace log data and emits it as a perf raw
+	 * sample instead.
+	 */
+}
+
+/* Allocate dtl buffer memory for given cpu. */
+static int vpa_dtl_mem_alloc(int cpu)
+{
+	struct vpa_dtl *dtl = &per_cpu(vpa_cpu_dtl, cpu);
+	struct dtl_entry *buf = NULL;
+
+	dtl->active_lock = false;
+
+	/* Check for dispatch trace log buffer cache */
+	if (!dtl_cache)
+		return -ENOMEM;
+
+	/* reuse the buffer from a previous online/offline cycle, if any */
+	if (dtl->buf)
+		return 0;
+
+	buf = kmem_cache_alloc_node(dtl_cache, GFP_KERNEL, cpu_to_node(cpu));
+	if (!buf) {
+		pr_warn("buffer allocation failed for cpu %d\n", cpu);
+		return -ENOMEM;
+	}
+	dtl->buf = buf;
+	return 0;
+}
+
+static int vpa_dtl_cpu_online(unsigned int cpu)
+{
+	return vpa_dtl_mem_alloc(cpu);
+}
+
+static int vpa_dtl_cpu_offline(unsigned int cpu)
+{
+	struct vpa_dtl *dtl = &per_cpu(vpa_cpu_dtl, cpu);
+
+	/*
+	 * Drop the reference held by an event running on this cpu, and
+	 * release dtl_access_lock only if that was the last reference,
+	 * so an unheld lock is never unlocked.
+	 */
+	spin_lock(&dtl_global_lock);
+	if (dtl->active_lock) {
+		dtl->active_lock = false;
+		if (dtl_global_refc > 0 && --dtl_global_refc == 0)
+			write_unlock(&dtl_access_lock);
+	}
+	spin_unlock(&dtl_global_lock);
+	return 0;
+}
+
+static int vpa_dtl_cpu_hotplug_init(void)
+{
+	return cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_VPA_DTL_ONLINE,
+				 "perf/powerpc/vpa_pmu:online",
+				 vpa_dtl_cpu_online,
+				 vpa_dtl_cpu_offline);
+}
+
+static void vpa_dtl_clear_memory(void)
+{
+	int i;
+
+	for_each_online_cpu(i) {
+		struct vpa_dtl *dtl = &per_cpu(vpa_cpu_dtl, i);
+
+		/* cpus that failed allocation may not have a buffer */
+		if (!dtl->buf)
+			continue;
+
+		kmem_cache_free(dtl_cache, dtl->buf);
+		dtl->buf = NULL;
+	}
+}
+
+static struct pmu vpa_dtl_pmu = {
+	.task_ctx_nr = perf_invalid_context,
+
+	.name = "vpa_dtl",
+	.attr_groups = attr_groups,
+	.event_init = vpa_dtl_event_init,
+	.add = vpa_dtl_event_add,
+	.del = vpa_dtl_event_del,
+	.read = vpa_dtl_event_read,
+	.capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_EXCLUSIVE,
+};
+
+static int vpa_pmu_init(void)
+{
+	int r;
+
+	if (!firmware_has_feature(FW_FEATURE_SPLPAR)) {
+		pr_debug("not a shared virtualized system, not enabling\n");
+		return -ENODEV;
+	}
+
+	/* init cpuhotplug */
+	r = vpa_dtl_cpu_hotplug_init();
+	if (r) {
+		vpa_dtl_clear_memory();
+		return r;
+	}
+
+	r = perf_pmu_register(&vpa_dtl_pmu, vpa_dtl_pmu.name, -1);
+	if (r) {
+		/* unwind the hotplug state and free the per-cpu buffers */
+		cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_VPA_DTL_ONLINE);
+		vpa_dtl_clear_memory();
+		return r;
+	}
+
+	return 0;
+}
+
+device_initcall(vpa_pmu_init);
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 7a5785f405b62..c335af5a5a8ee 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -232,6 +232,7 @@ enum cpuhp_state {
 	CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE,
 	CPUHP_AP_PERF_POWERPC_HV_24x7_ONLINE,
 	CPUHP_AP_PERF_POWERPC_HV_GPCI_ONLINE,
+	CPUHP_AP_PERF_POWERPC_VPA_DTL_ONLINE,
 	CPUHP_AP_PERF_CSKY_ONLINE,
 	CPUHP_AP_TMIGR_ONLINE,
 	CPUHP_AP_WATCHDOG_ONLINE,
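
As a note for consumers: decoding the PERF_SAMPLE_RAW payload could look
roughly like the sketch below (decode_vpa_dtl_header() is a hypothetical
helper; it assumes the raw data has already been read out of the perf
mmap ring buffer, and struct dtl_entry comes from
arch/powerpc/include/asm/dtl.h):

	#include <stdint.h>
	#include <string.h>

	/* mirrors the header written by vpa_dtl_dump_sample_data() */
	static const void *decode_vpa_dtl_header(const char *raw, long *n_req,
						 uint64_t *tb_ticks_per_sec)
	{
		int version;

		memcpy(&version, raw, sizeof(version));
		raw += sizeof(version);
		memcpy(tb_ticks_per_sec, raw, sizeof(*tb_ticks_per_sec));
		raw += sizeof(*tb_ticks_per_sec);
		memcpy(n_req, raw, sizeof(*n_req));
		raw += sizeof(*n_req);

		if (version != 1)
			return NULL;	/* unknown layout */

		/* *n_req struct dtl_entry records start here */
		return raw;
	}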