Skip to content

Commit

Permalink
Merge pull request snabbco#191 from vavrusa/linux-perf-open
Browse files Browse the repository at this point in the history
linux: performance monitoring API
  • Loading branch information
justincormack committed Apr 21, 2016
2 parents 0511fb8 + 93558c1 commit f245114
Show file tree
Hide file tree
Showing 12 changed files with 592 additions and 2 deletions.
5 changes: 5 additions & 0 deletions syscall/linux/c.lua
Original file line number Diff line number Diff line change
Expand Up @@ -696,6 +696,11 @@ if sys.bpf then
return syscall(sys.bpf, int(cmd), void(attr), u64(ffi.sizeof('union bpf_attr')))
end
end
-- Raw syscall wrapper for perf_event_open; it is only defined when the
-- syscall number table (`sys`) carries an entry for it.
-- Arguments are passed through this file's void/int/ulong converters before
-- being handed to `syscall`; the result of `syscall` is returned unchanged.
if sys.perf_event_open then
  function C.perf_event_open(attr, pid, cpu, group_fd, flags)
    local nr = sys.perf_event_open
    return syscall(
      nr,
      void(attr),
      int(pid),
      int(cpu),
      int(group_fd),
      ulong(flags)
    )
  end
end

-- socketcalls
if not sys.socketcall then
Expand Down
148 changes: 148 additions & 0 deletions syscall/linux/constants.lua
Original file line number Diff line number Diff line change
Expand Up @@ -2109,6 +2109,154 @@ c.BPF_PROG = strflag {
SCHED_ACT = 4,
}

-- Linux performance monitoring
-- Values for the `type` field of perf_event_attr.
c.PERF_TYPE = strflag({
  HARDWARE   = 0,
  SOFTWARE   = 1,
  TRACEPOINT = 2,
  HW_CACHE   = 3,
  RAW        = 4,
  BREAKPOINT = 5,
})

-- Event identifiers (the kernel header calls this "event_id"; the value is
-- stored in the config field of perf_event_attr).
-- Several independent numbering spaces share this one table, so values
-- repeat between the HW_*, HW_CACHE_* and SW_* groups.
c.PERF_COUNT = strflag({
  -- Generalized hardware events
  HW_CPU_CYCLES              = 0,
  HW_INSTRUCTIONS            = 1,
  HW_CACHE_REFERENCES        = 2,
  HW_CACHE_MISSES            = 3,
  HW_BRANCH_INSTRUCTIONS     = 4,
  HW_BRANCH_MISSES           = 5,
  HW_BUS_CYCLES              = 6,
  HW_STALLED_CYCLES_FRONTEND = 7,
  HW_STALLED_CYCLES_BACKEND  = 8,
  HW_REF_CPU_CYCLES          = 9,

  -- Generalized hardware cache events: cache id
  HW_CACHE_L1D  = 0,
  HW_CACHE_L1I  = 1,
  HW_CACHE_LL   = 2,
  HW_CACHE_DTLB = 3,
  HW_CACHE_ITLB = 4,
  HW_CACHE_BPU  = 5,
  HW_CACHE_NODE = 6,
  -- ... cache operation
  HW_CACHE_OP_READ     = 0,
  HW_CACHE_OP_WRITE    = 1,
  HW_CACHE_OP_PREFETCH = 2,
  -- ... cache operation result
  HW_CACHE_RESULT_ACCESS = 0,
  HW_CACHE_RESULT_MISS   = 1,

  -- "Software" events provided by the kernel
  SW_CPU_CLOCK        = 0,
  SW_TASK_CLOCK       = 1,
  SW_PAGE_FAULTS      = 2,
  SW_CONTEXT_SWITCHES = 3,
  SW_CPU_MIGRATIONS   = 4,
  SW_PAGE_FAULTS_MIN  = 5,
  SW_PAGE_FAULTS_MAJ  = 6,
  SW_ALIGNMENT_FAULTS = 7,
  SW_EMULATION_FAULTS = 8,
  SW_DUMMY            = 9,
  SW_BPF_OUTPUT       = 10,
})

-- Request bits for perf_event_attr.sample_type.
-- One bit per entry, bit 0 (IP) through bit 18 (REGS_INTR), spelled as
-- hexadecimal masks.
c.PERF_SAMPLE = multiflags({
  IP           = 0x00001,
  TID          = 0x00002,
  TIME         = 0x00004,
  ADDR         = 0x00008,
  READ         = 0x00010,
  CALLCHAIN    = 0x00020,
  ID           = 0x00040,
  CPU          = 0x00080,
  PERIOD       = 0x00100,
  STREAM_ID    = 0x00200,
  RAW          = 0x00400,
  BRANCH_STACK = 0x00800,
  REGS_USER    = 0x01000,
  STACK_USER   = 0x02000,
  WEIGHT       = 0x04000,
  DATA_SRC     = 0x08000,
  IDENTIFIER   = 0x10000,
  TRANSACTION  = 0x20000,
  REGS_INTR    = 0x40000,
})

-- Values for perf_event_attr.branch_sample_type (used when PERF_SAMPLE_BRANCH
-- is set). The table is first built with the bit positions (<NAME>_SHIFT);
-- the matching single-bit masks (<NAME>) are added right after.
c.PERF_SAMPLE_BRANCH = multiflags({
  USER_SHIFT       = 0,
  KERNEL_SHIFT     = 1,
  HV_SHIFT         = 2,
  ANY_SHIFT        = 3,
  ANY_CALL_SHIFT   = 4,
  ANY_RETURN_SHIFT = 5,
  IND_CALL_SHIFT   = 6,
  ABORT_TX_SHIFT   = 7,
  IN_TX_SHIFT      = 8,
  NO_TX_SHIFT      = 9,
  COND_SHIFT       = 10,
  CALL_STACK_SHIFT = 11,
  IND_JUMP_SHIFT   = 12,
  CALL_SHIFT       = 13,
  NO_FLAGS_SHIFT   = 14,
  NO_CYCLES_SHIFT  = 15,
})
-- Derive <NAME> = 1 << <NAME>_SHIFT for every branch kind, in shift order.
for _, kind in ipairs({
  "USER", "KERNEL", "HV", "ANY", "ANY_CALL", "ANY_RETURN", "IND_CALL",
  "ABORT_TX", "IN_TX", "NO_TX", "COND", "CALL_STACK", "IND_JUMP", "CALL",
  "NO_FLAGS", "NO_CYCLES",
}) do
  c.PERF_SAMPLE_BRANCH[kind] = bit.lshift(1, c.PERF_SAMPLE_BRANCH[kind .. "_SHIFT"])
end

-- Flag bits for perf_event_attr.read_format (bits 0..3).
c.PERF_READ_FORMAT = multiflags({
  TOTAL_TIME_ENABLED = 0x1,
  TOTAL_TIME_RUNNING = 0x2,
  ID                 = 0x4,
  GROUP              = 0x8,
})

-- Flag bits for the `flags` argument of perf_event_open (bits 0..3).
c.PERF_FLAG = multiflags({
  FD_NO_GROUP = 0x1,
  FD_OUTPUT   = 0x2,
  PID_CGROUP  = 0x4,
  FD_CLOEXEC  = 0x8,
})


-- PERF_RECORD_* identifiers.
-- When perf_event_attr.sample_id_all is set, every event type carries the
-- sample_type-selected fields that say where/when (identity) the event took
-- place: TID, TIME, ID, STREAM_ID, CPU, IDENTIFIER.
c.PERF_RECORD = strflag({
  MMAP            = 1,
  LOST            = 2,
  COMM            = 3,
  EXIT            = 4,
  THROTTLE        = 5,
  UNTHROTTLE      = 6,
  FORK            = 7,
  READ            = 8,
  SAMPLE          = 9,
  MMAP2           = 10,
  AUX             = 11,
  ITRACE_START    = 12,
  LOST_SAMPLES    = 13,
  SWITCH          = 14,
  SWITCH_CPU_WIDE = 15,
})

-- termios - c_cc characters
c.CC = strflag(arch.CC or {
VINTR = 0,
Expand Down
95 changes: 95 additions & 0 deletions syscall/linux/ffi.lua
Original file line number Diff line number Diff line change
Expand Up @@ -540,6 +540,101 @@ union bpf_attr {
uint32_t bpf_fd;
};
} __attribute__((aligned(8)));
/* Attribute block passed to perf_event_open (first syscall argument).
 * NOTE(review): the pe_ prefix on type/config/sample_type is presumably there
 * so a Lua metatype can expose the plain names with flag-string conversion
 * (callers in this commit assign .type = "tracepoint") -- confirm in the
 * types module. */
struct perf_event_attr {
uint32_t pe_type;
/* S.perf_event_open fills this with the struct size when it is left at 0 */
uint32_t size;
uint64_t pe_config;
union {
uint64_t sample_period;
uint64_t sample_freq;
};
uint64_t pe_sample_type;
uint64_t read_format;
/* The bitfields below add up to exactly 32 bits (25 one-bit flags,
 * a 2-bit precise_ip, plus 6 reserved bits). */
uint32_t disabled:1,
inherit:1,
pinned:1,
exclusive:1,
exclude_user:1,
exclude_kernel:1,
exclude_hv:1,
exclude_idle:1,
mmap:1,
comm:1,
freq:1,
inherit_stat:1,
enable_on_exec:1,
task:1,
watermark:1,
precise_ip:2,
mmap_data:1,
sample_id_all:1,
exclude_host:1,
exclude_guest:1,
exclude_callchain_kernel:1,
exclude_callchain_user:1,
mmap2:1,
comm_exec:1,
use_clockid:1,
__reserved_1a:6;
/* NOTE(review): presumably the unused upper half of a 64-bit flags word in
 * the kernel header -- confirm against linux/perf_event.h. */
uint32_t __reserved_1b;
union {
uint32_t wakeup_events;
uint32_t wakeup_watermark;
};
uint32_t bp_type;
union {
uint64_t bp_addr;
uint64_t config1;
};
union {
uint64_t bp_len;
uint64_t config2;
};
uint64_t branch_sample_type;
uint64_t sample_regs_user;
uint32_t sample_stack_user;
int32_t clockid;
uint64_t sample_regs_intr;
uint32_t aux_watermark;
uint32_t __reserved_2;
};
/* Layout of the perf metadata page followed by ring-buffer bookkeeping.
 * NOTE(review): presumably this is the first page of an mmap()ed perf event
 * fd -- confirm against perf_event_open(2). */
struct perf_event_mmap_page {
uint32_t version;
uint32_t compat_version;
uint32_t lock;
uint32_t index;
int64_t offset;
uint64_t time_enabled;
uint64_t time_running;
/* capability bits, readable either as one 64-bit word or individually */
union {
uint64_t capabilities;
struct {
uint32_t cap_bit0 : 1,
cap_bit0_is_deprecated : 1,
cap_user_rdpmc : 1,
cap_user_time : 1,
cap_user_time_zero : 1;
};
};
uint16_t pmc_width;
uint16_t time_shift;
uint32_t time_mult;
uint64_t time_offset;
/* 120 * 8 = 960 bytes of padding; with natural alignment the fields above
 * occupy 64 bytes, which places data_head at byte offset 1024.
 * NOTE(review): newer kernel headers carve named fields out of this
 * reserved area -- confirm before relying on it being unused. */
uint64_t __reserved[120];
/* head/tail/offset/size of the data area; declared volatile here */
volatile uint64_t data_head;
volatile uint64_t data_tail;
volatile uint64_t data_offset;
volatile uint64_t data_size;
/* head/tail/offset/size of the AUX area */
uint64_t aux_head;
uint64_t aux_tail;
uint64_t aux_offset;
uint64_t aux_size;
};
/* 8-byte record header: 32-bit type, 16-bit misc, 16-bit size.
 * NOTE(review): `type` presumably holds a PERF_RECORD_* value and `size` the
 * record length in bytes -- confirm against linux/perf_event.h. */
struct perf_event_header {
uint32_t type;
uint16_t misc;
uint16_t size;
};
struct mq_attr {
long mq_flags, mq_maxmsg, mq_msgsize, mq_curmsgs, __unused[4];
};
Expand Down
10 changes: 10 additions & 0 deletions syscall/linux/ioctl.lua
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,16 @@ local ioctl = strflag {
-- from linux/vfio.h type is ';' base is 100
VFIO_GET_API_VERSION = vfio('NONE', 0),
VFIO_CHECK_EXTENSION = vfio('WRITE', 1, "uint32"),
-- from linux/perf_event.h
PERF_EVENT_IOC_ENABLE = _IO('$', 0),
PERF_EVENT_IOC_DISABLE = _IO('$', 1),
PERF_EVENT_IOC_REFRESH = _IO('$', 2),
PERF_EVENT_IOC_RESET = _IO('$', 3),
PERF_EVENT_IOC_PERIOD = _IOW('$', 4, "uint64"),
PERF_EVENT_IOC_SET_OUTPUT= _IO('$', 5),
PERF_EVENT_IOC_SET_FILTER= _IOW('$', 6, "uintptr"),
PERF_EVENT_IOC_ID = _IOR('$', 7, "uint64_1"),
PERF_EVENT_IOC_SET_BPF = _IOW('$', 8, "uint32"),

-- allow user defined ioctls
_IO = _IO,
Expand Down
80 changes: 80 additions & 0 deletions syscall/linux/syscalls.lua
Original file line number Diff line number Diff line change
Expand Up @@ -835,6 +835,86 @@ if C.bpf then
end
end

-- Linux performance monitoring
if C.perf_event_open then
-- Open a perf event file descriptor (see man 2 perf_event_open).
-- @param attr perf_event_attr pointer/array; element 0 is used, and its `size`
--   field is filled in with the struct size when it was left at 0
-- @param pid defaults to 0
-- @param cpu defaults to -1
-- @param group_fd defaults to -1
-- @param flags looked up through c.PERF_FLAG; defaults to 0
-- @return fd, err
function S.perf_event_open(attr, pid, cpu, group_fd, flags)
  local pe = attr[0]
  if pe.size == 0 then
    pe.size = ffi.sizeof(pe)
  end
  pid = pid or 0
  cpu = cpu or -1
  group_fd = group_fd or -1
  local flagbits = c.PERF_FLAG[flags or 0]
  local fd = C.perf_event_open(attr, pid, cpu, group_fd, flagbits)
  if fd < 0 then
    return nil, t.error(errno())
  end
  return retfd(fd)
end
-- Read the numeric tracepoint id from a tracing event directory
-- (see "/sys/kernel/debug/tracing/available_events").
-- The id is read from the file "<event_path>/id".
-- @param event_path path to tracepoint (e.g. "/sys/kernel/debug/tracing/events/syscalls/sys_enter_write")
-- @return tp, err (e.g. 538, nil); on failure nil plus the error from
--   either the open or the read
function S.perf_tracepoint(event_path)
  local fd, err = S.open(event_path.."/id", c.O.RDONLY)
  if not fd then return nil, err end
  -- Assign to the existing `err` rather than declaring a new local, so that
  -- a failed read is reported to the caller instead of being dropped.
  local ret
  ret, err = fd:read(nil, 256)
  fd:close()
  local config = nil
  if ret then
    config = tonumber(ret)
  end
  return config, err
end
-- Attach or detach a probe, same semantics as Lua tables.
-- See https://www.kernel.org/doc/Documentation/trace/kprobetrace.txt
-- (When the definition is not nil, the probe is created, otherwise it is detached)
-- @param probe_type either "kprobe" or "uprobe", no other probe types are supported
-- @param name chosen probe name (e.g. "myprobe")
-- @param definition (set to nil to disable probe) (e.g. "do_sys_open $retval")
-- @param retval true/false if this should be entrypoint probe or return probe
-- @return when attaching: tp, err (e.g. 1099, nil);
--   when detaching: the result of the write, err
function S.perf_probe(probe_type, name, definition, retval)
  local event_path = string.format('/sys/kernel/debug/tracing/%s_events', probe_type)
  local probe_path = string.format('/sys/kernel/debug/tracing/events/%ss/%s', probe_type, name)
  -- Remember the requested operation up front: the command string built below
  -- is always non-nil, so it cannot be used to tell attach from detach.
  local attach = definition and true or false
  -- Check if probe already exists
  if attach and S.statfs(probe_path) then return nil, t.error(c.E.EEXIST) end
  local fd, err = S.open(event_path, "wronly, append")
  if not fd then return nil, err end
  -- Format the command written to the <probe_type>_events file
  local command
  if attach then
    command = string.format("%s:%s %s", retval and "r" or "p", name, definition)
  else
    command = "-:"..name -- Detach
  end
  local ok
  ok, err = fd:write(command)
  fd:close()
  -- Only a freshly attached probe has an id to look up; after a detach the
  -- probe directory is gone, so report the write result instead.
  if ok and attach then
    return S.perf_tracepoint(probe_path)
  end
  return ok, err
end
-- Open a perf event on a tracepoint and wrap the descriptor in a reader
-- (see "/sys/kernel/debug/tracing/available_events").
-- Defaults applied before `attrs`: type "tracepoint", config = tp,
-- sample_type "raw", sample_period 1, wakeup_events 1.
-- @param tp tracepoint identifier (e.g.: 538, use `S.perf_tracepoint()`)
-- @param pid passed through to `S.perf_event_open`
-- @param cpu passed through to `S.perf_event_open`
-- @param group_fd passed through to `S.perf_event_open`
-- @param attrs optional table of attribute overrides (e.g. {sample_type="raw, callchain"}, see `struct perf_event_attr`)
-- @return reader, err
function S.perf_attach_tracepoint(tp, pid, cpu, group_fd, attrs)
  local pe = t.perf_event_attr1()
  local attr = pe[0]
  attr.type = "tracepoint"
  attr.config = tp
  attr.sample_type = "raw"
  attr.sample_period = 1
  attr.wakeup_events = 1
  -- Caller-supplied attributes override the defaults above
  if attrs then
    for field, value in pairs(attrs) do
      attr[field] = value
    end
  end
  -- The event fd is always opened with the "fd_cloexec" flag
  local fd, err = S.perf_event_open(pe, pid, cpu, group_fd, "fd_cloexec")
  if fd then
    return t.perf_reader(fd)
  end
  return nil, err
end
end

return S

end
Expand Down
Loading

0 comments on commit f245114

Please sign in to comment.