Skip to content

Commit 404e220

Browse files
committed
use functions to avoid duplicated asm
This makes it clearer which registers each asm block clobbers, and it also removes one redundant move from rdpmc_pair.
1 parent 326c6a4 commit 404e220

File tree

1 file changed

+47
-63
lines changed

1 file changed

+47
-63
lines changed

measureme/src/counters.rs

Lines changed: 47 additions & 63 deletions
Original file line number | Diff line number | Diff line change
@@ -525,45 +525,22 @@ mod hw {
525525
/// the width of the register (32 to 64 bits, e.g. 48-bit seems common).
526526
#[inline(always)]
527527
fn rdpmc(reg_idx: u32) -> u64 {
528-
let (lo, hi): (u32, u32);
529-
unsafe {
530-
// NOTE(eddyb) below comment is outdated (the other branch uses `cpuid`).
531-
if cfg!(unserialized_rdpmc) && false {
532-
// FIXME(eddyb) the Intel and AMD manuals warn about the need for
533-
// "serializing instructions" before/after `rdpmc`, if avoiding any
534-
// reordering is desired, but do not agree on the full set of usable
535-
// "serializing instructions" (e.g. `mfence` isn't listed in both).
536-
//
537-
// The only usable, and guaranteed to work, "serializing instruction"
538-
// appears to be `cpuid`, but it doesn't seem easy to use, especially
539-
// due to the overlap in registers with `rdpmc` itself, and it might
540-
// have too high of a cost, compared to serialization benefits (if any).
541-
asm!("rdpmc", in("ecx") reg_idx, out("eax") lo, out("edx") hi, options(nostack));
542-
} else {
543-
asm!(
544-
// Dummy `cpuid(0)` to serialize instruction execution.
545-
"xor eax, eax",
546-
// LLVM sometimes reserves `ebx` for its internal use, we so we need to use
547-
// a scratch register for it instead.
548-
"mov {tmp_rbx:r}, rbx",
549-
"cpuid",
550-
"mov rbx, {tmp_rbx:r}",
551-
552-
"mov ecx, {rdpmc_ecx:e}",
553-
"rdpmc",
554-
rdpmc_ecx = in(reg) reg_idx,
555-
tmp_rbx = out(reg) _,
556-
out("eax") lo,
557-
out("edx") hi,
558-
559-
// `cpuid` clobbers (not overwritten by `rdpmc`).
560-
out("ecx") _,
561-
562-
options(nostack),
563-
);
564-
}
528+
// NOTE(eddyb) below comment is outdated (the other branch uses `cpuid`).
529+
if cfg!(unserialized_rdpmc) && false {
530+
// FIXME(eddyb) the Intel and AMD manuals warn about the need for
531+
// "serializing instructions" before/after `rdpmc`, if avoiding any
532+
// reordering is desired, but do not agree on the full set of usable
533+
// "serializing instructions" (e.g. `mfence` isn't listed in both).
534+
//
535+
// The only usable, and guaranteed to work, "serializing instruction"
536+
// appears to be `cpuid`, but it doesn't seem easy to use, especially
537+
// due to the overlap in registers with `rdpmc` itself, and it might
538+
// have too high of a cost, compared to serialization benefits (if any).
539+
unserialized_rdpmc(reg_idx)
540+
} else {
541+
serialize_instruction_execution();
542+
unserialized_rdpmc(reg_idx)
565543
}
566-
lo as u64 | (hi as u64) << 32
567544
}
568545

569546
/// Read two hardware performance counters at once (see `rdpmc`).
@@ -572,42 +549,49 @@ mod hw {
572549
/// only requires one "serializing instruction", rather than two.
573550
#[inline(always)]
574551
fn rdpmc_pair(a_reg_idx: u32, b_reg_idx: u32) -> (u64, u64) {
575-
let (a_lo, a_hi): (u32, u32);
576-
let (b_lo, b_hi): (u32, u32);
552+
serialize_instruction_execution();
553+
(unserialized_rdpmc(a_reg_idx), unserialized_rdpmc(b_reg_idx))
554+
}
555+
556+
/// Dummy `cpuid(0)` to serialize instruction execution.
557+
#[inline(always)]
558+
fn serialize_instruction_execution() {
577559
unsafe {
578560
asm!(
579-
// Dummy `cpuid(0)` to serialize instruction execution.
580561
"xor eax, eax",
581-
// LLVM sometimes reserves `ebx` for its internal use, we so we need to use
562+
// LLVM sometimes reserves `ebx` for its internal use, so we need to use
582563
// a scratch register for it instead.
583564
"mov {tmp_rbx:r}, rbx",
584565
"cpuid",
585566
"mov rbx, {tmp_rbx:r}",
586-
587-
"mov ecx, {a_rdpmc_ecx:e}",
588-
"rdpmc",
589-
"mov {a_rdpmc_eax:e}, eax",
590-
"mov {a_rdpmc_edx:e}, edx",
591-
"mov ecx, {b_rdpmc_ecx:e}",
592-
"rdpmc",
593-
a_rdpmc_ecx = in(reg) a_reg_idx,
594-
a_rdpmc_eax = out(reg) a_lo,
595-
a_rdpmc_edx = out(reg) a_hi,
596-
b_rdpmc_ecx = in(reg) b_reg_idx,
597-
tmp_rbx = out(reg) _,
598-
out("eax") b_lo,
599-
out("edx") b_hi,
600-
601-
// `cpuid` clobbers (not overwritten by `rdpmc`).
602-
out("ecx") _,
567+
tmp_rbx = lateout(reg) _,
568+
// `cpuid` clobbers.
569+
lateout("eax") _,
570+
lateout("edx") _,
571+
lateout("ecx") _,
603572

604573
options(nostack),
605574
);
606575
}
607-
(
608-
a_lo as u64 | (a_hi as u64) << 32,
609-
b_lo as u64 | (b_hi as u64) << 32,
610-
)
576+
}
577+
578+
/// Read the hardware performance counter indicated by `reg_idx`.
579+
///
580+
/// If the counter is signed, sign extension should be performed based on
581+
/// the width of the register (32 to 64 bits, e.g. 48-bit seems common).
582+
#[inline(always)]
583+
fn unserialized_rdpmc(reg_idx: u32) -> u64 {
584+
let (lo, hi): (u32, u32);
585+
unsafe {
586+
asm!(
587+
"rdpmc",
588+
in("ecx") reg_idx,
589+
lateout("eax") lo,
590+
lateout("edx") hi,
591+
options(nostack)
592+
);
593+
}
594+
lo as u64 | (hi as u64) << 32
611595
}
612596

613597
/// Categorization of `x86_64` CPUs, primarily based on how they

0 commit comments

Comments
 (0)