Windows: Faster `getenvW` and a standalone environment variable test #23272

squeek502 · 2025-03-17T01:02:33Z

Inspired by #23265, I thought I'd try applying the same strategy to the Windows implementation. The strategy itself didn't end up having the same benefit, but optimizations were still found while exploring it.

Also adds a standalone test to make sure the functionality remains the same.

Both sliceTo and indexOfScalarPos use SIMD when available to speed up the search. On my x86_64 machine, this leads to getenvW being around 2-3x faster overall.

Additionally, any future improvements to sliceTo/indexOfScalarPos will benefit getenvW.

Benchmark code

const std = @import("std");

/// Benchmarks `std.process.getEnvVarOwned` lookups against the environment of
/// the current process.
///
/// The first command-line argument selects the scenario (`all`, `found`,
/// `short`, `long`, `random` or `empty`); it defaults to `all` when omitted.
/// An unrecognized scenario name prints a message and exits with status 1.
/// Each selected scenario performs `num_iterations` lookups and prints the
/// mean time per lookup via `std.debug.print`.
///
/// Every allocation comes from a single arena that is released on return, so
/// the looked-up values are deliberately never freed one by one.
pub fn main() !void {
    var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer arena.deinit();
    const allocator = arena.allocator();

    const Bench = enum {
        all,
        found,
        short,
        long,
        random,
        empty,
    };

    var args = try std.process.argsWithAllocator(allocator);
    defer args.deinit();

    // Skip the executable name.
    _ = args.next();
    const bench: Bench = bench: {
        const str = args.next() orelse break :bench .all;
        break :bench std.meta.stringToEnum(Bench, str) orelse {
            std.debug.print("bench not recognized: {s}\n", .{str});
            std.process.exit(1);
        };
    };

    const num_iterations = 1000000;

    var env_map = try std.process.getEnvMap(allocator);
    defer env_map.deinit();

    // Snapshot every variable name so that a random one can be picked by index,
    // and remember the longest name to size the scratch buffer below.
    var names: std.ArrayListUnmanaged([]const u8) = try .initCapacity(allocator, env_map.count());
    defer names.deinit(allocator);
    var longest_name: usize = 0;
    var it = env_map.iterator();
    while (it.next()) |entry| {
        if (entry.key_ptr.*.len > longest_name) longest_name = entry.key_ptr.*.len;
        names.appendAssumeCapacity(entry.key_ptr.*);
    }

    // The `found`, `short` and `long` scenarios sample an existing name with
    // `uintLessThan`, which requires a non-zero upper bound. Report an empty
    // environment explicitly instead of tripping that requirement.
    const samples_names = switch (bench) {
        .all, .found, .short, .long => true,
        .random, .empty => false,
    };
    if (samples_names and names.items.len == 0) {
        std.debug.print("no environment variables to sample names from\n", .{});
        std.process.exit(1);
    }

    // Scratch buffer for modified names; the extra byte leaves room for the
    // character appended by the `long` scenario.
    var name_mod_buf: std.ArrayListUnmanaged(u8) = try .initCapacity(allocator, longest_name + 1);
    defer name_mod_buf.deinit(allocator);

    var timer = try std.time.Timer.start();
    // Fixed seed so that every run samples the same sequence.
    var prng = std.Random.DefaultPrng.init(0);
    const rand = prng.random();

    // Every lookup uses the name of an existing variable.
    if (bench == .found or bench == .all) {
        const elapsed = elapsed: {
            timer.reset();
            for (0..num_iterations) |_| {
                const name = names.items[rand.uintLessThan(usize, names.items.len)];
                const value = try std.process.getEnvVarOwned(allocator, name);
                std.mem.doNotOptimizeAway(&value);
            }
            break :elapsed timer.read();
        };
        std.debug.print("found: {}/lookup\n", .{std.fmt.fmtDuration(elapsed / num_iterations)});
    }

    // Every lookup uses an existing name with one extra ASCII character appended.
    if (bench == .long or bench == .all) {
        const elapsed = elapsed: {
            timer.reset();
            for (0..num_iterations) |_| {
                const name = names.items[rand.uintLessThan(usize, names.items.len)];
                name_mod_buf.clearRetainingCapacity();
                name_mod_buf.appendSliceAssumeCapacity(name);
                // Append a random ascii character
                name_mod_buf.appendAssumeCapacity(rand.int(u7));
                const value = std.process.getEnvVarOwned(allocator, name_mod_buf.items) catch |err| switch (err) {
                    error.EnvironmentVariableNotFound => continue,
                    // Appending an ASCII byte to a valid name keeps it valid.
                    error.InvalidWtf8 => unreachable,
                    error.OutOfMemory => |e| return e,
                };
                std.mem.doNotOptimizeAway(&value);
            }
            break :elapsed timer.read();
        };
        std.debug.print("one char too long: {}/lookup\n", .{std.fmt.fmtDuration(elapsed / num_iterations)});
    }

    // Every lookup uses an existing name with its last byte removed.
    if (bench == .short or bench == .all) {
        const elapsed = elapsed: {
            timer.reset();
            for (0..num_iterations) |_| {
                const name = names.items[rand.uintLessThan(usize, names.items.len)];
                // Saturating subtraction so a zero-length name cannot underflow.
                const name_trunc = name[0 .. name.len -| 1];
                const value = std.process.getEnvVarOwned(allocator, name_trunc) catch |err| switch (err) {
                    // Dropping a single byte can split a multi-byte sequence
                    // when the name ends in a non-ASCII character, so an
                    // invalid name is an expected miss here, not unreachable.
                    error.EnvironmentVariableNotFound, error.InvalidWtf8 => continue,
                    error.OutOfMemory => |e| return e,
                };
                std.mem.doNotOptimizeAway(&value);
            }
            break :elapsed timer.read();
        };
        std.debug.print("one char too short: {}/lookup\n", .{std.fmt.fmtDuration(elapsed / num_iterations)});
    }

    // Every lookup uses a random ASCII string of random length.
    if (bench == .random or bench == .all) {
        const elapsed = elapsed: {
            timer.reset();
            for (0..num_iterations) |_| {
                // `len` never exceeds `longest_name`, which is within the
                // capacity reserved for `name_mod_buf`.
                const len = rand.uintAtMost(usize, longest_name);
                name_mod_buf.items.len = len;
                for (name_mod_buf.items) |*c| {
                    c.* = rand.int(u7);
                }
                const value = std.process.getEnvVarOwned(allocator, name_mod_buf.items) catch |err| switch (err) {
                    error.EnvironmentVariableNotFound => continue,
                    // Only 7-bit values are generated, which are always valid.
                    error.InvalidWtf8 => unreachable,
                    error.OutOfMemory => |e| return e,
                };
                std.mem.doNotOptimizeAway(&value);
            }
            break :elapsed timer.read();
        };
        std.debug.print("random ascii: {}/lookup\n", .{std.fmt.fmtDuration(elapsed / num_iterations)});
    }

    // Every lookup uses a zero-length name.
    if (bench == .empty or bench == .all) {
        const elapsed = elapsed: {
            timer.reset();
            for (0..num_iterations) |_| {
                const value = std.process.getEnvVarOwned(allocator, "") catch |err| switch (err) {
                    error.EnvironmentVariableNotFound => continue,
                    error.InvalidWtf8 => unreachable,
                    error.OutOfMemory => |e| return e,
                };
                std.mem.doNotOptimizeAway(&value);
            }
            break :elapsed timer.read();
        };
        std.debug.print("empty name: {}/lookup\n", .{std.fmt.fmtDuration(elapsed / num_iterations)});
    }
}

// all environment variable lookups are found
  'benchenv.exe found' ran
    2.50 ± 0.62 times faster than 'benchenv-master.exe found'

// all environment variable lookups have their name truncated by one byte
  'benchenv.exe short' ran
    2.85 ± 0.44 times faster than 'benchenv-master.exe short'

// all environment variable lookups have an extra ASCII character added to the name
  'benchenv.exe long' ran
    2.59 ± 0.54 times faster than 'benchenv-master.exe long'

// all environment variable lookups are random strings of ASCII characters
  'benchenv.exe random' ran
    3.13 ± 0.58 times faster than 'benchenv-master.exe random'

// looking up a zero-length string as the name
  'benchenv.exe empty' ran
    3.26 ± 0.42 times faster than 'benchenv-master.exe empty'

(presumably, the magnitude of any speedup would also (on average) increase as the number of environment variables in the environment increases)

Initially, I tried using the same early return loop strategy as #23265, but later realized that the switch to using sliceTo to find the NUL terminator was accounting for pretty much all of the speed gains. The early return loop version can be found here:

squeek502@573e707

The version currently in this PR is slightly faster and easier to understand IMO, so that's what I went with.

benchmark results comparing the implementations

benchenv-sliceto-indexof.exe is the implementation in this PR currently, benchenv-loop.exe is squeek502@573e707

Benchmark 1: benchenv-master.exe found
  Time (mean ± σ):      1.390 s ±  0.338 s    [User: 1.364 s, System: 0.028 s]
  Range (min … max):    0.912 s …  1.797 s    10 runs

Benchmark 2: benchenv-loop.exe found
  Time (mean ± σ):     635.9 ms ±  22.0 ms    [User: 614.1 ms, System: 24.7 ms]
  Range (min … max):   608.2 ms … 669.3 ms    10 runs

Benchmark 3: benchenv-sliceto-indexof.exe found
  Time (mean ± σ):     556.1 ms ±  23.0 ms    [User: 534.4 ms, System: 18.4 ms]
  Range (min … max):   522.0 ms … 582.0 ms    10 runs

Summary
  'benchenv-sliceto-indexof.exe found' ran
    1.14 ± 0.06 times faster than 'benchenv-loop.exe found'
    2.50 ± 0.62 times faster than 'benchenv-master.exe found'



Benchmark 1: benchenv-master.exe short
  Time (mean ± σ):      2.352 s ±  0.361 s    [User: 2.333 s, System: 0.008 s]
  Range (min … max):    1.639 s …  2.784 s    10 runs

Benchmark 2: benchenv-loop.exe short
  Time (mean ± σ):     930.4 ms ±  33.4 ms    [User: 918.4 ms, System: 9.1 ms]
  Range (min … max):   888.9 ms … 981.1 ms    10 runs

Benchmark 3: benchenv-sliceto-indexof.exe short
  Time (mean ± σ):     826.5 ms ±  23.7 ms    [User: 820.0 ms, System: 10.6 ms]
  Range (min … max):   784.0 ms … 857.3 ms    10 runs

Summary
  'benchenv-sliceto-indexof.exe short' ran
    1.13 ± 0.05 times faster than 'benchenv-loop.exe short'
    2.85 ± 0.44 times faster than 'benchenv-master.exe short'



Benchmark 1: benchenv-master.exe long
  Time (mean ± σ):      2.179 s ±  0.446 s    [User: 2.158 s, System: 0.008 s]
  Range (min … max):    1.652 s …  2.762 s    10 runs

Benchmark 2: benchenv-loop.exe long
  Time (mean ± σ):     950.2 ms ±  24.2 ms    [User: 936.6 ms, System: 9.7 ms]
  Range (min … max):   913.0 ms … 981.4 ms    10 runs

Benchmark 3: benchenv-sliceto-indexof.exe long
  Time (mean ± σ):     839.9 ms ±  29.1 ms    [User: 830.3 ms, System: 6.2 ms]
  Range (min … max):   779.9 ms … 879.0 ms    10 runs

Summary
  'benchenv-sliceto-indexof.exe long' ran
    1.13 ± 0.05 times faster than 'benchenv-loop.exe long'
    2.59 ± 0.54 times faster than 'benchenv-master.exe long'



Benchmark 1: benchenv-master.exe random
  Time (mean ± σ):      2.314 s ±  0.420 s    [User: 2.301 s, System: 0.008 s]
  Range (min … max):    1.667 s …  2.878 s    10 runs

Benchmark 2: benchenv-loop.exe random
  Time (mean ± σ):     764.2 ms ±  37.0 ms    [User: 755.3 ms, System: 8.4 ms]
  Range (min … max):   717.5 ms … 813.3 ms    10 runs

Benchmark 3: benchenv-sliceto-indexof.exe random
  Time (mean ± σ):     739.4 ms ±  28.5 ms    [User: 728.8 ms, System: 16.2 ms]
  Range (min … max):   705.9 ms … 780.1 ms    10 runs

Summary
  'benchenv-sliceto-indexof.exe random' ran
    1.03 ± 0.06 times faster than 'benchenv-loop.exe random'
    3.13 ± 0.58 times faster than 'benchenv-master.exe random'



Benchmark 1: benchenv-master.exe empty
  Time (mean ± σ):      2.362 s ±  0.292 s    [User: 2.355 s, System: 0.006 s]
  Range (min … max):    1.625 s …  2.662 s    10 runs

Benchmark 2: benchenv-loop.exe empty
  Time (mean ± σ):     732.4 ms ±  24.1 ms    [User: 726.9 ms, System: 6.9 ms]
  Range (min … max):   691.5 ms … 768.2 ms    10 runs

Benchmark 3: benchenv-sliceto-indexof.exe empty
  Time (mean ± σ):     725.1 ms ±  29.9 ms    [User: 717.5 ms, System: 8.5 ms]
  Range (min … max):   687.2 ms … 768.2 ms    10 runs

Summary
  'benchenv-sliceto-indexof.exe empty' ran
    1.01 ± 0.05 times faster than 'benchenv-loop.exe empty'
    3.26 ± 0.42 times faster than 'benchenv-master.exe empty'

rootbeer

Very nice tests! I've got a bunch of comments, but they're mostly suggestions for more work, so feel free to ignore them as this seems like an improvement to me as-is.

Oh, one more suggestion: maybe expand the comment on getenvW to say that it does linear searches and if the caller is going to frequently fetch environment variables, it may be worthwhile to use std.process.getEnvMap to be able to query for them more efficiently?

test/standalone/env_vars/build.zig

test/standalone/env_vars/main.zig

lib/std/process.zig

test/standalone/env_vars/main.zig

lib/std/process.zig

squeek502 · 2025-03-17T21:19:33Z

Oh, one more suggestion: maybe expand the comment on getenvW to say that it does linear searches and if the caller is going to frequently fetch environment variables, it may be worthwhile to use std.process.getEnvMap to be able to query for them more efficiently?

My hunch is that you'd have to be querying them absurdly frequently before it starts mattering, and I'm assuming the lack of allocation in getenvW makes up for any inefficiency from the linear scan. That comment may be appropriate for any of the other environment variable getting functions that take an allocator, but it'd probably still be worth benchmarking to validate.

…uired. This code previously added 4 NUL code units, but that was likely due to a misinterpretation of this part of the CreateProcess documentation: "A Unicode environment block is terminated by four zero bytes: two for the last string, two more to terminate the block." (Four zero *bytes* means *two* zero code units.) Additionally, the second zero code unit is only actually needed when the environment is empty, due to a quirk of the CreateProcess implementation. In the case of a non-empty environment, there always ends up being two trailing NUL code units, since one will come after the last environment variable in the block.

rootbeer · 2025-03-18T01:52:26Z

Looks great to me!

Tests all environment variable APIs in std.process

Both sliceTo and indexOfScalarPos use SIMD when available to speed up the search. On my x86_64 machine, this leads to getenvW being around 2-3x faster overall. Additionally, any future improvements to sliceTo/indexOfScalarPos will benefit getenvW.

alexrp

Seems okay from a quick glance. Feel free to merge when you're happy with it.

squeek502 changed the title ~~Windows: Faster getenvW and standalone environment variable test~~ Windows: Faster getenvW and a standalone environment variable test Mar 17, 2025

squeek502 force-pushed the getenvw-optim branch 2 times, most recently from 976ece0 to ebbf214 Compare March 17, 2025 01:23

rootbeer reviewed Mar 17, 2025

View reviewed changes

squeek502 added 2 commits March 17, 2025 17:53

std.process: Allow WTF-8 in env var functions with comptime-known keys

b2cc408

squeek502 force-pushed the getenvw-optim branch from ebbf214 to 90fc9ce Compare March 18, 2025 00:57

squeek502 added 3 commits March 22, 2025 15:44

Add standalone test for environment variables

752e7c0

Tests all environment variable APIs in std.process

windows: Document Environment pointer

78ecf3b

squeek502 force-pushed the getenvw-optim branch from 90fc9ce to 66dcebc Compare March 22, 2025 23:03

This was referenced Mar 22, 2025

std.posix.getenv: early-return comparison #23265

Open

What's the deal with = in environment variable names? #23331

Open

alexrp approved these changes Mar 24, 2025

View reviewed changes

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Windows: Faster `getenvW` and a standalone environment variable test #23272

Windows: Faster `getenvW` and a standalone environment variable test #23272

squeek502 commented Mar 17, 2025 •

edited

Loading

rootbeer left a comment

squeek502 commented Mar 17, 2025

rootbeer commented Mar 18, 2025

alexrp left a comment

Windows: Faster getenvW and a standalone environment variable test #23272

Are you sure you want to change the base?

Windows: Faster getenvW and a standalone environment variable test #23272

Conversation

squeek502 commented Mar 17, 2025 • edited Loading

rootbeer left a comment

Choose a reason for hiding this comment

squeek502 commented Mar 17, 2025

rootbeer commented Mar 18, 2025

alexrp left a comment

Choose a reason for hiding this comment

Windows: Faster `getenvW` and a standalone environment variable test #23272

Windows: Faster `getenvW` and a standalone environment variable test #23272

squeek502 commented Mar 17, 2025 •

edited

Loading