Skip to content

Commit d8e7d0a

Browse files
committed
Parse abbreviated month name to ordinal at compile time
1 parent 9d3f9c5 commit d8e7d0a

File tree

4 files changed

+287
-0
lines changed

4 files changed

+287
-0
lines changed

.vscode/settings.json

+17
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,15 @@
11
{
2+
"terminal.integrated.env.osx": {
3+
"PYTHONPATH": "${workspaceFolder}"
4+
},
5+
"terminal.integrated.env.windows": {
6+
"PYTHONPATH": "${workspaceFolder}"
7+
},
8+
"editor.codeActionsOnSave": {
9+
"source.organizeImports": "explicit"
10+
},
211
"editor.formatOnSave": true,
12+
"files.insertFinalNewline": true,
313
"C_Cpp.default.configurationProvider": "ms-vscode.cmake-tools",
414
"C_Cpp.formatting": "vcFormat",
515
"C_Cpp.vcFormat.space.pointerReferenceAlignment": "left",
@@ -10,5 +20,12 @@
1020
"tzoffset"
1121
],
1222
"cmake.configureOnOpen": true,
23+
"isort.args": [
24+
"--profile",
25+
"black"
26+
],
27+
"[python]": {
28+
"editor.defaultFormatter": "ms-python.black-formatter"
29+
},
1330
"C_Cpp.vcFormat.newLine.beforeElse": false
1431
}

include/simdparse/datetime.hpp

+111
Original file line numberDiff line numberDiff line change
@@ -968,4 +968,115 @@ namespace simdparse
968968
/** Microseconds before/after epoch. */
969969
int64_t _value = UNSET;
970970
};
971+
972+
namespace detail
{
    /**
     * Tells whether a character lies in the ASCII range that contains the letters.
     *
     * ASCII uppercase letters are encoded as 0b010xxxxx and lowercase letters as 0b011xxxxx,
     * so only the two most significant bits are inspected. Non-letter characters sharing
     * that bit pattern (e.g. `@` or `~`) pass the test as well, hence "maybe".
     */
    inline constexpr bool maybe_letter(char c)
    {
        constexpr int high_bits = 0b11000000;
        constexpr int letter_prefix = 0b01000000;
        return (c & high_bits) == letter_prefix;
    }

    /**
     * Packs the three characters of an abbreviated English month name into a 15-bit number.
     *
     * Each character contributes its lowest 5 bits only, which makes the result
     * independent of letter case. The first character ends up in the most significant bits.
     */
    inline constexpr std::uint16_t month_to_integer(char c1, char c2, char c3)
    {
        constexpr int low_five_bits = 0b00011111;
        const int high = (c1 & low_five_bits) << 10;
        const int mid = (c2 & low_five_bits) << 5;
        const int low = c3 & low_five_bits;
        return static_cast<std::uint16_t>(high | mid | low);
    }

    /**
     * Maps a hash slot to the zero-based month offset stored in that slot.
     *
     * 0 stands for January, 1 for February, ..., 11 for December. Slots that no month
     * hashes to hold 15, which marks invalid data.
     */
    constexpr static inline std::uint8_t month_offsets[] = {
        7,   // slot 0: Aug
        6,   // slot 1: Jul
        4,   // slot 2: May
        8,   // slot 3: Sep
        9,   // slot 4: Oct
        11,  // slot 5: Dec
        2,   // slot 6: Mar
        3,   // slot 7: Apr
        0,   // slot 8: Jan
        5,   // slot 9: Jun
        10,  // slot 10: Nov
        1,   // slot 11: Feb
        15, 15, 15, 15  // slots 12-15: no month
    };

    /**
     * Packed numeric value of each abbreviated month name, indexed by month offset.
     *
     * The four trailing zeros pad the table to 16 entries, so the invalid-data offset 15
     * is still a valid index.
     */
    constexpr static inline std::uint16_t month_values[] = {
        month_to_integer('J', 'a', 'n'), month_to_integer('F', 'e', 'b'), month_to_integer('M', 'a', 'r'),
        month_to_integer('A', 'p', 'r'), month_to_integer('M', 'a', 'y'), month_to_integer('J', 'u', 'n'),
        month_to_integer('J', 'u', 'l'), month_to_integer('A', 'u', 'g'), month_to_integer('S', 'e', 'p'),
        month_to_integer('O', 'c', 't'), month_to_integer('N', 'o', 'v'), month_to_integer('D', 'e', 'c'),
        0, 0, 0, 0
    };
}
1027+
1028+
/**
1029+
* Converts an abbreviated English month name into an ordinal.
1030+
*
1031+
* @param c1 1st character of the abbreviated English month name, e.g. `J` or `O`.
1032+
* @param c2 2nd character of the abbreviated English month name, e.g. `a` or `c`.
1033+
* @param c3 3rd character of the abbreviated English month name, e.g. `n` or `t`.
1034+
* @returns `1` to `12` for `Jan` to `Dec`, respectively, or `0` on parse error.
1035+
*/
1036+
constexpr inline unsigned int month_to_ordinal(char c1, char c2, char c3)
1037+
{
1038+
using detail::maybe_letter, detail::month_values;
1039+
1040+
// constants for perfect hashing of months into a contiguous array of 12
1041+
constexpr unsigned int k = 68;
1042+
constexpr unsigned int p = 929;
1043+
1044+
// calculate the perfect hash value to get the proper index in the array
1045+
std::uint16_t value = detail::month_to_integer(c1, c2, c3);
1046+
unsigned int index = ((k * value) % p) & 0b00001111;
1047+
1048+
// check for false positives in the lookup table
1049+
unsigned int offset = detail::month_offsets[index];
1050+
if (maybe_letter(c1) && maybe_letter(c2) && maybe_letter(c3) && value == month_values[offset]) {
1051+
return offset + 1;
1052+
} else {
1053+
return 0;
1054+
}
1055+
}
1056+
1057+
/**
1058+
* Converts an abbreviated English month name into an ordinal.
1059+
*
1060+
* @param abbr Abbreviated English month name, e.g. `Jan` or `Oct`.
1061+
* @returns `1` to `12` for `Jan` to `Dec`, respectively, or `0` on parse error.
1062+
*/
1063+
inline unsigned int month_to_ordinal(const std::array<char, 3>& abbr)
1064+
{
1065+
return month_to_ordinal(abbr[0], abbr[1], abbr[2]);
1066+
}
1067+
1068+
/**
1069+
* Converts an abbreviated English month name into an ordinal.
1070+
*
1071+
* @param abbr Abbreviated English month name, e.g. `Jan` or `Oct`.
1072+
* @returns `1` to `12` for `Jan` to `Dec`, respectively, or `0` on parse error.
1073+
*/
1074+
inline unsigned int month_to_ordinal(const std::string_view& abbr)
1075+
{
1076+
if (abbr.size() != 3) {
1077+
return 0;
1078+
} else {
1079+
return month_to_ordinal(abbr[0], abbr[1], abbr[2]);
1080+
}
1081+
}
9711082
}

test/simdparse.cpp

+31
Original file line numberDiff line numberDiff line change
@@ -353,6 +353,37 @@ int main(int /*argc*/, char* /*argv*/[])
353353
check_parse("0xFEDCBA9876543210", hexadecimal_integer(0xfedcba9876543210ull));
354354
check_fail<hexadecimal_integer>("fedcba9876543210a");
355355

356+
using simdparse::month_to_ordinal;
357+
static_assert(month_to_ordinal('J', 'a', 'n') == 1);
358+
static_assert(month_to_ordinal('F', 'e', 'b') == 2);
359+
static_assert(month_to_ordinal('M', 'a', 'r') == 3);
360+
static_assert(month_to_ordinal('A', 'p', 'r') == 4);
361+
static_assert(month_to_ordinal('M', 'a', 'y') == 5);
362+
static_assert(month_to_ordinal('J', 'u', 'n') == 6);
363+
static_assert(month_to_ordinal('J', 'u', 'l') == 7);
364+
static_assert(month_to_ordinal('A', 'u', 'g') == 8);
365+
static_assert(month_to_ordinal('S', 'e', 'p') == 9);
366+
static_assert(month_to_ordinal('O', 'c', 't') == 10);
367+
static_assert(month_to_ordinal('N', 'o', 'v') == 11);
368+
static_assert(month_to_ordinal('D', 'e', 'c') == 12);
369+
370+
static_assert(month_to_ordinal('j', 'a', 'n') == 1);
371+
static_assert(month_to_ordinal('f', 'e', 'b') == 2);
372+
static_assert(month_to_ordinal('m', 'a', 'r') == 3);
373+
static_assert(month_to_ordinal('a', 'p', 'r') == 4);
374+
static_assert(month_to_ordinal('m', 'a', 'y') == 5);
375+
static_assert(month_to_ordinal('j', 'u', 'n') == 6);
376+
static_assert(month_to_ordinal('j', 'u', 'l') == 7);
377+
static_assert(month_to_ordinal('a', 'u', 'g') == 8);
378+
static_assert(month_to_ordinal('s', 'e', 'p') == 9);
379+
static_assert(month_to_ordinal('o', 'c', 't') == 10);
380+
static_assert(month_to_ordinal('n', 'o', 'v') == 11);
381+
static_assert(month_to_ordinal('d', 'e', 'c') == 12);
382+
383+
static_assert(month_to_ordinal(' ', ' ', ' ') == 0);
384+
static_assert(month_to_ordinal('a', 'b', 'c') == 0);
385+
static_assert(month_to_ordinal('x', 'y', 'z') == 0);
386+
356387
// test code examples
357388
if (!example1() || !example2()) {
358389
return 1;

util/month_hash.py

+128
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
"""
2+
Pre-calculates a mapping for perfect hashing of abbreviated English month names.
3+
"""
4+
5+
# abbreviated English month names in calendar order
months = "Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec".split()
19+
20+
21+
def month_to_integer(month: str) -> int:
    """
    Packs the first three characters of an abbreviated English month name into an integer.

    Only the lowest 5 bits of each character code are kept, so uppercase and lowercase
    letters yield the same value.
    """

    low_bits = [ord(ch) & 0b00011111 for ch in month]
    first, second, third = low_bits[0], low_bits[1], low_bits[2]

    # three 5-bit fields, first character in the most significant position
    return (first << 10) | (second << 5) | third
29+
30+
31+
# packed numeric value of each abbreviated month name, in calendar order
values = list(map(month_to_integer, months))
33+
34+
def _primes_below(limit: int) -> list[int]:
    "Returns all primes smaller than the limit in ascending order (sieve of Eratosthenes)."

    if limit < 2:
        return []

    sieve = [True] * limit
    sieve[0] = sieve[1] = False
    for candidate in range(2, int(limit**0.5) + 1):
        if sieve[candidate]:
            # smaller multiples have already been crossed out by smaller primes
            for multiple in range(candidate * candidate, limit, candidate):
                sieve[multiple] = False
    return [number for number, is_prime in enumerate(sieve) if is_prime]


# candidate moduli for the hash function: all primes below 10,000 (2, 3, 5, ..., 9973)
primes = _primes_below(10000)
101+
102+
103+
def integer_hash(k: int, p: int, value: int) -> int:
    "Maps a value to a 4-bit slot: multiply by `k`, reduce modulo `p`, keep the lowest 4 bits."

    remainder = (k * value) % p
    return remainder & 0b00001111
105+
106+
107+
# search for (p, k) pairs that hash the 12 month values onto the slots 0..11 without collision
triplets: list[tuple[int, int, list[int]]] = []
all_slots = set(range(len(months)))
for p in primes:
    for k in range(1, 1000):
        indices = [integer_hash(k, p, value) for value in values]

        # perfect (no two months share a slot) and minimal (only slots 0..11 are used)
        if set(indices) != all_slots:
            continue
        triplets.append((p, k, indices))

for p, k, indices in triplets:
    print(f"p = {p}, k = {k}, indices = {indices}")

    # list the month offsets in the order of their hash slot; indices[offset] is the slot
    # that months[offset] hashes to for this (p, k) pair
    offsets = sorted(range(len(months)), key=indices.__getitem__)

    # pad to 16 entries with the marker for invalid data
    offsets += [15, 15, 15, 15]
    print("constexpr static inline std::uint8_t month_offsets[] = {")
    print(" " + ", ".join(map(str, offsets)))
    print("};")
    print()

0 commit comments

Comments
 (0)