|
1 |
| -import math |
| 1 | +""" |
| 2 | +The MD5 algorithm is a hash function that's commonly used as a checksum to |
| 3 | +detect data corruption. The algorithm works by processing a given message in |
| 4 | +blocks of 512 bits, padding the message as needed. Each block is used to update |
| 5 | +a 128-bit state through 64 operations per block. Note that all values |
| 6 | +are little-endian, so inputs are converted as needed. |
2 | 7 |
|
| 8 | +Although MD5 was used as a cryptographic hash function in the past, practical |
| 9 | +collision attacks against it are now known, so it shouldn't be used for security purposes. |
3 | 10 |
|
4 |
| -def rearrange(bit_string_32): |
5 |
| - """[summary] |
6 |
| - Regroups the given binary string. |
| 11 | +For more info, see https://en.wikipedia.org/wiki/MD5 |
| 12 | +""" |
| 13 | + |
| 14 | +from collections.abc import Generator |
| 15 | +from math import sin |
| 16 | + |
| 17 | + |
| 18 | +def to_little_endian(string_32: bytes) -> bytes: |
| 19 | + """ |
| 20 | + Converts the given string to little-endian in groups of 8 chars. |
7 | 21 |
|
8 | 22 | Arguments:
|
9 |
| - bitString32 {[string]} -- [32 bit binary] |
| 23 | + string_32 {[string]} -- [32-char string] |
10 | 24 |
|
11 | 25 | Raises:
|
12 |
| - ValueError -- [if the given string not are 32 bit binary string] |
| 26 | + ValueError -- [input is not 32 char] |
13 | 27 |
|
14 | 28 | Returns:
|
15 |
| - [string] -- [32 bit binary string] |
16 |
| - >>> rearrange('1234567890abcdfghijklmnopqrstuvw') |
17 |
| - 'pqrstuvwhijklmno90abcdfg12345678' |
| 29 | + 32-char little-endian string |
| 30 | + >>> to_little_endian(b'1234567890abcdfghijklmnopqrstuvw') |
| 31 | + b'pqrstuvwhijklmno90abcdfg12345678' |
| 32 | + >>> to_little_endian(b'1234567890') |
| 33 | + Traceback (most recent call last): |
| 34 | + ... |
| 35 | + ValueError: Input must be of length 32 |
18 | 36 | """
|
| 37 | + if len(string_32) != 32: |
| 38 | + raise ValueError("Input must be of length 32") |
19 | 39 |
|
20 |
| - if len(bit_string_32) != 32: |
21 |
| - raise ValueError("Need length 32") |
22 |
| - new_string = "" |
| 40 | + little_endian = b"" |
23 | 41 | for i in [3, 2, 1, 0]:
|
24 |
| - new_string += bit_string_32[8 * i : 8 * i + 8] |
25 |
| - return new_string |
| 42 | + little_endian += string_32[8 * i : 8 * i + 8] |
| 43 | + return little_endian |
| 44 | + |
| 45 | + |
| 46 | +def reformat_hex(i: int) -> bytes: |
| 47 | + """ |
| 48 | + Converts the given non-negative integer to hex string. |
26 | 49 |
|
| 50 | + Example: Suppose the input is the following: |
| 51 | + i = 1234 |
27 | 52 |
|
28 |
| -def reformat_hex(i): |
29 |
| - """[summary] |
30 |
| - Converts the given integer into 8-digit hex number. |
| 53 | + The input is 0x000004d2 in hex, so the little-endian hex string is |
| 54 | + "d2040000". |
31 | 55 |
|
32 | 56 | Arguments:
|
33 |
| - i {[int]} -- [integer] |
| 57 | + i {[int]} -- [integer] |
| 58 | +
|
| 59 | + Raises: |
| 60 | + ValueError -- [input is negative] |
| 61 | +
|
| 62 | + Returns: |
| 63 | + 8-char little-endian hex string |
| 64 | +
|
| 65 | + >>> reformat_hex(1234) |
| 66 | + b'd2040000' |
34 | 67 | >>> reformat_hex(666)
|
35 |
| - '9a020000' |
| 68 | + b'9a020000' |
| 69 | + >>> reformat_hex(0) |
| 70 | + b'00000000' |
| 71 | + >>> reformat_hex(1234567890) |
| 72 | + b'd2029649' |
| 73 | + >>> reformat_hex(1234567890987654321) |
| 74 | + b'b11c6cb1' |
| 75 | + >>> reformat_hex(-1) |
| 76 | + Traceback (most recent call last): |
| 77 | + ... |
| 78 | + ValueError: Input must be non-negative |
36 | 79 | """
|
| 80 | + if i < 0: |
| 81 | + raise ValueError("Input must be non-negative") |
37 | 82 |
|
38 |
| - hexrep = format(i, "08x") |
39 |
| - thing = "" |
| 83 | + hex_rep = format(i, "08x")[-8:] |
| 84 | + little_endian_hex = b"" |
40 | 85 | for i in [3, 2, 1, 0]:
|
41 |
| - thing += hexrep[2 * i : 2 * i + 2] |
42 |
| - return thing |
| 86 | + little_endian_hex += hex_rep[2 * i : 2 * i + 2].encode("utf-8") |
| 87 | + return little_endian_hex |
43 | 88 |
|
44 | 89 |
|
45 |
| -def pad(bit_string): |
46 |
| - """[summary] |
47 |
| - Fills up the binary string to a 512 bit binary string |
| 90 | +def preprocess(message: bytes) -> bytes: |
| 91 | + """ |
| 92 | + Preprocesses the message string: |
| 93 | + - Convert message to bit string |
| 94 | + - Pad bit string to a multiple of 512 chars: |
| 95 | + - Append a 1 |
| 96 | + - Append 0's until length = 448 (mod 512) |
| 97 | + - Append length of original message in bits (64 chars, little-endian) |
| 98 | +
|
| 99 | + Example: Suppose the input is the following: |
| 100 | + message = "a" |
| 101 | +
|
| 102 | + The message bit string is "01100001", which is 8 bits long. Thus, the |
| 103 | + bit string needs 439 bits of padding so that |
| 104 | + (bit_string + "1" + padding) = 448 (mod 512). |
| 105 | + The message length is "000010000...0" in 64-bit little-endian binary. |
| 106 | + The combined bit string is then 512 bits long. |
48 | 107 |
|
49 | 108 | Arguments:
|
50 |
| - bitString {[string]} -- [binary string] |
| 109 | + message {[string]} -- [message string] |
51 | 110 |
|
52 | 111 | Returns:
|
53 |
| - [string] -- [binary string] |
| 112 | + processed bit string padded to a multiple of 512 chars |
| 113 | +
|
| 114 | + >>> preprocess(b"a") == (b"01100001" + b"1" + |
| 115 | + ... (b"0" * 439) + b"00001000" + (b"0" * 56)) |
| 116 | + True |
| 117 | + >>> preprocess(b"") == b"1" + (b"0" * 447) + (b"0" * 64) |
| 118 | + True |
54 | 119 | """
|
55 |
| - start_length = len(bit_string) |
56 |
| - bit_string += "1" |
| 120 | + bit_string = b"" |
| 121 | + for char in message: |
| 122 | + bit_string += format(char, "08b").encode("utf-8") |
| 123 | + start_len = format(len(bit_string), "064b").encode("utf-8") |
| 124 | + |
| 125 | + # Pad bit_string to a multiple of 512 chars |
| 126 | + bit_string += b"1" |
57 | 127 | while len(bit_string) % 512 != 448:
|
58 |
| - bit_string += "0" |
59 |
| - last_part = format(start_length, "064b") |
60 |
| - bit_string += rearrange(last_part[32:]) + rearrange(last_part[:32]) |
| 128 | + bit_string += b"0" |
| 129 | + bit_string += to_little_endian(start_len[32:]) + to_little_endian(start_len[:32]) |
| 130 | + |
61 | 131 | return bit_string
|
62 | 132 |
|
63 | 133 |
|
64 |
| -def get_block(bit_string): |
65 |
| - """[summary] |
66 |
| - Iterator: |
67 |
| - Returns by each call a list of length 16 with the 32 bit |
68 |
| - integer blocks. |
| 134 | +def get_block_words(bit_string: bytes) -> Generator[list[int], None, None]: |
| 135 | + """ |
| 136 | + Splits bit string into blocks of 512 chars and yields each block as a list |
| 137 | + of 32-bit words |
| 138 | +
|
| 139 | + Example: Suppose the input is the following: |
| 140 | + bit_string = |
| 141 | + "000000000...0" + # 0x00 (32 bits, padded to the right) |
| 142 | + "000000010...0" + # 0x01 (32 bits, padded to the right) |
| 143 | + "000000100...0" + # 0x02 (32 bits, padded to the right) |
| 144 | + "000000110...0" + # 0x03 (32 bits, padded to the right) |
| 145 | + ... |
| 146 | + "000011110...0" # 0x0f (32 bits, padded to the right) |
| 147 | +
|
| 148 | + Then len(bit_string) == 512, so there'll be 1 block. The block is split |
| 149 | + into 32-bit words, and each word is converted to little endian. The |
| 150 | + first word is interpreted as 0 in decimal, the second word is |
| 151 | + interpreted as 1 in decimal, etc. |
| 152 | +
|
| 153 | + Thus, block_words == [[0, 1, 2, 3, ..., 15]]. |
69 | 154 |
|
70 | 155 | Arguments:
|
71 |
| - bit_string {[string]} -- [binary string >= 512] |
| 156 | + bit_string {[string]} -- [bit string with multiple of 512 as length] |
| 157 | +
|
| 158 | + Raises: |
| 159 | + ValueError -- [length of bit string isn't multiple of 512] |
| 160 | +
|
| 161 | + Yields: |
| 162 | + a list of 16 32-bit words |
| 163 | +
|
| 164 | + >>> test_string = ("".join(format(n << 24, "032b") for n in range(16)) |
| 165 | + ... .encode("utf-8")) |
| 166 | + >>> list(get_block_words(test_string)) |
| 167 | + [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]] |
| 168 | + >>> list(get_block_words(test_string * 4)) == [list(range(16))] * 4 |
| 169 | + True |
| 170 | + >>> list(get_block_words(b"1" * 512)) == [[4294967295] * 16] |
| 171 | + True |
| 172 | + >>> list(get_block_words(b"")) |
| 173 | + [] |
| 174 | + >>> list(get_block_words(b"1111")) |
| 175 | + Traceback (most recent call last): |
| 176 | + ... |
| 177 | + ValueError: Input must have length that's a multiple of 512 |
72 | 178 | """
|
| 179 | + if len(bit_string) % 512 != 0: |
| 180 | + raise ValueError("Input must have length that's a multiple of 512") |
73 | 181 |
|
74 |
| - curr_pos = 0 |
75 |
| - while curr_pos < len(bit_string): |
76 |
| - curr_part = bit_string[curr_pos : curr_pos + 512] |
77 |
| - my_splits = [] |
78 |
| - for i in range(16): |
79 |
| - my_splits.append(int(rearrange(curr_part[32 * i : 32 * i + 32]), 2)) |
80 |
| - yield my_splits |
81 |
| - curr_pos += 512 |
| 182 | + for pos in range(0, len(bit_string), 512): |
| 183 | + block = bit_string[pos : pos + 512] |
| 184 | + block_words = [] |
| 185 | + for i in range(0, 512, 32): |
| 186 | + block_words.append(int(to_little_endian(block[i : i + 32]), 2)) |
| 187 | + yield block_words |
82 | 188 |
|
83 | 189 |
|
84 |
| -def not32(i): |
| 190 | +def not_32(i: int) -> int: |
85 | 191 | """
|
86 |
| - >>> not32(34) |
| 192 | + Perform 32-bit bitwise NOT on the given non-negative int. |
| 193 | +
|
| 194 | + Arguments: |
| 195 | + i {[int]} -- [given int] |
| 196 | +
|
| 197 | + Raises: |
| 198 | + ValueError -- [input is negative] |
| 199 | +
|
| 200 | + Returns: |
| 201 | + Result of bitwise NOT on i |
| 202 | +
|
| 203 | + >>> not_32(34) |
87 | 204 | 4294967261
|
| 205 | + >>> not_32(1234) |
| 206 | + 4294966061 |
| 207 | + >>> not_32(4294966061) |
| 208 | + 1234 |
| 209 | + >>> not_32(0) |
| 210 | + 4294967295 |
| 211 | + >>> not_32(1) |
| 212 | + 4294967294 |
| 213 | + >>> not_32(-1) |
| 214 | + Traceback (most recent call last): |
| 215 | + ... |
| 216 | + ValueError: Input must be non-negative |
88 | 217 | """
|
| 218 | + if i < 0: |
| 219 | + raise ValueError("Input must be non-negative") |
| 220 | + |
89 | 221 | i_str = format(i, "032b")
|
90 | 222 | new_str = ""
|
91 | 223 | for c in i_str:
|
92 | 224 | new_str += "1" if c == "0" else "0"
|
93 | 225 | return int(new_str, 2)
|
94 | 226 |
|
95 | 227 |
|
96 |
| -def sum32(a, b): |
| 228 | +def sum_32(a: int, b: int) -> int: |
| 229 | + """ |
| 230 | + Add two numbers as 32-bit ints. |
| 231 | +
|
| 232 | + Arguments: |
| 233 | + a {[int]} -- [first given int] |
| 234 | + b {[int]} -- [second given int] |
| 235 | +
|
| 236 | + Returns: |
| 237 | + (a + b) as an unsigned 32-bit int |
| 238 | +
|
| 239 | + >>> sum_32(1, 1) |
| 240 | + 2 |
| 241 | + >>> sum_32(2, 3) |
| 242 | + 5 |
| 243 | + >>> sum_32(0, 0) |
| 244 | + 0 |
| 245 | + >>> sum_32(-1, -1) |
| 246 | + 4294967294 |
| 247 | + >>> sum_32(4294967295, 1) |
| 248 | + 0 |
| 249 | + """ |
97 | 250 | return (a + b) % 2**32
|
98 | 251 |
|
99 | 252 |
|
100 |
| -def leftrot32(i, s): |
101 |
| - return (i << s) ^ (i >> (32 - s)) |
| 253 | +def left_rotate_32(i: int, shift: int) -> int: |
| 254 | + """ |
| 255 | + Rotate the bits of a given int left by a given amount. |
| 256 | +
|
| 257 | + Arguments: |
| 258 | + i {[int]} -- [given int] |
| 259 | + shift {[int]} -- [shift amount] |
| 260 | +
|
| 261 | + Raises: |
| 262 | + ValueError -- [either given int or shift is negative] |
102 | 263 |
|
| 264 | + Returns: |
| 265 | + `i` rotated to the left by `shift` bits |
| 266 | +
|
| 267 | + >>> left_rotate_32(1234, 1) |
| 268 | + 2468 |
| 269 | + >>> left_rotate_32(1111, 4) |
| 270 | + 17776 |
| 271 | + >>> left_rotate_32(2147483648, 1) |
| 272 | + 1 |
| 273 | + >>> left_rotate_32(2147483648, 3) |
| 274 | + 4 |
| 275 | + >>> left_rotate_32(4294967295, 4) |
| 276 | + 4294967295 |
| 277 | + >>> left_rotate_32(1234, 0) |
| 278 | + 1234 |
| 279 | + >>> left_rotate_32(0, 0) |
| 280 | + 0 |
| 281 | + >>> left_rotate_32(-1, 0) |
| 282 | + Traceback (most recent call last): |
| 283 | + ... |
| 284 | + ValueError: Input must be non-negative |
| 285 | + >>> left_rotate_32(0, -1) |
| 286 | + Traceback (most recent call last): |
| 287 | + ... |
| 288 | + ValueError: Shift must be non-negative |
| 289 | + """ |
| 290 | + if i < 0: |
| 291 | + raise ValueError("Input must be non-negative") |
| 292 | + if shift < 0: |
| 293 | + raise ValueError("Shift must be non-negative") |
| 294 | + return ((i << shift) ^ (i >> (32 - shift))) % 2**32 |
| 295 | + |
| 296 | + |
| 297 | +def md5_me(message: bytes) -> bytes: |
| 298 | + """ |
| 299 | + Returns the 32-char hexadecimal MD5 digest of a given message. |
103 | 300 |
|
104 |
| -def md5me(test_string): |
105 |
| - """[summary] |
106 |
| - Returns a 32-bit hash code of the string 'testString' |
| 301 | + Reference: https://en.wikipedia.org/wiki/MD5#Algorithm |
107 | 302 |
|
108 | 303 | Arguments:
|
109 |
| - testString {[string]} -- [message] |
| 304 | + message {[string]} -- [message] |
| 305 | +
|
| 306 | + Returns: |
| 307 | + 32-char MD5 hash string |
| 308 | +
|
| 309 | + >>> md5_me(b"") |
| 310 | + b'd41d8cd98f00b204e9800998ecf8427e' |
| 311 | + >>> md5_me(b"The quick brown fox jumps over the lazy dog") |
| 312 | + b'9e107d9d372bb6826bd81d3542a419d6' |
| 313 | + >>> md5_me(b"The quick brown fox jumps over the lazy dog.") |
| 314 | + b'e4d909c290d0fb1ca068ffaddf22cbd0' |
| 315 | +
|
| 316 | + >>> import hashlib |
| 317 | + >>> from string import ascii_letters |
| 318 | + >>> msgs = [b"", ascii_letters.encode("utf-8"), "Üñîçø∂é".encode("utf-8"), |
| 319 | + ... b"The quick brown fox jumps over the lazy dog."] |
| 320 | + >>> all(md5_me(msg) == hashlib.md5(msg).hexdigest().encode("utf-8") for msg in msgs) |
| 321 | + True |
110 | 322 | """
|
111 | 323 |
|
112 |
| - bs = "" |
113 |
| - for i in test_string: |
114 |
| - bs += format(ord(i), "08b") |
115 |
| - bs = pad(bs) |
| 324 | + # Convert to bit string, add padding and append message length |
| 325 | + bit_string = preprocess(message) |
116 | 326 |
|
117 |
| - tvals = [int(2**32 * abs(math.sin(i + 1))) for i in range(64)] |
| 327 | + added_consts = [int(2**32 * abs(sin(i + 1))) for i in range(64)] |
118 | 328 |
|
| 329 | + # Starting states |
119 | 330 | a0 = 0x67452301
|
120 | 331 | b0 = 0xEFCDAB89
|
121 | 332 | c0 = 0x98BADCFE
|
122 | 333 | d0 = 0x10325476
|
123 | 334 |
|
124 |
| - s = [ |
| 335 | + shift_amounts = [ |
125 | 336 | 7,
|
126 | 337 | 12,
|
127 | 338 | 17,
|
@@ -188,51 +399,46 @@ def md5me(test_string):
|
188 | 399 | 21,
|
189 | 400 | ]
|
190 | 401 |
|
191 |
| - for m in get_block(bs): |
| 402 | + # Process bit string in chunks, each with 16 32-char words |
| 403 | + for block_words in get_block_words(bit_string): |
192 | 404 | a = a0
|
193 | 405 | b = b0
|
194 | 406 | c = c0
|
195 | 407 | d = d0
|
| 408 | + |
| 409 | + # Hash current chunk |
196 | 410 | for i in range(64):
|
197 | 411 | if i <= 15:
|
198 |
| - # f = (B & C) | (not32(B) & D) |
| 412 | + # f = (b & c) | (not_32(b) & d) # Alternate definition for f |
199 | 413 | f = d ^ (b & (c ^ d))
|
200 | 414 | g = i
|
201 | 415 | elif i <= 31:
|
202 |
| - # f = (D & B) | (not32(D) & C) |
| 416 | + # f = (d & b) | (not_32(d) & c) # Alternate definition for f |
203 | 417 | f = c ^ (d & (b ^ c))
|
204 | 418 | g = (5 * i + 1) % 16
|
205 | 419 | elif i <= 47:
|
206 | 420 | f = b ^ c ^ d
|
207 | 421 | g = (3 * i + 5) % 16
|
208 | 422 | else:
|
209 |
| - f = c ^ (b | not32(d)) |
| 423 | + f = c ^ (b | not_32(d)) |
210 | 424 | g = (7 * i) % 16
|
211 |
| - dtemp = d |
| 425 | + f = (f + a + added_consts[i] + block_words[g]) % 2**32 |
| 426 | + a = d |
212 | 427 | d = c
|
213 | 428 | c = b
|
214 |
| - b = sum32(b, leftrot32((a + f + tvals[i] + m[g]) % 2**32, s[i])) |
215 |
| - a = dtemp |
216 |
| - a0 = sum32(a0, a) |
217 |
| - b0 = sum32(b0, b) |
218 |
| - c0 = sum32(c0, c) |
219 |
| - d0 = sum32(d0, d) |
| 429 | + b = sum_32(b, left_rotate_32(f, shift_amounts[i])) |
| 430 | + |
| 431 | + # Add hashed chunk to running total |
| 432 | + a0 = sum_32(a0, a) |
| 433 | + b0 = sum_32(b0, b) |
| 434 | + c0 = sum_32(c0, c) |
| 435 | + d0 = sum_32(d0, d) |
220 | 436 |
|
221 | 437 | digest = reformat_hex(a0) + reformat_hex(b0) + reformat_hex(c0) + reformat_hex(d0)
|
222 | 438 | return digest
|
223 | 439 |
|
224 | 440 |
|
225 |
| -def test(): |
226 |
| - assert md5me("") == "d41d8cd98f00b204e9800998ecf8427e" |
227 |
| - assert ( |
228 |
| - md5me("The quick brown fox jumps over the lazy dog") |
229 |
| - == "9e107d9d372bb6826bd81d3542a419d6" |
230 |
| - ) |
231 |
| - print("Success.") |
232 |
| - |
233 |
| - |
234 | 441 | if __name__ == "__main__":
|
235 |
| - test() |
236 | 442 | import doctest
|
237 | 443 |
|
238 | 444 | doctest.testmod()
|
0 commit comments