|
| 1 | +#include "rbs/defines.h" |
1 | 2 | #include "rbs/lexer.h" |
| 3 | +#include "rbs/util/rbs_assert.h" |
2 | 4 |
|
3 | 5 | static const char *RBS_TOKENTYPE_NAMES[] = { |
4 | 6 | "NullType", |
@@ -105,17 +107,60 @@ int rbs_token_bytes(rbs_token_t tok) { |
105 | 107 | } |
106 | 108 |
|
107 | 109 | unsigned int rbs_peek(rbs_lexer_t *lexer) { |
108 | | - if (lexer->current.char_pos == lexer->end_pos) { |
109 | | - lexer->last_char = '\0'; |
110 | | - return 0; |
| 110 | + return lexer->current_code_point; |
| 111 | +} |
| 112 | + |
| 113 | +bool rbs_next_char(rbs_lexer_t *lexer, unsigned int *codepoint, size_t *byte_len) { |
| 114 | + if (RBS_UNLIKELY(lexer->current.char_pos == lexer->end_pos)) { |
| 115 | + return false; |
| 116 | + } |
| 117 | + |
| 118 | + const char *start = lexer->string.start + lexer->current.byte_pos; |
| 119 | + |
| 120 | + // Fast path for ASCII (single-byte) characters |
| 121 | + if ((unsigned int) *start < 128) { |
| 122 | + *codepoint = (unsigned int) *start; |
| 123 | + *byte_len = 1; |
| 124 | + return true; |
| 125 | + } |
| 126 | + |
| 127 | + *byte_len = lexer->encoding->char_width((const uint8_t *) start, (ptrdiff_t) (lexer->string.end - start)); |
| 128 | + |
| 129 | + if (*byte_len == 1) { |
| 130 | + *codepoint = (unsigned int) *start; |
111 | 131 | } else { |
112 | | - rbs_string_t str = rbs_string_new( |
113 | | - lexer->string.start + lexer->current.byte_pos, |
114 | | - lexer->string.end |
115 | | - ); |
116 | | - unsigned int c = rbs_utf8_string_to_codepoint(str); |
117 | | - lexer->last_char = c; |
118 | | - return c; |
| 132 | + *codepoint = 12523; // Dummy data for "ル" from "ルビー" (Ruby) in Unicode |
| 133 | + } |
| 134 | + |
| 135 | + return true; |
| 136 | +} |
| 137 | + |
| 138 | +void rbs_skip(rbs_lexer_t *lexer) { |
| 139 | + rbs_assert(lexer->current_character_bytes > 0, "rbs_skip called with current_character_bytes == 0"); |
| 140 | + |
| 141 | + if (RBS_UNLIKELY(lexer->current_code_point == '\0')) { |
| 142 | + return; |
| 143 | + } |
| 144 | + |
| 145 | + unsigned int codepoint; |
| 146 | + size_t byte_len; |
| 147 | + |
| 148 | + lexer->current.byte_pos += lexer->current_character_bytes; |
| 149 | + lexer->current.char_pos += 1; |
| 150 | + if (lexer->current_code_point == '\n') { |
| 151 | + lexer->current.line += 1; |
| 152 | + lexer->current.column = 0; |
| 153 | + lexer->first_token_of_line = true; |
| 154 | + } else { |
| 155 | + lexer->current.column += 1; |
| 156 | + } |
| 157 | + |
| 158 | + if (rbs_next_char(lexer, &codepoint, &byte_len)) { |
| 159 | + lexer->current_code_point = codepoint; |
| 160 | + lexer->current_character_bytes = byte_len; |
| 161 | + } else { |
| 162 | + lexer->current_character_bytes = 1; |
| 163 | + lexer->current_code_point = '\0'; |
119 | 164 | } |
120 | 165 | } |
121 | 166 |
|
@@ -149,35 +194,8 @@ rbs_token_t rbs_next_eof_token(rbs_lexer_t *lexer) { |
149 | 194 | } |
150 | 195 | } |
151 | 196 |
|
152 | | -void rbs_skip(rbs_lexer_t *lexer) { |
153 | | - if (!lexer->last_char) { |
154 | | - rbs_peek(lexer); |
155 | | - } |
156 | | - |
157 | | - size_t byte_len; |
158 | | - |
159 | | - if (lexer->last_char == '\0') { |
160 | | - byte_len = 1; |
161 | | - } else { |
162 | | - const char *start = lexer->string.start + lexer->current.byte_pos; |
163 | | - byte_len = lexer->encoding->char_width((const uint8_t *) start, (ptrdiff_t) (lexer->string.end - start)); |
164 | | - } |
165 | | - |
166 | | - lexer->current.char_pos += 1; |
167 | | - lexer->current.byte_pos += byte_len; |
168 | | - |
169 | | - if (lexer->last_char == '\n') { |
170 | | - lexer->current.line += 1; |
171 | | - lexer->current.column = 0; |
172 | | - lexer->first_token_of_line = true; |
173 | | - } else { |
174 | | - lexer->current.column += 1; |
175 | | - } |
176 | | -} |
177 | | - |
178 | 197 | void rbs_skipn(rbs_lexer_t *lexer, size_t size) { |
179 | 198 | for (size_t i = 0; i < size; i++) { |
180 | | - rbs_peek(lexer); |
181 | 199 | rbs_skip(lexer); |
182 | 200 | } |
183 | 201 | } |
|
0 commit comments