Skip to content

Commit cf5b6a2

Browse files
committed
Rewrite parser add lexer step
Closes #69
1 parent 353671b commit cf5b6a2

File tree

9 files changed

+635
-253
lines changed

9 files changed

+635
-253
lines changed

README.md

+7-7
Original file line numberDiff line numberDiff line change
@@ -35,13 +35,13 @@ https://kthompson.github.io/glob/
3535

3636
### Common Expressions
3737

38-
| Pattern | Description |
39-
|-----------|--------------------------------------------------------------------------------|
40-
| taco* | matches any string beginning with taco |
41-
| \*taco\* | matches any string containing taco |
42-
| *taco | matches any string ending in taco |
43-
| *.[ch] | matches any string ending in `.c` or `.h` |
44-
| *.{gif,jpg} | match any string ending in `.gif` or `.jpg` |
38+
| Pattern | Description |
39+
|-------------|---------------------------------------------|
40+
| taco* | matches any string beginning with taco |
41+
| \*taco\* | matches any string containing taco |
42+
| *taco | matches any string ending in taco |
43+
| *.[ch] | matches any string ending in `.c` or `.h` |
44+
| *.{gif,jpg} | matches any string ending in `.gif` or `.jpg` |
4545

4646
### Expressions
4747

src/Glob/Glob.cs

+1-5
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,5 @@
11
using System;
2-
using System.Collections.Generic;
3-
using System.IO;
42
using System.Linq;
5-
using System.Text;
6-
using System.Text.RegularExpressions;
73
using GlobExpressions.AST;
84

95
namespace GlobExpressions
@@ -51,7 +47,7 @@ public bool IsMatch(string input)
5147
if (_matchFilenameOnly && _segments!.Length == 1)
5248
{
5349
var last = pathSegments.LastOrDefault();
54-
var tail = (last == null) ? new string[0] : new[] { last };
50+
var tail = last == null ? Array.Empty<string>() : new[] { last };
5551

5652
if (GlobEvaluator.Eval(_segments, 0, tail, 0, _caseSensitive))
5753
return true;

src/Glob/Glob.csproj

+4
Original file line numberDiff line numberDiff line change
@@ -54,4 +54,8 @@
5454
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
5555
</PropertyGroup>
5656

57+
<ItemGroup>
58+
<Folder Include="Text" />
59+
</ItemGroup>
60+
5761
</Project>

src/Glob/Lexer.cs

+305
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,305 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.Linq;
4+
using System.Text;
5+
6+
namespace GlobExpressions;
7+
8+
enum SyntaxKind // kinds of tokens the Lexer below can hand out
9+
{
10+
EndOfInputToken, // returned once the pattern has been fully consumed
11+
12+
CloseParenToken, // presumably `)`; the Lexer in this file never emits it
13+
OpenParenToken, // presumably `(`; the Lexer in this file never emits it
14+
OpenBraceToken, // {
15+
CloseBraceToken, // }
16+
17+
CharacterSet, // [...]
18+
19+
QuestionToken, // ?
20+
StarToken, // *
21+
StarStarToken, // **
22+
SlashToken, // /
23+
CommaToken, // , (only tokenized separately inside a `{...}` literal set)
24+
25+
RootToken, // leading `/` or a drive prefix such as `C:` at the start of the pattern
26+
LiteralToken, // run of ordinary characters with escape sequences already resolved
27+
}
28+
29+
// A lexed token: its kind, the span of the pattern it covers, and a payload.
// As produced by Lexer, Value is string.Empty for punctuation/root/end-of-input tokens,
// the unescaped text for LiteralToken, and a bool ("inverted") for CharacterSet.
record Token(SyntaxKind Kind, TextSpan Span, object Value);
30+
31+
/// <summary>
/// Turns a glob pattern into <see cref="Token"/>s. Each call to <see cref="ParseToken"/>
/// consumes and returns the next token; once the pattern is exhausted every further
/// call returns a <see cref="SyntaxKind.EndOfInputToken"/>.
/// </summary>
class Lexer
{
    // Characters that have a dedicated token handler and therefore terminate a literal run.
    private const string SpecialCharacters = "{}[]?*/,";

    private readonly string _pattern;
    private readonly StringBuilder _literalText = new();
    private int _position;

    // Offset of the `{` that opened the literal set currently being lexed, or -1 when outside of one.
    private int _literalSetPos = -1;

    public Lexer(string pattern)
    {
        _pattern = pattern;
    }

    private bool InLiteralSet => _literalSetPos >= 0;

    private char? Current => Peek(_position);

    private char? Lookahead => Peek(_position + 1);

    // Character at the given offset, or null when the offset is at or past the end of the pattern.
    private char? Peek(int position)
    {
        if (position < _pattern.Length)
        {
            return _pattern[position];
        }

        return null;
    }

    /// <summary>
    /// Lexes the token that starts at the current position and advances past it.
    /// </summary>
    /// <returns>The next token, or an end-of-input token when nothing is left.</returns>
    /// <exception cref="GlobPatternException">The pattern is malformed at the current position.</exception>
    public Token ParseToken()
    {
        if (Current is not char current)
        {
            return ReturnEndOfInput();
        }

        // A root (`/` or `X:`) is only recognised at the very start of the pattern.
        if (_position == 0)
        {
            var root = TryParseRootToken();
            if (root != null)
            {
                return root;
            }
        }

        CheckExtendedGlob(current);

        if (IsIdentCharacter(current, false))
        {
            return ParseIdentToken(false);
        }

        return current switch
        {
            '{' => ReturnOpenBraceToken(),
            '}' => ReturnCloseBraceToken(),
            '[' => ReturnCharacterSetToken(),
            ']' => ReturnCloseBracketToken(),
            '?' => ReturnQuestionToken(),
            '*' => ReturnStarToken(),
            '/' => ReturnSlashToken(),
            ',' => ReturnCommaToken(),
            _ => throw new GlobPatternException($"Unexpected character {current} at index {_position}"),
        };
    }

    // Recognises a leading `/` or a `<letter>:` drive prefix. Returns null, leaving the
    // position untouched, when the pattern starts with neither.
    private Token? TryParseRootToken()
    {
        switch (Current)
        {
            // osx/linux root: one character is consumed but the token carries an empty span
            case '/':
                _position += 1;
                return new Token(SyntaxKind.RootToken, TextSpan.FromBounds(0, 0), string.Empty);

            // windows root
            case char drive when char.IsLetter(drive) && Lookahead == ':':
                _position += 2;
                return new Token(SyntaxKind.RootToken, TextSpan.FromBounds(0, 2), string.Empty);

            default:
                return null;
        }
    }

    // Stub for extended globs: rejects `(` immediately followed by one of ? * + @ !
    private void CheckExtendedGlob(char current)
    {
        if (current == '(' && Lookahead is ('?' or '*' or '+' or '@' or '!'))
        {
            throw new GlobPatternException("Extended glob patterns are not currently supported");
        }
    }

    // Decides whether a character belongs to a literal run.
    private bool IsIdentCharacter(char current, bool inCharacterSet) => current switch
    {
        // commas are separators inside a literal set and plain text everywhere else
        ',' => !InLiteralSet,

        // the single-character wildcard is plain text only inside a character set
        '?' => inCharacterSet,

        _ => SpecialCharacters.IndexOf(current) < 0,
    };

    // Collects consecutive literal characters, resolving backslash escapes, into one LiteralToken.
    private Token ParseIdentToken(bool inCharacterSet)
    {
        var start = _position;
        _literalText.Clear();

        while (Current is char next)
        {
            if (next == '\\')
            {
                _literalText.Append(ParseEscapeSequence(inCharacterSet));
            }
            else if (IsIdentCharacter(next, inCharacterSet))
            {
                _literalText.Append(next);
                _position++;
            }
            else
            {
                break;
            }
        }

        return new Token(SyntaxKind.LiteralToken, TextSpan.FromBounds(start, _position), _literalText.ToString());
    }

    // Consumes a backslash plus the character it escapes and returns that character as a string.
    private string ParseEscapeSequence(bool inCharacterSet)
    {
        _position++; // step over the backslash

        if (Current is char escaped && IsEscapable(escaped, inCharacterSet))
        {
            _position++;
            return escaped.ToString();
        }

        throw new GlobPatternException(
            $"Expected escape sequence at index pattern `{_position}` but found `\\{Current}`");
    }

    // Characters that may follow a backslash; a comma qualifies only inside a character set.
    private static bool IsEscapable(char candidate, bool inCharacterSet) => candidate switch
    {
        '*' or '?' or '{' or '}' or '[' or ']' or '(' or ')' or ' ' => true,
        ',' => inCharacterSet,
        _ => false,
    };

    // Opens a literal set; nesting is rejected.
    private Token ReturnOpenBraceToken()
    {
        if (InLiteralSet)
        {
            throw new GlobPatternException($"Invalid nested literal set at offset {_position}");
        }

        _literalSetPos = _position;
        return Consume(SyntaxKind.OpenBraceToken, 1);
    }

    // Closes the literal set that is currently open; a stray `}` is rejected.
    private Token ReturnCloseBraceToken()
    {
        if (!InLiteralSet)
        {
            throw new GlobPatternException($"Invalid literal set terminator at offset {_position}");
        }

        _literalSetPos = -1;
        return Consume(SyntaxKind.CloseBraceToken, 1);
    }

    // A `]` reached through ParseToken has no matching `[`.
    private Token ReturnCloseBracketToken() =>
        throw new GlobPatternException($"Invalid character set terminator at offset {_position}");

    // Lexes `[...]` / `[!...]`. The token span covers only the characters between the brackets
    // (excluding a leading `!`), and the token value is the boolean "inverted" flag.
    private Token ReturnCharacterSetToken()
    {
        _position++; // step over `[`

        var start = _position;
        var first = Current ?? throw UnterminatedCharacterSet(start);

        var inverted = first == '!';
        if (inverted)
        {
            _position++;
            start++; // the `!` is not part of the set's contents
        }

        first = Current ?? throw UnterminatedCharacterSet(start);

        // a `]` in the first position is a member of the set rather than its terminator
        if (first == ']')
        {
            _position++;
        }

        // scan forward to the closing `]`
        while (true)
        {
            var member = Current ?? throw UnterminatedCharacterSet(start);
            if (member == ']')
            {
                break;
            }

            _position++;
        }

        var token = new Token(SyntaxKind.CharacterSet, TextSpan.FromBounds(start, _position), inverted);

        _position++; // step over `]`

        return token;
    }

    private static GlobPatternException UnterminatedCharacterSet(int start) =>
        new GlobPatternException($"Unterminated character set at offset {start}");

    private Token ReturnQuestionToken() => Consume(SyntaxKind.QuestionToken, 1);

    private Token ReturnStarToken()
    {
        if (Lookahead == '*')
        {
            return Consume(SyntaxKind.StarStarToken, 2);
        }

        return Consume(SyntaxKind.StarToken, 1);
    }

    private Token ReturnCommaToken() => Consume(SyntaxKind.CommaToken, 1);

    private Token ReturnSlashToken() => Consume(SyntaxKind.SlashToken, 1);

    // Zero-width token at the current position; the position is not advanced.
    private Token ReturnEndOfInput()
    {
        var here = TextSpan.FromBounds(_position, _position);
        return new Token(SyntaxKind.EndOfInputToken, here, string.Empty);
    }

    // Emits a fixed-width token of the given kind and advances past it.
    private Token Consume(SyntaxKind kind, int length)
    {
        var span = TextSpan.FromBounds(_position, _position + length);
        _position += length;
        return new Token(kind, span, string.Empty);
    }
}

0 commit comments

Comments
 (0)