-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathRegExpr.pas
4258 lines (3913 loc) · 150 KB
/
RegExpr.pas
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
{$B-}
unit RegExpr;
(*
TRegExpr library
Regular Expressions for Delphi
Author:
Andrey V. Sorokin
St-Petersburg
Russia
anso@mail.ru, anso@usa.net
http://anso.da.ru
http://anso.virtualave.net
This library is derived from Henry Spencer sources.
I translated the C sources into Object Pascal,
implemented object wrapper and some new features.
Many features suggested or partially implemented
by TRegExpr's users (see Gratitude below).
---------------------------------------------------------------
Legal issues
---------------------------------------------------------------
Copyright (c) 1999-00 by Andrey V. Sorokin <anso@mail.ru>
This software is provided as it is, without any kind of warranty
given. Use it at your own risk.
You may use this software in any kind of development, including
comercial, redistribute, and modify it freely, under the
following restrictions :
1. The origin of this software may not be mispresented, you must
not claim that you wrote the original software. If you use
this software in any kind of product, it would be appreciated
that there in a information box, or in the documentation would
be an acknowledgmnent like this
Partial Copyright (c) 2000 by Andrey V. Sorokin
2. You may not have any income from distributing this source
to other developers. When you use this product in a comercial
package, the source may not be charged seperatly.
---------------------------------------------------------------
Legal issues for the original C sources:
---------------------------------------------------------------
* Copyright (c) 1986 by University of Toronto.
* Written by Henry Spencer. Not derived from licensed software.
*
* Permission is granted to anyone to use this software for any
* purpose on any computer system, and to redistribute it freely,
* subject to the following restrictions:
* 1. The author is not responsible for the consequences of use of
* this software, no matter how awful, even if they arise
* from defects in it.
* 2. The origin of this software must not be misrepresented, either
* by explicit claim or by omission.
* 3. Altered versions must be plainly marked as such, and must not
* be misrepresented as being the original software.
---------------------------------------------------------------
Gratitudes
---------------------------------------------------------------
Guido Muehlwitz
found and fixed ugly bug in big string processing
Stephan Klimek
testing in CPPB and suggesting/implementing many features
Steve Mudford
implemented Offset parameter
Martin Baur
useful suggestions, help with translation into German
Yury Finkel
Implemented UniCode support, found and fixed some bugs
Ralf Junker
Implemented some features, many optimization suggestions
Filip Jirsák and Matthew Winter (wintermi@yahoo.com)
Help in Implementation non-greedy mode
Kit Eason
many examples for introduction help section
Juergen Schroth
bug hunting and useful suggestions
Simeon Lilov
help translation into Bulgarian
Martin Ledoux
help translation into French
Diego Calp (mail@diegocalp.com), Argentina
help translation into Spanish
And many others - for big work of bug hunting !
I am still looking for person who can help me to translate
this documentation into other languages (especially German)
---------------------------------------------------------------
To do
---------------------------------------------------------------
-=- VCL-version of TRegExpr - for dummies ;) and TRegExprEdit
(replacement for TMaskEdit).
Actually, I am writing non-VCL applications (with web-based
interfaces), so I don't need VCL's TRegExpr for myself.
Will it be really useful ?
-=- working with pascal-style string.
Now pascal-strings converted into PChar, so
you can't find r.e. in strings with #0 -chars.
(suggested by Pavel O).
-=- put precalculated lengths into EXACTLY[CI] !
-=- fInputString as string (suggested by Ralf Junker)
-=- Add regstart optimization for case-insensitive mode ?
Or completely remove it because FirstCharSet is faster ?
-=- "Russian Ranges" --> National ranges (use property WordChars
for ordering letters in ranges by their order in WordChars if modifier /r is On) ?
-=- FirstCharSet as array [#0 .. #255] of REChar ?
(2x faster then set of REChar)
-=- p-code optimization (remove BRANCH-to-EEND, COMMENT, BACK(?)
merge EXACTLY etc).
-=- !!!!!!!! bug found by Lars Karlslund
"If I do '(something|^$)' on '' I get false (which is wrong ...)."
-=- There are no special command for files (Johan Smit).
I need your suggestions !
What is more important in this list ?
Did I forget anything ?
---------------------------------------------------------------
History
---------------------------------------------------------------
Legend:
(+) added feature
(-) fixed bug
(^) upgraded implementation
v. 0.947 2001.10.03
-=- (+) Word boundary (\b & \B) metachar
-=- (-) Bug in processing predefined char.classes in non-UseSetOfChar mode
-=- (+) Spanish help - translated by Diego Calp (mail@diegocalp.com), Argentina
-=- (+) VersionMajor/Minor class method of TRegExpr ;)
-=- (-) Bug in CompileRegExpr, Thanks to Oleg Orlov <orlov@diasoft.ru>
-=- (^) Method RegExprSubExpressions wasn't compatible with D2-D3.
Thanks to Eugene Tarasov for bug report.
-=- (+) Method Replace can now do substitution as well (see documentation)
Thanks to Warren Bare, Ken Friesen and many others who suggested it.
-=- (+) Updated ReplaceRegExpr to use new Replace method functionality
-=- (^) Restored UniCode compatibility lost in some previous version
Thanks to Stephan Klimek for bug report
-=- (^) Updated TestRE project, new examples for Replace with substitution
included.
v. 0.942+ 2001.03.01
-=- (+) Published French help for TRegExpr,
translated by Martin Ledoux
v. 0.942 2001.02.12
-=- (-) Range-check error in DEMO-project (due to bug in
RegExprSubExpressions), Thanks to Juergen Schroth
-=- (^) RegExprSubExpressions - added error codes for "unclosed "[" error
-=- (^) Help file bug fixing
v. 0.941 2001.02.01
-=- (^) Attension! Behaviour of '\w', '\W' was changed! Now it really
match alphanum characters and '_' as described in documentation,
not only alpha as it was before. Thanks to Vadim Alexandrov.
If You want to restore previous behaviour, reassign
RegExprWordChars (exclude '0123456789' from it).
-=- (+) Full compatible with recommended at unicode.org implementation
of modifier /m, including DOS-styled line separators (\r\n) mixed
with Unix styled (\n) - see properties LineSeparators, LinePairedSeparator
-=- (^) Attension! Behaviour of '.' was changed! Now if modifier /s is off
it doesn't match all chars from LineSeparators and LinePairedSeparator (by
default \r and \n)
-=- (^) Attension! To prevent unneeded recompilation of r.e., now assignment
to Expression or changing modifiers doesn't cause immidiate [re]compilation.
So, now You don't get exception while assigning wrong expression, but can
get exception while calling Exec[Next], Substitute, Dump, etc if there
are errors in Expression or other properties.
-=- (+) Non-greedy style iterators (like '*?'), modifier /g.
Implemented with help from Matthew Winter and Filip Jirsák
-=- (+) /x modifier (eXtended syntax - allow formating r.e., see description
in the help)
-=- (+) Procedure Compile to [re]compile r.e. Usefull for GUI r.e. editors
and so on (to check all properties validity).
-=- (+) FAQ in documentation. I am too lazy to answer to the same
questions again and again :( Please, read the FAQ before sending
question to me!
-=- (^) DEMO project have been significantly improved. Now this is the
real r.e. debugger! Thanks to Jon Smith for his ideas.
-=- (+) function RegExprSubExpressions, usefull for GUI editors of
r.e. (see example of using in TestRExp.dpr project)
-=- (+) HyperLinkDecorator unit - practical example of TRegExpr
using (see description in the help file)
-=- (-) Range checking error in some cases if ComplexBraces defined
Thanks to Juergen Schroth
-=- (^) 'ComplexBraces' now is defined by default
-=- (+) Kit Eason sent to me many examples for 'Syntax' help section
and I decided to complitely rewrite this section. I hope, You'll enjoy
the results ;)
-=- (+) The \A and \Z metacharacters are just like "^" and "$", except
that they won't match multiple times when the modifier /m is used
v. 0.939 2000.10.04
-=- (-) Bug in Substitute method ($10.. didn't work properly)
Thanks to Serge S Klochkovski
v. 0.938 2000.07.23
-=- (^) Exeptions now jump to appropriate source line, not
to Error procedure (I am not quite sure this is safe for
all compiler versions. You can turn it off - remove
reRealExceptionAddr definition below).
-=- (^) Forgotten BSUBEXP[CI] in FillFirstCharSet caused
exeption 'memory corruption' in case if back reference can
be first op, like this: (a)*\1 (first subexpression can be
skipped and we'll start matching with back reference..).
v. 0.937 2000.06.12
-=- (-) Bug in optimization engine (since v.0.934). In some cases
TRegExpr didn't catch right strings.
Thanks to Matthias Fichtner
v. 0.936 2000.04.22
-=- (+) Back references, like <font size=(['"]?)(\d+)\1>, see
manual for details
-=- (+) Wide hex char support, like '\x{263a}'
v. 0.935 2000.04.19 (by Yury Finkel)
-=- (-) fInvertCase now isn't readonly ;)
-=- (-) UniCode mode compiling errors
v. 0.934 2000.04.17
-=- (^) New ranges implementation (range matching now is very fast
- uses one(!) CPU instruction)
-=- (^) Internal p-code structure converted into 32-bits - works
faster and now there is no 64K limit for compiled r.e.
-=- (^) '{m,n}' now use 32-bits arguments (up to 2147483646) - specially
for Dmitry Veprintsev ;)
-=- (^) Ranges now support metachars: [\n-\x0D] -> #10,#11,#12,#13;
Changed '-' processing, now it's like in Perl:
[\d-t] -> '0'..'9','-','t'; []-a] -> ']'..'a'
-=- (-) Bug with \t and etc macro (they worked only in ranges)
Thanks to Yury Finkel
-=- (^) Added new preprocessing optimization (see FirstCharSet).
Incredibly fast (!). But be careful - it isn't properly tested.
You can switch it Off - remove UseFirstCharSet definition.
-=- (^) Many other speed optimizations
-=- (-) Case-insensitive mode now support system-defined national
charset (due to bug in v.0.90 .. 0.926 supported only english one)
-=- (^) Case-insensitive mode implemented with InvertCase (param &
result of REChar type) - works 10 .. 100 times faster.
-=- (^) Match and ExecNext interfaces optimized, added IsProgrammOk
by Ralf Junker
-=- (^) Increased NSUBEXP (now 15) and fixed code for this, now you
can simply increase NSUBEXP constant by yourself.
Suggested by Alexander V. Akimov.
-=- (^+) Substitute adapted for NSUBEXP > 10 and significant (!)
optimized, improved error checking.
ATTENTION! Read new Substitute description - syntax was changed !
-=- (+) SpaceChars & WordChars property - now you may change chars
treated as \s & \w. By default assigned from RegExprSpaceChars/WordChars
-=- (+) Now \s and \w supported in ranges
-=- (-) Infinite loop if end of range=#$FF
Thanks to Andrey Kolegov
-=- (+) Function QuoteRegExprMetaChars (see description)
-=- (+) UniCode support - sorry, works VERY slow (remove '.' from
{.$DEFINE UniCode} after this comment for unicode version).
Implemented by Yury Finkel
v. 0.926 2000.02.26
-=- (-) Old bug derived from H.Spencer sources - SPSTART was
set for '?' and '*' instead of '*', '{m,n}' and '+'.
-=- (-^) Now {m,n} works like Perl's one - error occures only
if m > n or n > BracesMax (BracesMax = 255 in this version).
In other cases (no m or nondigit symbols in m or n values,
or no '}') symbol '{' will be compiled as literal.
Note: so, you must include m value (use {0,n} instead of {,n}).
Note: {m,} will be compiled as {m,BracesMax}.
-=- (-^) CaseInsensitive mode now support ranges
'(?i)[a]' == '[aA]'
-=- (^) Roman-number template in TestRExp ;)
-=- (+^) Beta version of complex-braces - like ((abc){1,2}|d){3}
By default its turned off. If you want take part in beta-testing,
please, remove '.' from {.$DEFINE ComplexBraces} below this comments.
-=- (-^) Removed \b metachar (in Perl it isn't BS as in my implementation,
but word bound)
-=- (+) Add /s modifier. But I am not sure that it's ok for Windows.
I implemented it as [^\n] for '.' metachar in non-/s mode.
But lines are separated by \r\n in Windows. I need your suggestions !
-=- (^) Sorry, but I had to rename Modifiers to ModifierStr
(ModifierS uses for /s now)
v. 0.91 2000.02.02
-=- (^) some changes in documentation and demo-project.
v. 0.90 2000.01.31
-=- (+) implemented braces repetitions {min,max}.
Sorry - only simple cases now - like '\d{2,3}'
or '[a-z1-9]{,7}', but not (abc){2,3} ..
I still too short in time.
Wait for future versions of TRegExpr or
implement it by youself and share with me ;)
-=- (+) implemented case-insensitive modifier and way
to work with other modifiers - see properties
Modifiers, Modifier, ModifierI
and (?ismx-ismx) Perl extension.
You may use global variables RegExpr* for assigning
default modifier values.
-=- (+) property ExtSyntaxEnabled changed to 'r'-modifier
(russian extensions - see documentation)
-=- (+) implemented (?#comment) Perl extension - very hard
and usefull work ;)
-=- (^) property MatchCount renamed to SubExprMatchCount.
Sorry for any inconvenients, but it's because new
version works slightly different and if you used
MatchCount in your programms you have to rethink
it ! (see comments to this property)
-=- (+) add InputString property - stores input string
from last Exec call. You may directly assign values
to this property for using in ExecPos method.
-=- (+) add ExecPos method - for working with assigned
to InputString property. You may use it like this
InputString := AString;
ExecPos;
or this
InputString := AString;
ExecPos (AOffset);
Note: ExecPos without parameter works only in
Delphi 4 or higher.
-=- (+) add ExecNext method - simple and fast (!) way to finding
multiple occurences of r.e. in big input string.
-=- (^) Offset parameter removed from Exec method, if you
used it in your programs, please replace all
Exec (AString, AOffset)
with combination
InputString := AString; ExecPos (AOffset)
Sorry for any inconvenients, but old design
(see v.0.81) was too ugly :(
In addition, multiple Exec calls with same input
string produce fool overhead because each Exec
reallocate input string buffer.
-=- (^) optimized implementation of Substitution,
Replace and Split methods
-=- (-) fixed minor bug - if r.e. compilation raise error
during second pass (!!! I think it's impossible
in really practice), TRegExpr stayed in 'compiled'
state.
-=- (-) fixed bug - Dump method didn't check program existance
and raised 'access violation' if previouse Exec
was finished with error.
-=- (+) changed error handling (see functions Error, ErrorMsg,
LastError, property CompilerErrorPos, type ERegExpr).
-=- (-^) TRegExpr.Replace, Split and ExecNext made a infinite
loop in case of r.e. match empty-string.
Now ExecNext moves by MatchLen if MatchLen <> 0
and by +1 if MatchLen = 0
Thanks to Jon Smith and George Tasker for bugreports.
-=- (-) While playing with null-matchs I discovered, that
null-match at tail of input string is never found.
Well, I fixed this, but I am not sure this is safe
(MatchPos[0]=length(AInputString)+1, MatchLen = 0).
Any suggetions are very appreciated.
-=- (^) Demo project and documentation was upgraded
-=- (^) Documentation and this version was published on my home page
http://anso.da.ru
v. 0.81 1999.12.25 // Merry Christmas ! :)
-=- added \s (AnySpace) and \S (NotSpace) meta-symbols
- implemented by Stephan Klimek with minor fixes by AVS
-=- added \f, \a and \b chars (translates into FF, BEL, BS)
-=- removed meta-symbols 'ц' & 'Ц' - sorry for any inconvenience
-=- added Match property (== copy (InputStr, MatchPos [Idx], MatchLen [Idx]))
-=- added extra parameter Offset to Exec method
(thanks to Steve Mudford)
v. 0.7 1999.08.22
-=- fixed bug - in some cases the r.e. [^...]
incorrectly processed (as any symbol)
(thanks to Jan Korycan)
-=- Some changes and improvements in TestRExp.dpr
v. 0.6 1999.08.13 (Friday 13 !)
-=- changed header of TRegExpr.Substitute
-=- added Split, Replace & appropriate
global wrappers (thanks to Stephan Klimek for suggetions)
v. 0.5 1999.08.12
-=- TRegExpr.Substitute routine added
-=- Some changes and improvements in TestRExp.dpr
-=- Fixed bug in english version of documentation
(Thanks to Jon Buckheit)
v. 0.4 1999.07.20
-=- Fixed bug with parsing of strings longer then 255 bytes
(thanks to Guido Muehlwitz)
-=- Fixed bug in RegMatch - matches only first occurrence of r.e.
(thanks to Stephan Klimek)
v. 0.3 1999.06.13
-=- ExecRegExpr function
v. 0.2 1999.06.10
-=- packed into object-pascal class
-=- code slightly rewriten for pascal
-=- now macro correct proceeded in ranges
-=- r.e. ranges syntax extended for russian letters ranges:
а-я - replaced with all small russian letters (Win1251)
А-Я - replaced with all capital russian letters (Win1251)
а-Я - replaced with all russian letters (Win1251)
-=- added macro '\d' (opcode ANYDIGIT) - match any digit
-=- added macro '\D' (opcode NOTDIGIT) - match not digit
-=- added macro '\w' (opcode ANYLETTER) - match any english letter or '_'
-=- added macro '\W' (opcode NOTLETTER) - match not english letter or '_'
(all r.e.syntax extensions may be turned off by flag ExtSyntax)
v. 0.1 1999.06.09
first version, with bugs, without help => must die :(
*)
interface
// ---- Build-time switches for this unit -------------------------------
// A directive written as {.$NAME ...} (note the '.') is disabled: it is a
// plain comment. Remove the '.' to enable it.
{$DEFINE DebugRegExpr} // define to enable dump/trace support
{$DEFINE reRealExceptionAddr} // if defined then exceptions will
// jump to the appropriate source line, not to the Error procedure
{$DEFINE ComplexBraces} // define for beta-version of braces
// (in stable version it works only for simple cases)
{.$DEFINE UniCode} // define for Unicode support (disabled here)
{$IFNDEF UniCode} // options applicable only for non-UniCode builds
{$DEFINE UseSetOfChar} // Significant optimization by using set of char
{$ENDIF}
{$IFDEF UseSetOfChar}
{$DEFINE UseFirstCharSet} // Significant optimization in some cases
{$ENDIF}
// Determine compiler version (for using 'params by default'):
// D4_ ends up defined only when none of VER80 / VER90 / VER93 / VER100
// is defined, i.e. for Delphi 4.0 or higher.
{$IFNDEF VER80} { Delphi 1.0}
{$IFNDEF VER90} { Delphi 2.0}
{$IFNDEF VER93} { C++Builder 1.0}
{$IFNDEF VER100} { Borland Delphi 3.0}
{$DEFINE D4_} { Delphi 4.0 or higher}
{$ENDIF}
{$ENDIF}
{$ENDIF}
{$ENDIF}
// The two checks below are disabled (leading '.'), so they are plain comments.
{.$IFNDEF VER110} { Borland C++Builder 3.0}
{.$IFNDEF VER120} {Borland Delphi 4.0}
uses
Classes, // TStrings in Split method
SysUtils; // Exception
type
// Character, string and character-pointer aliases used throughout the unit.
// The concrete types are selected at compile time by the UniCode define:
// wide types when it is defined, the compiler's Char/PChar/string otherwise.
{$IFDEF UniCode}
PRegExprChar = PWideChar;
RegExprString = WideString;
REChar = WideChar;
{$ELSE}
PRegExprChar = PChar;
RegExprString = string;
REChar = Char;
{$ENDIF}
// Building blocks of the compiled r.e. ("p-code"). Each one is declared
// together with a matching pointer type.
TREOp = REChar; // internal p-code type (one opcode is stored as one REChar) //###0.933
PREOp = ^TREOp;
TRENextOff = integer; // internal Next "pointer" (offset to current p-code) //###0.933
PRENextOff = ^TRENextOff; // used for extracting Next "pointers" from compiled r.e. //###0.933
TREBracesArg = integer; // type of {m,n} arguments
PREBracesArg = ^TREBracesArg;
const
// Sizes of the p-code elements, expressed in REChar units (not bytes):
// each is SizeOf(element type) divided by SizeOf(REChar).
REOpSz = SizeOf (TREOp) div SizeOf (REChar); // size of p-code in RegExprString units (1, since TREOp = REChar)
RENextOffSz = SizeOf (TRENextOff) div SizeOf (REChar); // size of Next 'pointer' -"-
REBracesArgSz = SizeOf (TREBracesArg) div SizeOf (REChar); // size of BRACES arguments -"-
type
// Method pointer ("of object") that takes one character and returns one
// character. TRegExpr keeps a field of this type (fInvertCase).
// NOTE(review): judging by the name and the History notes it is expected to
// return Ch with its letter case inverted for case-insensitive matching -
// confirm against the implementation section.
TRegExprInvertCaseFunction = function (const Ch : REChar) : REChar
of object;
const
// Unit-wide defaults. They are declared as typed constants.
// NOTE(review): the History notes call these "global variables RegExpr*" for
// assigning default modifier values, so they are presumably meant to be
// changed at run time - confirm how TRegExpr.Create picks them up.
RegExprModifierI : boolean = False; // default value for ModifierI (/i - case-insensitive)
RegExprModifierR : boolean = True; // default value for ModifierR (/r - russian range extensions)
RegExprModifierS : boolean = True; // default value for ModifierS (/s - '.' works as any char)
RegExprModifierG : boolean = True; // default value for ModifierG (/g - see non-greedy iterators in History)
RegExprModifierM : boolean = False; // default value for ModifierM (/m - see LineSeparators in History)
RegExprModifierX : boolean = False; // default value for ModifierX (/x - eXtended syntax)
RegExprSpaceChars : RegExprString = // default value for SpaceChars
' '#$9#$A#$D#$C; // space, #$9 (TAB), #$A (LF), #$D (CR), #$C (FF)
RegExprWordChars : RegExprString = // default value for WordChars
'0123456789' //###0.940
+ 'abcdefghijklmnopqrstuvwxyz'
+ 'ABCDEFGHIJKLMNOPQRSTUVWXYZ_';
RegExprLineSeparators : RegExprString =// default value for LineSeparators
// CR and LF; UniCode builds additionally append #$b #$c #$2028 #$2029 #$85
#$d#$a{$IFDEF UniCode}+#$b#$c#$2028#$2029#$85{$ENDIF}; //###0.947
RegExprLinePairedSeparator : RegExprString =// default value for LinePairedSeparator
#$d#$a; // the two-character sequence CR LF
{ If you need Unix-styled line separators (only \n), then use:
RegExprLineSeparators = #$a;
RegExprLinePairedSeparator = '';
}
const
// Compile-time limits of the engine.
NSUBEXP = 15; // max number of subexpressions (sizes the startp/endp arrays of TRegExpr) //###0.929
// Cannot be more than NSUBEXPMAX
// Be careful - don't use values which overflow the CLOSE opcode
// (in this case you'll get a compiler error).
// A big NSUBEXP makes matching slower and requires more stack
NSUBEXPMAX = 255; // Max possible value for NSUBEXP. //###0.945
// Don't change it! It's defined by internal TRegExpr design.
MaxBracesArg = $7FFFFFFF - 1; // max value for {n,m} arguments (2147483646) //###0.933
{$IFDEF ComplexBraces}
LoopStackMax = 10; // max depth of loops stack (sizes TRegExpr.LoopStack) //###0.925
{$ENDIF}
TinySetLen = 3;
// if a range includes more than TinySetLen chars, //###0.934
// then use full (32 bytes) ANYOFFULL instead of ANYOF[BUT]TINYSET
// !!! Attention ! If you change TinySetLen, you must
// change code marked as "//!!!TinySet"
type
// Set-of-character type and its pointer type. Declared only when
// UseSetOfChar is defined, which the directives above do only for
// non-UniCode builds.
{$IFDEF UseSetOfChar}
PSetOfREChar = ^TSetOfREChar;
TSetOfREChar = set of REChar;
{$ENDIF}
TRegExpr = class
private
startp : array [0 .. NSUBEXP - 1] of PRegExprChar; // founded expr starting points
endp : array [0 .. NSUBEXP - 1] of PRegExprChar; // founded expr end points
{$IFDEF ComplexBraces}
LoopStack : array [1 .. LoopStackMax] of integer; // state before entering loop
LoopStackIdx : integer; // 0 - out of all loops
{$ENDIF}
// The "internal use only" fields to pass info from compile
// to execute that permits the execute phase to run lots faster on
// simple cases.
regstart : REChar; // char that must begin a match; '\0' if none obvious
reganch : REChar; // is the match anchored (at beginning-of-line only)?
regmust : PRegExprChar; // string (pointer into program) that match must include, or nil
regmlen : integer; // length of regmust string
// Regstart and reganch permit very fast decisions on suitable starting points
// for a match, cutting down the work a lot. Regmust permits fast rejection
// of lines that cannot possibly match. The regmust tests are costly enough
// that regcomp() supplies a regmust only if the r.e. contains something
// potentially expensive (at present, the only such thing detected is * or +
// at the start of the r.e., which can involve a lot of backup). Regmlen is
// supplied because the test in regexec() needs it and regcomp() is computing
// it anyway.
{$IFDEF UseFirstCharSet} //###0.929
FirstCharSet : TSetOfREChar;
{$ENDIF}
// work variables for Exec's routins - save stack in recursion}
reginput : PRegExprChar; // String-input pointer.
fInputStart : PRegExprChar; // Pointer to first char of input string.
fInputEnd : PRegExprChar; // Pointer to char AFTER last char of input string
// work variables for compiler's routines
regparse : PRegExprChar; // Input-scan pointer.
regnpar : integer; // count.
regdummy : char;
regcode : PRegExprChar; // Code-emit pointer; @regdummy = don't.
regsize : integer; // Code size.
regexpbeg : PRegExprChar; // only for error handling. Contains
// pointer to beginning of r.e. while compiling
fExprIsCompiled : boolean; // true if r.e. successfully compiled
// programm is essentially a linear encoding
// of a nondeterministic finite-state machine (aka syntax charts or
// "railroad normal form" in parsing technology). Each node is an opcode
// plus a "next" pointer, possibly plus an operand. "Next" pointers of
// all nodes except BRANCH implement concatenation; a "next" pointer with
// a BRANCH on both ends of it is connecting two alternatives. (Here we
// have one of the subtle syntax dependencies: an individual BRANCH (as
// opposed to a collection of them) is never concatenated with anything
// because of operator precedence.) The operand of some types of node is
// a literal string; for others, it is a node leading into a sub-FSM. In
// particular, the operand of a BRANCH node is the first node of the branch.
// (NB this is *not* a tree structure: the tail of the branch connects
// to the thing following the set of BRANCHes.) The opcodes are:
programm : PRegExprChar; // Unwarranted chumminess with compiler.
fExpression : PRegExprChar; // source of compiled r.e.
fInputString : PRegExprChar; // input string
fLastError : integer; // see Error, LastError
fModifiers : integer; // modifiers
fCompModifiers : integer; // compiler's copy of modifiers
fProgModifiers : integer; // modifiers values from last programm compilation
fSpaceChars : RegExprString; //###0.927
fWordChars : RegExprString; //###0.929
fInvertCase : TRegExprInvertCaseFunction; //###0.927
fLineSeparators : RegExprString; //###0.941
fLinePairedSeparatorAssigned : boolean;
fLinePairedSeparatorHead,
fLinePairedSeparatorTail : REChar;
{$IFNDEF UniCode}
fLineSeparatorsSet : set of REChar;
{$ENDIF}
procedure InvalidateProgramm;
// Mark programm as have to be [re]compiled
function IsProgrammOk : boolean; //###0.941
// Check if we can use precompiled r.e. or
// [re]compile it if something changed
function GetExpression : RegExprString;
procedure SetExpression (const s : RegExprString);
function GetModifierStr : RegExprString;
class function ParseModifiersStr (const AModifiers : RegExprString;
var AModifiersInt : integer) : boolean; //###0.941 class function now
// Parse AModifiers string and return true and set AModifiersInt
// if it's in format 'ismxrg-ismxrg'.
procedure SetModifierStr (const AModifiers : RegExprString);
function GetModifier (AIndex : integer) : boolean;
procedure SetModifier (AIndex : integer; ASet : boolean);
procedure Error (AErrorID : integer); virtual; // error handler.
// Default handler raise exception ERegExpr with
// Message = ErrorMsg (AErrorID), ErrorCode = AErrorID
// and CompilerErrorPos = value of property CompilerErrorPos.
{==================== Compiler section ===================}
function CompileRegExpr (exp : PRegExprChar) : boolean;
// compile a regular expression into internal code
procedure Tail (p : PRegExprChar; val : PRegExprChar);
// set the next-pointer at the end of a node chain
procedure OpTail (p : PRegExprChar; val : PRegExprChar);
// regoptail - regtail on operand of first argument; nop if operandless
function EmitNode (op : TREOp) : PRegExprChar;
// regnode - emit a node, return location
procedure EmitC (b : REChar);
// emit (if appropriate) a byte of code
procedure InsertOperator (op : TREOp; opnd : PRegExprChar; sz : integer); //###0.90
// insert an operator in front of already-emitted operand
// Means relocating the operand.
function ParseReg (paren : integer; var flagp : integer) : PRegExprChar;
// regular expression, i.e. main body or parenthesized thing
function ParseBranch (var flagp : integer) : PRegExprChar;
// one alternative of an | operator
function ParsePiece (var flagp : integer) : PRegExprChar;
// something followed by possible [*+?]
function ParseAtom (var flagp : integer) : PRegExprChar;
// the lowest level
function GetCompilerErrorPos : integer;
// current pos in r.e. - for error hanling
{$IFDEF UseFirstCharSet} //###0.929
procedure FillFirstCharSet (prog : PRegExprChar);
{$ENDIF}
{===================== Mathing section ===================}
function regrepeat (p : PRegExprChar; AMax : integer) : integer;
// repeatedly match something simple, report how many
function regnext (p : PRegExprChar) : PRegExprChar;
// dig the "next" pointer out of a node
function MatchPrim (prog : PRegExprChar) : boolean;
// recursively matching routine
function RegMatch (str : PRegExprChar) : boolean;
// try match at specific point, uses MatchPrim for real work
function ExecPrim (AOffset: integer) : boolean;
// Exec for stored InputString
{$IFDEF DebugRegExpr}
function DumpOp (op : REChar) : RegExprString;
{$ENDIF}
function GetSubExprMatchCount : integer;
function GetMatchPos (Idx : integer) : integer;
function GetMatchLen (Idx : integer) : integer;
function GetMatch (Idx : integer) : RegExprString;
function GetInputString : RegExprString;
procedure SetInputString (const AInputString : RegExprString);
{$IFNDEF UseSetOfChar}
function StrScanCI (s : PRegExprChar; ch : REChar) : PRegExprChar; //###0.928
{$ENDIF}
procedure SetLineSeparators (const AStr : RegExprString);
procedure SetLinePairedSeparator (const AStr : RegExprString);
function GetLinePairedSeparator : RegExprString;
public
constructor Create;
destructor Destroy; override;
class function VersionMajor : integer; //###0.944
class function VersionMinor : integer; //###0.944
property Expression : RegExprString read GetExpression write SetExpression;
// Regular expression.
// For optimization, TRegExpr automatically compiles it into 'P-code'
// (you can see it with the help of the Dump method) and stores it in internal
// structures. Real [re]compilation occurs only when it is really needed -
// while calling Exec[Next], Substitute, Dump, etc.,
// and only if Expression or another P-code-affecting property was changed
// after the last [re]compilation.
// If any error occurs during [re]compilation, the Error method is called
// (by default Error raises an exception - see below)
property ModifierStr : RegExprString read GetModifierStr write SetModifierStr;
// Set/get default values of r.e. syntax modifiers. Modifiers in
// r.e. (?ismx-ismx) will replace these default values.
// If you try to set an unsupported modifier, Error will be called
// (by default Error raises exception ERegExpr).
property ModifierI : boolean index 1 read GetModifier write SetModifier;
// Modifier /i - case-insensitive, initialized from RegExprModifierI
property ModifierR : boolean index 2 read GetModifier write SetModifier;
// Modifier /r - use r.e. syntax extended for Russian
// (was property ExtSyntaxEnabled in previous versions).
// If true, then а-я additionally includes the Russian letter 'ё',
// А-Я additionally includes 'Ё', and а-Я includes all Russian symbols.
// You have to turn it off if it may interfere with your national alphabet.
// Initialized from RegExprModifierR
property ModifierS : boolean index 3 read GetModifier write SetModifier;
// Modifier /s - '.' works as any char (else as [^\n]),
// , initialized from RegExprModifierS
property ModifierG : boolean index 4 read GetModifier write SetModifier;
// Switching off modifier /g switches all operators to
// non-greedy style, so if ModifierG = False, then
// all '*' work as '*?', all '+' as '+?' and so on.
// Initialized from RegExprModifierG
property ModifierM : boolean index 5 read GetModifier write SetModifier;
// Treat string as multiple lines. That is, change `^' and `$' from
// matching at only the very start or end of the string to the start
// or end of any line anywhere within the string.
// , initialized from RegExprModifierM
property ModifierX : boolean index 6 read GetModifier write SetModifier;
// Modifier /x - eXtended syntax, allow r.e. text formatting,
// see description in the help. Initialized from RegExprModifierX
function Exec (const AInputString : RegExprString) : boolean;
// match a program against a string AInputString
// !!! Exec stores AInputString into the InputString property
function ExecNext : boolean;
// find next match:
// Exec (AString); ExecNext;
// works same as
// Exec (AString);
// if MatchLen [0] = 0 then ExecPos (MatchPos [0] + 1)
// else ExecPos (MatchPos [0] + MatchLen [0]);
// but it's simpler!
function ExecPos (AOffset: integer {$IFDEF D4_}= 1{$ENDIF}) : boolean;
// find match for InputString starting from AOffset position
// (AOffset=1 - first char of InputString)
property InputString : RegExprString read GetInputString write SetInputString;
// returns current input string (from last Exec call or last assign
// to this property).
// Any assignment to this property clears the Match* properties !
function Substitute (const ATemplate : RegExprString) : RegExprString;
// Returns ATemplate with '$&' or '$0' replaced by the whole r.e.
// occurrence and '$n' replaced by the occurrence of subexpression #n.
// Since v.0.929 '$' is used instead of '\' (for future extensions
// and for more Perl-compatibility) and accepts more than one digit.
// If you want to place a raw '$' or '\' into the template, use prefix '\'
// Example: '1\$ is $2\\rub\\' -> '1$ is <Match[2]>\rub\'
// If you want to place a raw digit after '$n' you must delimit
// n with curly braces '{}'.
// Example: 'a$12bc' -> 'a<Match[12]>bc'
// 'a${1}2bc' -> 'a<Match[1]>2bc'.
procedure Split (AInputStr : RegExprString; APieces : TStrings);
// Split AInputStr into APieces by r.e. occurrences
// Internally calls Exec[Next]
function Replace (AInputStr : RegExprString;
const AReplaceStr : RegExprString;
AUseSubstitution : boolean{$IFDEF D4_}= False{$ENDIF}) //###0.946
: RegExprString;
// Returns AInputStr with r.e. occurrences replaced by AReplaceStr
// If AUseSubstitution is true, then AReplaceStr will be used
// as template for Substitution methods.
// For example:
// Expression := '({-i}block|var)\s*\(\s*([^ ]*)\s*\)\s*';
// Replace ('BLOCK( test1)', 'def "$1" value "$2"', True);
// will return: def "BLOCK" value "test1"
// Replace ('BLOCK( test1)', 'def "$1" value "$2"')
// will return: def "$1" value "$2"
// Internally calls Exec[Next]
property SubExprMatchCount : integer read GetSubExprMatchCount;
// Number of subexpressions that have been found in the last Exec* call.
// If there are no subexpr. but the whole expr. was found (Exec* returned True),
// then SubExprMatchCount=0; if neither subexpressions nor the whole
// r.e. were found (Exec* returned False) then SubExprMatchCount=-1.
// Note that some subexpr. may be not found, and for such
// subexpr. MatchPos=MatchLen=-1 and Match=''.
// For example: Expression := '(1)?2(3)?';
// Exec ('123'): SubExprMatchCount=2, Match[0]='123', [1]='1', [2]='3'
// Exec ('12'): SubExprMatchCount=1, Match[0]='12', [1]='1'
// Exec ('23'): SubExprMatchCount=2, Match[0]='23', [1]='', [2]='3'
// Exec ('2'): SubExprMatchCount=0, Match[0]='2'
// Exec ('7') - return False: SubExprMatchCount=-1
property MatchPos [Idx : integer] : integer read GetMatchPos;
// pos of the occurrence of subexpr. #Idx in the string tested in the last
// Exec* call. The first subexpr. has Idx=1, the last - SubExprMatchCount,
// the whole r.e. has Idx=0.
// Returns -1 if there is no such subexpr. in the r.e. or this subexpr.
// was not found in the input string.
property MatchLen [Idx : integer] : integer read GetMatchLen;
// len of the occurrence of subexpr. #Idx in the string tested in the last
// Exec* call. The first subexpr. has Idx=1, the last - SubExprMatchCount,
// the whole r.e. has Idx=0.
// Returns -1 if there is no such subexpr. in the r.e. or this subexpr.
// was not found in the input string.
// Remember - MatchLen may be 0 (if the r.e. matches an empty string) !
property Match [Idx : integer] : RegExprString read GetMatch;
// == copy (InputString, MatchPos [Idx], MatchLen [Idx])
// Returns '' if there is no such subexpr. in the r.e. or this subexpr.
// was not found in the input string.
function LastError : integer;
// Returns the ID of the last error, 0 if no errors (unusable if
// the Error method raises an exception), and resets the internal status
// to 0 (no errors).
function ErrorMsg (AErrorID : integer) : RegExprString; virtual;
// Returns Error message for error with ID = AErrorID.
property CompilerErrorPos : integer read GetCompilerErrorPos;
// Returns the position in the r.e. where the compiler stopped.
// Useful for error diagnostics
property SpaceChars : RegExprString read fSpaceChars write fSpaceChars; //###0.927
// Contains chars treated as \s (initially filled with the RegExprSpaceChars
// global constant)
property WordChars : RegExprString read fWordChars write fWordChars; //###0.929
// Contains chars treated as \w (initially filled with the RegExprWordChars
// global constant)
property LineSeparators : RegExprString read fLineSeparators write SetLineSeparators; //###0.941
// line separators (like \n in Unix)
property LinePairedSeparator : RegExprString read GetLinePairedSeparator write SetLinePairedSeparator; //###0.941
// paired line separator (like \r\n in DOS and Windows).
// must contain exactly two chars or no chars at all
class function InvertCaseFunction (const Ch : REChar) : REChar;
// Converts Ch to upper case if it is in lower case, or to lower case
// if it is in upper case (uses the current system locale settings)
property InvertCase : TRegExprInvertCaseFunction read fInvertCase write fInvertCase; //##0.935
// Set this property if you want to override case-insensitive functionality.
// Create sets it to RegExprInvertCaseFunction (InvertCaseFunction by default)
procedure Compile; //###0.941
// [Re]compile r.e. Useful, for example, for GUI r.e. editors (to check
// the validity of all properties).
{$IFDEF DebugRegExpr}
function Dump : RegExprString;
// dump a compiled regexp in vaguely comprehensible form
{$ENDIF}
end;
// Exception class for regular-expression errors. The comments on
// TRegExpr.ModifierStr state that Error raises ERegExpr by default.
ERegExpr = class (Exception)
 public
  // NOTE(review): presumably ErrorCode carries the error ID (cf.
  // TRegExpr.LastError / ErrorMsg) and CompilerErrorPos the position in
  // the r.e. where compilation stopped (cf. TRegExpr.CompilerErrorPos) -
  // confirm against the code that raises this exception.
  ErrorCode, CompilerErrorPos : integer;
 end;
const
RegExprInvertCaseFunction : TRegExprInvertCaseFunction = TRegExpr.InvertCaseFunction;
// default for the InvertCase property of TRegExpr
function ExecRegExpr (const ARegExpr, AInputStr : RegExprString) : boolean;
// True if string AInputStr matches regular expression ARegExpr
// ! will raise an exception if there are syntax errors in ARegExpr
procedure SplitRegExpr (const ARegExpr, AInputStr : RegExprString; APieces : TStrings);
// Split AInputStr into APieces by occurrences of r.e. ARegExpr
function ReplaceRegExpr (const ARegExpr, AInputStr, AReplaceStr : RegExprString;
AUseSubstitution : boolean{$IFDEF D4_}= False{$ENDIF}) : RegExprString; //###0.947
// Returns AInputStr with occurrences of r.e. ARegExpr replaced by AReplaceStr.
// If AUseSubstitution is true, then AReplaceStr will be used
// as a template for Substitution methods.
// For example:
// ReplaceRegExpr ('({-i}block|var)\s*\(\s*([^ ]*)\s*\)\s*',
// 'BLOCK( test1)', 'def "$1" value "$2"', True)
// will return: def "BLOCK" value "test1"
// ReplaceRegExpr ('({-i}block|var)\s*\(\s*([^ ]*)\s*\)\s*',
// 'BLOCK( test1)', 'def "$1" value "$2"')
// will return: def "$1" value "$2"
function QuoteRegExprMetaChars (const AStr : RegExprString) : RegExprString;
// Replace all metachars with their safe representation,
// for example 'abc$cd.(' converts into 'abc\$cd\.\('
// This function is useful for r.e. autogeneration from
// user input
function RegExprSubExpressions (const ARegExpr : string;
ASubExprs : TStrings; AExtendedSyntax : boolean{$IFDEF D4_}= False{$ENDIF}) : integer;
// Makes a list of the subexpressions found in r.e. ARegExpr.
// In ASubExprs every item represents a subexpression,
// from first to last, in the format:
// String - subexpression text (without '()')
// low word of Object - starting position in ARegExpr, including '('
// if it exists! (first position is 1)
// high word of Object - length, including starting '(' and ending ')'
// if they exist!
// AExtendedSyntax - must be True if modifier /x (extended syntax) will be
// On while using the r.e.
// NOTE(review): this comment previously said /m; /x is the modifier
// described as "eXtended syntax" on TRegExpr.ModifierX - confirm
// against the implementation.
// Useful for GUI editors of r.e. etc. (you can find an example of its use
// in the TestRExp.dpr project)
// Returns
// 0 Success. No unbalanced brackets were found;
// -1 There are not enough closing brackets ')';
// -(n+1) At position n an opening '[' was found without //###0.942
// a corresponding closing ']';
// n At position n a closing bracket ')' was found without
// a corresponding opening '('.
// If Result <> 0, then ASubExprs can contain empty or illegal items
implementation
uses
Windows; // CharUpper/Lower
const
 // Version numbers. Don't use these consts directly,
 // use TRegExpr.VersionXXX instead.
 TRegExprVersionMajor : integer = 0;
 TRegExprVersionMinor : integer = 947;

 // Bit masks of the r.e. modifiers in fModifiers, one bit per modifier.
 MaskModI = 1 shl 0; // modifier /i
 MaskModR = 1 shl 1; // modifier /r
 MaskModS = 1 shl 2; // modifier /s
 MaskModG = 1 shl 3; // modifier /g
 MaskModM = 1 shl 4; // modifier /m
 MaskModX = 1 shl 5; // modifier /x

 // Space, tab, CR and LF; declared as a string when UniCode is defined
 // and as a set of char otherwise.
 // NOTE(review): judging by the name, these are the chars skipped in
 // /x (extended syntax) mode - confirm against the parser.
 {$IFDEF UniCode}
 XIgnoredChars = ' '#9#$d#$a;
 {$ELSE}
 XIgnoredChars = [' ', #9, #$d, #$a];
 {$ENDIF}
{=============================================================}
{=================== WideString functions ====================}
{=============================================================}