diff --git a/embed.fnc b/embed.fnc index 45c6fd2b2648..fc09075bdaa1 100644 --- a/embed.fnc +++ b/embed.fnc @@ -2029,9 +2029,12 @@ ERS |SV* |make_exactf_invlist |NN RExC_state_t *pRExC_state \ ES |regnode_offset|reg |NN RExC_state_t *pRExC_state \ |I32 paren|NN I32 *flagp|U32 depth ES |regnode_offset|regnode_guts|NN RExC_state_t *pRExC_state \ - |const U8 op \ - |const STRLEN extra_len \ - |NN const char* const name + |const STRLEN extra_len +#ifdef DEBUGGING +ES |regnode_offset|regnode_guts_debug|NN RExC_state_t *pRExC_state \ + |const U8 op \ + |const STRLEN extra_len +#endif ES |void |change_engine_size|NN RExC_state_t *pRExC_state|const Ptrdiff_t size ES |regnode_offset|reganode|NN RExC_state_t *pRExC_state|U8 op \ |U32 arg @@ -2108,13 +2111,12 @@ ES |void|add_above_Latin1_folds|NN RExC_state_t *pRExC_state|const U8 cp \ |NN SV** invlist ES |regnode_offset|handle_named_backref|NN RExC_state_t *pRExC_state \ |NN I32 *flagp \ - |NN char * parse_start \ + |NN char * backref_parse_start \ |char ch ESTR |unsigned int|regex_set_precedence|const U8 my_operator ES |regnode_offset|handle_regex_sets|NN RExC_state_t *pRExC_state \ |NULLOK SV ** return_invlist \ - |NN I32 *flagp|U32 depth \ - |NN char * const oregcomp_parse + |NN I32 *flagp|U32 depth ES |void |set_regex_pv |NN RExC_state_t *pRExC_state|NN REGEXP *Rx # if defined(DEBUGGING) && defined(ENABLE_REGEX_SETS_DEBUGGING) ES |void |dump_regex_sets_structures \ diff --git a/embed.h b/embed.h index c08806128c94..691228bbe646 100644 --- a/embed.h +++ b/embed.h @@ -1030,6 +1030,7 @@ #endif #define regdump_extflags(a,b) S_regdump_extflags(aTHX_ a,b) #define regdump_intflags(a,b) S_regdump_intflags(aTHX_ a,b) +#define regnode_guts_debug(a,b,c) S_regnode_guts_debug(aTHX_ a,b,c) #define regtail_study(a,b,c,d) S_regtail_study(aTHX_ a,b,c,d) # endif # if defined(PERL_IN_REGEXEC_C) @@ -1077,7 +1078,7 @@ #define handle_named_backref(a,b,c,d) S_handle_named_backref(aTHX_ a,b,c,d) #define handle_names_wildcard(a,b,c,d) 
S_handle_names_wildcard(aTHX_ a,b,c,d) #define handle_possible_posix(a,b,c,d,e) S_handle_possible_posix(aTHX_ a,b,c,d,e) -#define handle_regex_sets(a,b,c,d,e) S_handle_regex_sets(aTHX_ a,b,c,d,e) +#define handle_regex_sets(a,b,c,d) S_handle_regex_sets(aTHX_ a,b,c,d) #define handle_user_defined_property(a,b,c,d,e,f,g,h,i,j) S_handle_user_defined_property(aTHX_ a,b,c,d,e,f,g,h,i,j) #define invlist_contents(a,b) S_invlist_contents(aTHX_ a,b) #define invlist_is_iterating S_invlist_is_iterating @@ -1104,7 +1105,7 @@ #define regclass(a,b,c,d,e,f,g,h,i) S_regclass(aTHX_ a,b,c,d,e,f,g,h,i) #define regex_set_precedence S_regex_set_precedence #define reginsert(a,b,c,d) S_reginsert(aTHX_ a,b,c,d) -#define regnode_guts(a,b,c,d) S_regnode_guts(aTHX_ a,b,c,d) +#define regnode_guts(a,b) S_regnode_guts(aTHX_ a,b) #define regpiece(a,b,c) S_regpiece(aTHX_ a,b,c) #define regpnode(a,b,c) S_regpnode(aTHX_ a,b,c) #define regtail(a,b,c,d) S_regtail(aTHX_ a,b,c,d) diff --git a/ext/re/re.pm b/ext/re/re.pm index d1db4625c006..791d680d5771 100644 --- a/ext/re/re.pm +++ b/ext/re/re.pm @@ -4,7 +4,7 @@ package re; use strict; use warnings; -our $VERSION = "0.41"; +our $VERSION = "0.42"; our @ISA = qw(Exporter); our @EXPORT_OK = qw{ is_regexp regexp_pattern @@ -71,8 +71,6 @@ my %flags = ( EXTRA => 0x3FF0000, TRIEM => 0x0010000, - OFFSETS => 0x0020000, - OFFSETSDBG => 0x0040000, STATE => 0x0080000, OPTIMISEM => 0x0100000, STACK => 0x0280000, @@ -81,9 +79,7 @@ my %flags = ( DUMP_PRE_OPTIMIZE => 0x1000000, WILDCARD => 0x2000000, ); -$flags{ALL} = -1 & ~($flags{OFFSETS} - |$flags{OFFSETSDBG} - |$flags{BUFFERS} +$flags{ALL} = -1 & ~($flags{BUFFERS} |$flags{DUMP_PRE_OPTIMIZE} |$flags{WILDCARD} ); @@ -626,26 +622,6 @@ Enable debugging of the \G modifier. Enable enhanced optimisation debugging and start-point optimisations. Probably not useful except when debugging the regexp engine itself. -=item OFFSETS - -Dump offset information. This can be used to see how regops correlate -to the pattern. 
Output format is - - NODENUM:POSITION[LENGTH] - -Where 1 is the position of the first char in the string. Note that position -can be 0, or larger than the actual length of the pattern, likewise length -can be zero. - -=item OFFSETSDBG - -Enable debugging of offsets information. This emits copious -amounts of trace information and doesn't mesh well with other -debug options. - -Almost definitely only useful to people hacking -on the offsets part of the debug engine. - =item DUMP_PRE_OPTIMIZE Enable the dumping of the compiled pattern before the optimization phase. @@ -687,8 +663,7 @@ These are useful shortcuts to save on the typing. =item ALL -Enable all options at once except OFFSETS, OFFSETSDBG, BUFFERS, WILDCARD, and -DUMP_PRE_OPTIMIZE. +Enable all options at once except BUFFERS, WILDCARD, and DUMP_PRE_OPTIMIZE. (To get every single option without exception, use both ALL and EXTRA, or starting in 5.30 on a C<-DDEBUGGING>-enabled perl interpreter, use the B<-Drv> command-line switches.) diff --git a/ext/re/t/regop.pl b/ext/re/t/regop.pl index 86976ee0da38..c725b73a9e6a 100644 --- a/ext/re/t/regop.pl +++ b/ext/re/t/regop.pl @@ -1,4 +1,4 @@ -use re Debug=>qw(DUMP EXECUTE OFFSETS TRIEC TEST); +use re Debug=>qw(DUMP EXECUTE TRIEC TEST); my @tests=( XY => 'X(A|[B]Q||C|D)Y' , foobar => '[f][o][o][b][a][r]', diff --git a/ext/re/t/regop.t b/ext/re/t/regop.t index cf35d71fb090..20e9586c3329 100644 --- a/ext/re/t/regop.t +++ b/ext/re/t/regop.t @@ -140,7 +140,6 @@ Freeing REx: "[f][o][o][b][a][r]" minlen 3 --- # Compiling REx "(?:ABCP|ABCG|ABCE|ABCB|ABCA|ABCD)" -# Got 164 bytes for offset annotations. 
# TRIE(NATIVE): W:6 C:24 Uq:7 Min:4 Max:4 # Char : Match Base Ofs A B C P G E D # State|--------------------------------------------------- @@ -166,8 +165,6 @@ minlen 3 # # 20: END (0) # anchored "ABC" at 0 (checking anchored) minlen 4 -# Offsets: [20] -# 1:4[3] 3:4[15] 19:32[0] 20:34[0] # Guessing start of match in sv for REx "(?:ABCP|ABCG|ABCE|ABCB|ABCA|ABCD)" against "ABCD" # Found anchored substr "ABC" at offset 0... # Guessed: match at offset 0 @@ -210,8 +207,6 @@ anchored "ABC" at 0 # 47: EOL(48) # 48: END(0) #floating ""$ at 3..4 (checking floating) stclass "EXACTF <.>" minlen 3 -#Offsets: [48] -# 1:1[1] 3:2[1] 5:2[81] 45:83[1] 47:84[1] 48:85[0] #Guessing start of match, REx "(\.COM|\.EXE|\.BAT|\.CMD|\.VBS|\.VBE|\.JS|\.JSE|\.WSF|\.WSH|..." against "D:dev/perl/ver/28321_/perl.exe"... #Found floating substr ""$ at offset 30... #Starting position does not contradict /^/m... @@ -233,7 +228,6 @@ anchored "ABC" at 0 #Freeing REx: "(\\.COM|\\.EXE|\\.BAT|\\.CMD|\\.VBS|\\.VBE|\\.JS|\\.JSE|\\."...... %MATCHED% floating ""$ at 3..4 (checking floating) -#1:1[1] 3:2[1] 5:2[64] 45:83[1] 47:84[1] 48:85[0] #stclass EXACTF <.> minlen 3 #Found floating substr ""$ at offset 30... #Does not contradict STCLASS... @@ -241,22 +235,16 @@ floating ""$ at 3..4 (checking floating) #Matching stclass EXACTF <.> against ".exe" --- #Compiling REx "[q]" -#size 3 nodes Got 7 bytes for offset annotations. #first at 1 #Final program: # 1: EXACT (3) # 3: END(0) #anchored "q" at 0 (checking anchored isall) minlen 1 -#Offsets: [3] -# 1:1[3] 3:4[0] #Guessing start of match, REx "[q]" against "q"... #Found anchored substr "q" at offset 0... #Guessed: match at offset 0 #%MATCHED% #Freeing REx: "[q]" -Got 7 bytes for offset annotations. -Offsets: [3] -1:1[3] 3:4[0] %MATCHED% Freeing REx: "[q]" --- @@ -281,7 +269,6 @@ Freeing REx: "[q]" Freeing REx: "^(\S{1,9}):\s*(\d+)$" --- #Compiling REx "(?(DEFINE)(?foo))(?(DEFINE)(?(?&foo)bar))(?(DEFINE"... -#Got 532 bytes for offset annotations. 
study_chunk_recursed_count: 5 #Final program: # 1: DEFINEP (3) @@ -317,8 +304,6 @@ study_chunk_recursed_count: 5 # 61: TAIL (62) # 62: END (0) minlen 0 -#Offsets: [66] -# 1:3[0] 3:10[0] 5:17[1] 7:18[3] 9:21[1] 11:21[0] 13:22[0] 14:25[0] 16:32[0] 18:39[1] 20:41[3] 23:47[3] 25:50[1] 27:50[0] 29:51[0] 30:54[0] 32:61[0] 34:68[1] 36:70[3] 39:76[3] 41:79[1] 43:79[0] 45:80[0] 46:83[0] 48:90[0] 50:97[1] 52:99[3] 55:105[3] 57:108[1] 59:108[0] 61:109[0] 62:110[0] #Matching REx "(?(DEFINE)(?foo))(?(DEFINE)(?(?&foo)bar))(?(DEFINE"... against "" # 0 <> <> | 1:DEFINEP(3) # 0 <> <> | 3:IFTHEN(14) diff --git a/pod/perlreguts.pod b/pod/perlreguts.pod index b0a8d8f92234..e19f157058ba 100644 --- a/pod/perlreguts.pod +++ b/pod/perlreguts.pod @@ -827,29 +827,18 @@ The following structure is used as the C struct by perl's regex engine. Since it is specific to perl it is only of curiosity value to other engine implementations. - typedef struct regexp_internal { - U32 *offsets; /* offset annotations 20001228 MJD - * data about mapping the program to - * the string*/ - regnode *regstclass; /* Optional startclass as identified or - * constructed by the optimiser */ - struct reg_data *data; /* Additional miscellaneous data used - * by the program. Used to make it - * easier to clone and free arbitrary - * data that the regops need. Often the - * ARG field of a regop is an index - * into this structure */ - regnode program[1]; /* Unwarranted chumminess with - * compiler. */ - } regexp_internal; + typedef struct regexp_internal { + regnode *regstclass; + struct reg_data *data; + struct reg_code_blocks *code_blocks; + U32 proglen; + U32 name_list_idx; + regnode program[1]; + } regexp_internal; -=over 5 - -=item C +Description of the attributes is as follows: -Offsets holds a mapping of offset in the C -to offset in the C string. This is only used by ActiveState's -visual regex debugger. +=over 5 =item C @@ -878,6 +867,42 @@ what array. 
During compilation regops that need special structures stored will add an element to each array using the add_data() routine and then store the index in the regop. +In modern perls the 0th element of this structure is reserved and is NEVER +used to store anything of use. This is to allow things that need to index +into this array to represent "no value". + +=item C<code_blocks> + +This optional structure is used to manage C<(?{})> constructs in the +pattern. It is made up of the following structures. + + /* record the position of a (?{...}) within a pattern */ + struct reg_code_block { + STRLEN start; + STRLEN end; + OP *block; + REGEXP *src_regex; + }; + + /* array of reg_code_block's plus header info */ + struct reg_code_blocks { + int refcnt; /* we may be pointed to from a regex + and from the savestack */ + int count; /* how many code blocks */ + struct reg_code_block *cb; /* array of reg_code_block's */ + }; + +=item C<proglen> + +Stores the length of the compiled program in units of regops. + +=item C<name_list_idx> + +This is the index into the data array where an AV is stored that contains +the names of any named capture buffers in the pattern, should there be +any. This is only used in the debugging version of the regex engine and +when RXp_PAREN_NAMES(prog) is true. It will be 0 if there is no such data. + =item C<program> Compiled program. 
Inlined into the structure so the entire struct can be diff --git a/proto.h b/proto.h index 2df7f0a30fc5..111901d56667 100644 --- a/proto.h +++ b/proto.h @@ -4720,6 +4720,9 @@ STATIC void S_regdump_extflags(pTHX_ const char *lead, const U32 flags); #define PERL_ARGS_ASSERT_REGDUMP_EXTFLAGS STATIC void S_regdump_intflags(pTHX_ const char *lead, const U32 flags); #define PERL_ARGS_ASSERT_REGDUMP_INTFLAGS +STATIC regnode_offset S_regnode_guts_debug(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_len); +#define PERL_ARGS_ASSERT_REGNODE_GUTS_DEBUG \ + assert(pRExC_state) STATIC bool S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p, const regnode_offset val, U32 depth) __attribute__warn_unused_result__; #define PERL_ARGS_ASSERT_REGTAIL_STUDY \ @@ -5877,18 +5880,18 @@ STATIC U32 S_get_quantifier_value(pTHX_ RExC_state_t *pRExC_state, const char * STATIC bool S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode_offset* nodep, UV *code_point_p, int* cp_count, I32 *flagp, const bool strict, const U32 depth); #define PERL_ARGS_ASSERT_GROK_BSLASH_N \ assert(pRExC_state); assert(flagp) -STATIC regnode_offset S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, char * parse_start, char ch); +STATIC regnode_offset S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, char * backref_parse_start, char ch); #define PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF \ - assert(pRExC_state); assert(flagp); assert(parse_start) + assert(pRExC_state); assert(flagp); assert(backref_parse_start) STATIC bool S_handle_names_wildcard(pTHX_ const char * wname, const STRLEN wname_len, SV ** prop_definition, AV ** strings); #define PERL_ARGS_ASSERT_HANDLE_NAMES_WILDCARD \ assert(wname); assert(prop_definition); assert(strings) STATIC int S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state, const char* const s, char ** updated_parse_ptr, AV** posix_warnings, const bool check_only); #define PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX \ 
assert(pRExC_state); assert(s) -STATIC regnode_offset S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV ** return_invlist, I32 *flagp, U32 depth, char * const oregcomp_parse); +STATIC regnode_offset S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV ** return_invlist, I32 *flagp, U32 depth); #define PERL_ARGS_ASSERT_HANDLE_REGEX_SETS \ - assert(pRExC_state); assert(flagp); assert(oregcomp_parse) + assert(pRExC_state); assert(flagp) STATIC SV * S_handle_user_defined_property(pTHX_ const char * name, const STRLEN name_len, const bool is_utf8, const bool to_fold, const bool runtime, const bool deferrable, SV* contents, bool *user_defined_ptr, SV * msg, const STRLEN level); #define PERL_ARGS_ASSERT_HANDLE_USER_DEFINED_PROPERTY \ assert(name); assert(contents); assert(user_defined_ptr); assert(msg) @@ -5990,9 +5993,9 @@ STATIC unsigned int S_regex_set_precedence(const U8 my_operator) STATIC void S_reginsert(pTHX_ RExC_state_t *pRExC_state, const U8 op, const regnode_offset operand, const U32 depth); #define PERL_ARGS_ASSERT_REGINSERT \ assert(pRExC_state) -STATIC regnode_offset S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_len, const char* const name); +STATIC regnode_offset S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const STRLEN extra_len); #define PERL_ARGS_ASSERT_REGNODE_GUTS \ - assert(pRExC_state); assert(name) + assert(pRExC_state) STATIC regnode_offset S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth); #define PERL_ARGS_ASSERT_REGPIECE \ assert(pRExC_state); assert(flagp) diff --git a/regcomp.c b/regcomp.c index cec3194efb7b..462fe2b08fdf 100644 --- a/regcomp.c +++ b/regcomp.c @@ -294,10 +294,6 @@ struct RExC_state_t { #define RExC_seen_d_op (pRExC_state->seen_d_op) /* Seen something that differs under /d from /u ? 
*/ -#ifdef RE_TRACK_PATTERN_OFFSETS -# define RExC_offsets (RExC_rxi->u.offsets) /* I am not like the - others */ -#endif #define RExC_emit (pRExC_state->emit) #define RExC_emit_start (pRExC_state->emit_start) #define RExC_sawback (pRExC_state->sawback) @@ -1058,70 +1054,8 @@ static const scan_data_t zero_scan_data = { #define REGNODE_p(offset) (RExC_emit_start + (offset)) #define REGNODE_OFFSET(node) ((node) - RExC_emit_start) -/* Macros for recording node offsets. 20001227 mjd@plover.com - * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in - * element 2*n-1 of the array. Element #2n holds the byte length node #n. - * Element 0 holds the number n. - * Position is 1 indexed. - */ -#ifndef RE_TRACK_PATTERN_OFFSETS -#define Set_Node_Offset_To_R(offset,byte) -#define Set_Node_Offset(node,byte) -#define Set_Cur_Node_Offset -#define Set_Node_Length_To_R(node,len) -#define Set_Node_Length(node,len) -#define Set_Node_Cur_Length(node,start) -#define Node_Offset(n) -#define Node_Length(n) -#define Set_Node_Offset_Length(node,offset,len) -#define ProgLen(ri) ri->u.proglen -#define SetProgLen(ri,x) ri->u.proglen = x -#define Track_Code(code) -#else -#define ProgLen(ri) ri->u.offsets[0] -#define SetProgLen(ri,x) ri->u.offsets[0] = x -#define Set_Node_Offset_To_R(offset,byte) STMT_START { \ - MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \ - __LINE__, (int)(offset), (int)(byte))); \ - if((offset) < 0) { \ - Perl_croak(aTHX_ "value of node is %d in Offset macro", \ - (int)(offset)); \ - } else { \ - RExC_offsets[2*(offset)-1] = (byte); \ - } \ -} STMT_END - -#define Set_Node_Offset(node,byte) \ - Set_Node_Offset_To_R(REGNODE_OFFSET(node), (byte)-RExC_start) -#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse) - -#define Set_Node_Length_To_R(node,len) STMT_START { \ - MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \ - __LINE__, (int)(node), (int)(len))); \ - if((node) < 0) { \ - Perl_croak(aTHX_ "value of node is %d in Length 
macro", \ - (int)(node)); \ - } else { \ - RExC_offsets[2*(node)] = (len); \ - } \ -} STMT_END - -#define Set_Node_Length(node,len) \ - Set_Node_Length_To_R(REGNODE_OFFSET(node), len) -#define Set_Node_Cur_Length(node, start) \ - Set_Node_Length(node, RExC_parse - start) - -/* Get offsets and lengths */ -#define Node_Offset(n) (RExC_offsets[2*(REGNODE_OFFSET(n))-1]) -#define Node_Length(n) (RExC_offsets[2*(REGNODE_OFFSET(n))]) - -#define Set_Node_Offset_Length(node,offset,len) STMT_START { \ - Set_Node_Offset_To_R(REGNODE_OFFSET(node), (offset)); \ - Set_Node_Length_To_R(REGNODE_OFFSET(node), (len)); \ -} STMT_END - -#define Track_Code(code) STMT_START { code } STMT_END -#endif +#define ProgLen(ri) ri->proglen +#define SetProgLen(ri,x) ri->proglen = x #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS #define EXPERIMENTAL_INPLACESCAN @@ -3516,11 +3450,6 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, #ifdef DEBUGGING regnode *optimize = NULL; -#ifdef RE_TRACK_PATTERN_OFFSETS - - U32 mjd_offset = 0; - U32 mjd_nodelen = 0; -#endif /* RE_TRACK_PATTERN_OFFSETS */ #endif /* DEBUGGING */ /* This means we convert either the first branch or the first Exact, @@ -3534,28 +3463,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, if ( first != startbranch || OP( last ) == BRANCH ) { /* branch sub-chain */ NEXT_OFF( first ) = (U16)(last - first); -#ifdef RE_TRACK_PATTERN_OFFSETS - DEBUG_r({ - mjd_offset= Node_Offset((convert)); - mjd_nodelen= Node_Length((convert)); - }); -#endif /* whole branch chain */ } -#ifdef RE_TRACK_PATTERN_OFFSETS - else { - DEBUG_r({ - const regnode *nop = NEXTOPER( convert ); - mjd_offset= Node_Offset((nop)); - mjd_nodelen= Node_Length((nop)); - }); - } - DEBUG_OPTIMISE_r( - Perl_re_indentf( aTHX_ "MJD offset:%" UVuf " MJD length:%" UVuf "\n", - depth+1, - (UV)mjd_offset, (UV)mjd_nodelen) - ); -#endif /* But first we check to see if there is a common prefix we can split out as an EXACT and put in front of the 
TRIE node. */ trie->startstate= 1; @@ -3673,15 +3582,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, DEBUG_r_TEST #endif ) { - regnode *fix = convert; U32 word = trie->wordcount; -#ifdef RE_TRACK_PATTERN_OFFSETS - mjd_nodelen++; -#endif - Set_Node_Offset_Length(convert, mjd_offset, state - 1); - while( ++fix < n ) { - Set_Node_Offset_Length(fix, 0, 0); - } while (word--) { SV ** const tmp = av_fetch( trie_words, word, 0 ); if (tmp) { @@ -3741,22 +3642,14 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, } /* needed for dumping*/ DEBUG_r(if (optimize) { - regnode *opt = convert; - - while ( ++opt < optimize) { - Set_Node_Offset_Length(opt, 0, 0); - } /* Try to clean up some of the debris left after the optimisation. */ while( optimize < jumper ) { - Track_Code( mjd_nodelen += Node_Length((optimize)); ); OP( optimize ) = OPTIMIZED; - Set_Node_Offset_Length(optimize, 0, 0); optimize++; } - Set_Node_Offset_Length(convert, mjd_offset, mjd_nodelen); }); } /* end node insert */ @@ -8012,28 +7905,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_lastparse=NULL; }); -#ifdef RE_TRACK_PATTERN_OFFSETS - DEBUG_OFFSETS_r(Perl_re_printf( aTHX_ - "%s %" UVuf " bytes for offset annotations.\n", - RExC_offsets ? 
"Got" : "Couldn't get", - (UV)((RExC_offsets[0] * 2 + 1)))); - DEBUG_OFFSETS_r(if (RExC_offsets) { - const STRLEN len = RExC_offsets[0]; - STRLEN i; - DECLARE_AND_GET_RE_DEBUG_FLAGS; - Perl_re_printf( aTHX_ - "Offsets: [%" UVuf "]\n\t", (UV)RExC_offsets[0]); - for (i = 1; i <= len; i++) { - if (RExC_offsets[i*2-1] || RExC_offsets[i*2]) - Perl_re_printf( aTHX_ "%" UVuf ":%" UVuf "[%" UVuf "] ", - (UV)i, (UV)RExC_offsets[i*2-1], (UV)RExC_offsets[i*2]); - } - Perl_re_printf( aTHX_ "\n"); - }); - -#else SetProgLen(RExC_rxi,RExC_size); -#endif DEBUG_DUMP_PRE_OPTIMIZE_r({ SV * const sv = sv_newmortal(); @@ -11122,7 +10994,7 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) STATIC regnode_offset S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, - char * parse_start, + char * backref_parse_start, char ch ) { @@ -11141,7 +11013,7 @@ S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state, } if (RExC_parse == name_start || *RExC_parse != ch) { /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */ - vFAIL2("Sequence %.3s... not terminated", parse_start); + vFAIL2("Sequence %.3s... 
not terminated", backref_parse_start); } if (sv_dat) { @@ -11163,9 +11035,6 @@ S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state, num); *flagp |= HASWIDTH; - Set_Node_Offset(REGNODE_p(ret), parse_start+1); - Set_Node_Cur_Length(REGNODE_p(ret), parse_start); - nextchar(pRExC_state); return ret; } @@ -11246,8 +11115,16 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) SV * max_open; /* Max number of unclosed parens */ I32 was_in_lookaround = RExC_in_lookaround; - char * parse_start = RExC_parse; /* MJD */ - char * const oregcomp_parse = RExC_parse; + /* The difference between the following variables can be seen with * + * the broken pattern /(?:foo/ where segment_parse_start will point * + * at the 'f', and reg_parse_start will point at the '(' */ + + /* the following is used for unmatched '(' errors */ + char * const reg_parse_start = RExC_parse; + + /* the following is used to track where various segments of + * the pattern that we parse out started. */ + char * segment_parse_start = RExC_parse; DECLARE_AND_GET_RE_DEBUG_FLAGS; @@ -11632,7 +11509,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) else if (paren == '=') { /* (?P=...) named backref */ RExC_parse++; return handle_named_backref(pRExC_state, flagp, - parse_start, ')'); + segment_parse_start, ')'); } RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end); /* diag_listed_as: Sequence (?%s...) 
not recognized in regex; marked by <-- HERE in m/%s/ */ @@ -11783,7 +11660,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) /*notreached*/ /* named and numeric backreferences */ case '&': /* (?&NAME) */ - parse_start = RExC_parse - 1; + segment_parse_start = RExC_parse - 1; named_recursion: { SV *sv_dat = reg_scan_name(pRExC_state, @@ -11814,7 +11691,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) { bool is_neg = FALSE; UV unum; - parse_start = RExC_parse - 1; /* MJD */ + segment_parse_start = RExC_parse - 1; if (*RExC_parse == '-') { RExC_parse++; is_neg = TRUE; @@ -11913,10 +11790,6 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) (IV)ARG2L(REGNODE_p(ret)))); RExC_seen |= REG_RECURSE_SEEN; - Set_Node_Length(REGNODE_p(ret), - 1 + regarglen[OP(REGNODE_p(ret))]); /* MJD */ - Set_Node_Offset(REGNODE_p(ret), parse_start); /* MJD */ - *flagp |= POSTPONED; assert(*RExC_parse == ')'); nextchar(pRExC_state); @@ -11990,12 +11863,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) if (! REGTAIL(pRExC_state, ret, eval)) { REQUIRE_BRANCHJ(flagp, 0); } - /* deal with the length of this later - MJD */ return ret; } ret = reg2Lanode(pRExC_state, EVAL, n, 0); - Set_Node_Length(REGNODE_p(ret), RExC_parse - parse_start + 1); - Set_Node_Offset(REGNODE_p(ret), parse_start); return ret; } case '(': /* (?(?{...})...) and (?(?=...)...) */ @@ -12218,8 +12088,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) vFAIL("Unknown switch condition (?(...))"); } case '[': /* (?[ ... ]) */ - return handle_regex_sets(pRExC_state, NULL, flagp, depth+1, - oregcomp_parse); + return handle_regex_sets(pRExC_state, NULL, flagp, depth+1); case 0: /* A NUL */ RExC_parse--; /* for vFAIL to print correctly */ vFAIL("Sequence (? 
incomplete"); @@ -12308,8 +12177,6 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) RExC_open_parens[parno]= ret; } - Set_Node_Length(REGNODE_p(ret), 1); /* MJD */ - Set_Node_Offset(REGNODE_p(ret), RExC_parse); /* MJD */ is_open = 1; } else { /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */ @@ -12322,7 +12189,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) parse_rest: /* Pick up the branches, linking them together. */ - parse_start = RExC_parse; /* MJD */ + segment_parse_start = RExC_parse; br = regbranch(pRExC_state, &flags, 1, depth+1); /* branch_len = (paren != 0); */ @@ -12335,10 +12202,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) if (RExC_use_BRANCHJ) { reginsert(pRExC_state, BRANCHJ, br, depth+1); } - else { /* MJD */ + else { reginsert(pRExC_state, BRANCH, br, depth+1); - Set_Node_Length(REGNODE_p(br), paren != 0); - Set_Node_Offset_To_R(br, parse_start-RExC_start); } have_branch = 1; } @@ -12404,8 +12269,6 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) if (RExC_nestroot == parno) RExC_nestroot = 0; } - Set_Node_Offset(REGNODE_p(ender), RExC_parse+1); /* MJD */ - Set_Node_Length(REGNODE_p(ender), 1); /* MJD */ break; case 's': ender = reg_node(pRExC_state, SRCLOSE); @@ -12534,8 +12397,6 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) } reginsert(pRExC_state, node, ret, depth+1); - Set_Node_Cur_Length(REGNODE_p(ret), parse_start); - Set_Node_Offset(REGNODE_p(ret), parse_start + 1); FLAGS(REGNODE_p(ret)) = flag; if (! 
REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL))) { @@ -12553,7 +12414,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); } if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') { - RExC_parse = oregcomp_parse; + RExC_parse = reg_parse_start; vFAIL("Unmatched ("); } nextchar(pRExC_state); @@ -12608,7 +12469,6 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) ret = reganode(pRExC_state, BRANCHJ, 0); else { ret = reg_node(pRExC_state, BRANCH); - Set_Node_Length(REGNODE_p(ret), 1); } } @@ -12843,9 +12703,6 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) const char * const origparse = RExC_parse; I32 min; I32 max = REG_INFTY; -#ifdef RE_TRACK_PATTERN_OFFSETS - char *parse_start; -#endif /* Save the original in case we change the emitted regop to a FAIL. */ const regnode_offset orig_emit = RExC_emit; @@ -12862,10 +12719,6 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) FAIL2("panic: regatom returned failure, flags=%#" UVxf, (UV) flags); } -#ifdef RE_TRACK_PATTERN_OFFSETS - parse_start = RExC_parse; -#endif - op = *RExC_parse; switch (op) { const char * regcurly_return[5]; @@ -13007,8 +12860,6 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) MARK_NAUGHTY_EXP(2, 2); reginsert(pRExC_state, CURLY, ret, depth+1); - Set_Node_Offset(REGNODE_p(ret), parse_start+1); /* MJD */ - Set_Node_Cur_Length(REGNODE_p(ret), parse_start); } else { /* not SIMPLE */ const regnode_offset w = reg_node(pRExC_state, WHILEM); @@ -13023,10 +12874,6 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) NEXT_OFF(REGNODE_p(ret)) = 3; /* Go over LONGJMP. */ } reginsert(pRExC_state, CURLYX, ret, depth+1); - /* MJD hk */ - Set_Node_Offset(REGNODE_p(ret), parse_start+1); - Set_Node_Length(REGNODE_p(ret), - op == '{' ? 
(RExC_parse - parse_start) : 1); if (RExC_use_BRANCHJ) NEXT_OFF(REGNODE_p(ret)) = 3; /* Go over NOTHING to @@ -13230,7 +13077,6 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, *node_p = reg_node(pRExC_state, REG_ANY); *flagp |= HASWIDTH|SIMPLE; MARK_NAUGHTY(1); - Set_Node_Length(REGNODE_p(*(node_p)), 1); /* MJD */ return TRUE; } @@ -13588,6 +13434,14 @@ S_backref_value(char *p, char *e) return I32_MAX; } +#ifdef DEBUGGING +#define REGNODE_GUTS(state,op,extra_size) \ + regnode_guts_debug(state,op,extra_size) +#else +#define REGNODE_GUTS(state,op,extra_size) \ + regnode_guts(state,extra_size) +#endif + /* - regatom - the lowest level @@ -13663,7 +13517,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) { regnode_offset ret = 0; I32 flags = 0; - char *parse_start; + char *atom_parse_start; U8 op; int invert = 0; @@ -13676,7 +13530,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) PERL_ARGS_ASSERT_REGATOM; tryagain: - parse_start = RExC_parse; + atom_parse_start = RExC_parse; assert(RExC_parse < RExC_end); switch ((U8)*RExC_parse) { case '^': @@ -13686,7 +13540,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) ret = reg_node(pRExC_state, MBOL); else ret = reg_node(pRExC_state, SBOL); - Set_Node_Length(REGNODE_p(ret), 1); /* MJD */ break; case '$': nextchar(pRExC_state); @@ -13696,7 +13549,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) ret = reg_node(pRExC_state, MEOL); else ret = reg_node(pRExC_state, SEOL); - Set_Node_Length(REGNODE_p(ret), 1); /* MJD */ break; case '.': nextchar(pRExC_state); @@ -13706,11 +13558,10 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) ret = reg_node(pRExC_state, REG_ANY); *flagp |= HASWIDTH|SIMPLE; MARK_NAUGHTY(1); - Set_Node_Length(REGNODE_p(ret), 1); /* MJD */ break; case '[': { - char * const oregcomp_parse = ++RExC_parse; + char * const cc_parse_start = ++RExC_parse; ret = regclass(pRExC_state, flagp, depth+1, FALSE, /* means 
parse the whole char class */ TRUE, /* allow multi-char folds */ @@ -13724,11 +13575,10 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) (UV) *flagp); } if (*RExC_parse != ']') { - RExC_parse = oregcomp_parse; + RExC_parse = cc_parse_start; vFAIL("Unmatched ["); } nextchar(pRExC_state); - Set_Node_Length(REGNODE_p(ret), RExC_parse - oregcomp_parse + 1); /* MJD */ break; } case '(': @@ -14012,15 +13862,13 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) /* The escapes above that don't take a parameter can't be * followed by a '{'. But 'pX', 'p{foo}' and * correspondingly 'P' can be */ - if ( RExC_parse - parse_start == 1 + if ( RExC_parse - atom_parse_start == 1 && UCHARAT(RExC_parse + 1) == '{' && UNLIKELY(! regcurly(RExC_parse + 1, RExC_end, NULL))) { RExC_parse += 2; vFAIL("Unescaped left brace in regex is illegal here"); } - Set_Node_Offset(REGNODE_p(ret), parse_start); - Set_Node_Length(REGNODE_p(ret), RExC_parse - parse_start + 1); /* MJD */ nextchar(pRExC_state); break; case 'N': @@ -14052,7 +13900,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) RETURN_FAIL_ON_RESTART_FLAGP(flagp); /* Here, evaluates to a single code point. Go get that */ - RExC_parse = parse_start; + RExC_parse = atom_parse_start; goto defchar; case 'k': /* Handle \k and \k'NAME' and \k{NAME} */ @@ -14066,7 +13914,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) { RExC_parse++; /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */ - vFAIL2("Sequence %.2s... not terminated", parse_start); + vFAIL2("Sequence %.2s... not terminated", atom_parse_start); } else { RExC_parse += 2; if (ch == '{') { @@ -14076,7 +13924,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } ret = handle_named_backref(pRExC_state, flagp, - parse_start, + atom_parse_start, (ch == '<') ? 
'>' : (ch == '{') @@ -14187,7 +14035,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) * to be an octal character escape, e.g. \35 or \777. * The above logic should make it obvious why using * octal escapes in patterns is problematic. - Yves */ - RExC_parse = parse_start; + RExC_parse = atom_parse_start; goto defchar; } } @@ -14238,9 +14086,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } *flagp |= HASWIDTH; - /* override incorrect value set in reganode MJD */ - Set_Node_Offset(REGNODE_p(ret), parse_start); - Set_Node_Cur_Length(REGNODE_p(ret), parse_start-1); skip_to_be_ignored_text(pRExC_state, &RExC_parse, FALSE /* Don't force to /x */ ); } @@ -14252,7 +14097,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) default: /* Do not generate "unrecognized" warnings here, we fall back into the quick-grab loop below */ - RExC_parse = parse_start; + RExC_parse = atom_parse_start; goto defchar; } /* end of switch on a \foo sequence */ break; @@ -14353,8 +14198,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) /* Allocate an EXACT node. The node_type may change below to * another EXACTish node, but since the size of the node doesn't * change, it works */ - ret = regnode_guts(pRExC_state, node_type, current_string_nodes, - "exact"); + ret = REGNODE_GUTS(pRExC_state, node_type, current_string_nodes); FILL_NODE(ret, node_type); RExC_emit++; @@ -14492,7 +14336,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) goto loopdone; } p = RExC_parse; - RExC_parse = parse_start; + RExC_parse = atom_parse_start; /* The \N{} means the pattern, if previously /d, * becomes /u. 
That means it can't be an EXACTF node, @@ -14682,7 +14526,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) * string of characters instead of a meta construct */ if (len || (p > RExC_start && isALPHA_A(*(p - 1)))) { if ( RExC_strict - || ( p > parse_start + 1 + || ( p > atom_parse_start + 1 && isALPHA_A(*(p - 1)) && *(p - 2) == '\\')) { @@ -15599,7 +15443,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) *flagp |= HASWIDTH | maybe_SIMPLE; } - Set_Node_Length(REGNODE_p(ret), p - parse_start - 1); RExC_parse = p; { @@ -16551,8 +16394,7 @@ S_regex_set_precedence(const U8 my_operator) { STATIC regnode_offset S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, - I32 *flagp, U32 depth, - char * const oregcomp_parse) + I32 *flagp, U32 depth) { /* Handle the (?[...]) construct to do set operations */ @@ -16581,7 +16423,6 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, DECLARE_AND_GET_RE_DEBUG_FLAGS; PERL_ARGS_ASSERT_HANDLE_REGEX_SETS; - PERL_UNUSED_ARG(oregcomp_parse); /* Only for Set_Node_Length */ DEBUG_PARSE("xcls"); @@ -17236,7 +17077,6 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, } nextchar(pRExC_state); - Set_Node_Length(REGNODE_p(node), RExC_parse - oregcomp_parse + 1); /* MJD */ return node; regclass_failed: @@ -19290,8 +19130,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, /* If optimized to something else and emitted, clean up and return */ if (ret >= 0) { - Set_Node_Offset_Length(REGNODE_p(ret), orig_parse - RExC_start, - RExC_parse - orig_parse);; SvREFCNT_dec(cp_list);; SvREFCNT_dec(only_utf8_locale_list); SvREFCNT_dec(upper_latin1_only_utf8_matches); @@ -19318,7 +19156,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, } } - ret = regnode_guts(pRExC_state, op, regarglen[op], "anyof"); + ret = REGNODE_GUTS(pRExC_state, op, regarglen[op]); FILL_NODE(ret, op); /* We set the argument later */ 
RExC_emit += 1 + regarglen[op]; ANYOF_FLAGS(REGNODE_p(ret)) = anyof_flags; @@ -19855,7 +19693,7 @@ S_optimize_regclass(pTHX_ len = (UTF) ? UVCHR_SKIP(value) : 1; - *ret = regnode_guts(pRExC_state, op, len, "exact"); + *ret = REGNODE_GUTS(pRExC_state, op, len); FILL_NODE(*ret, op); RExC_emit += 1 + STR_SZ(len); setSTR_LEN(REGNODE_p(*ret), len); @@ -20202,9 +20040,8 @@ S_optimize_regclass(pTHX_ } else { op = ANYOFHs; - *ret = regnode_guts(pRExC_state, op, - regarglen[op] + STR_SZ(len), - "anyofhs"); + *ret = REGNODE_GUTS(pRExC_state, op, + regarglen[op] + STR_SZ(len)); FILL_NODE(*ret, op); ((struct regnode_anyofhs *) REGNODE_p(*ret))->str_len = len; @@ -20712,54 +20549,39 @@ S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size) if (size > 0) { Zero(REGNODE_p(RExC_emit), size, regnode); } - -#ifdef RE_TRACK_PATTERN_OFFSETS - Renew(RExC_offsets, 2*RExC_size+1, U32); - if (size > 0) { - Zero(RExC_offsets + 2*(RExC_size - size) + 1, 2 * size, U32); - } - RExC_offsets[0] = RExC_size; -#endif } STATIC regnode_offset -S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name) +S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const STRLEN extra_size) { - /* Allocate a regnode for 'op', with 'extra_size' extra (smallest) regnode - * equivalents space. It aligns and increments RExC_size + /* Allocate a regnode that is (1 + extra_size) times as big as the + * smallest regnode worth of space, and also aligns and increments + * RExC_size appropriately. 
* * It returns the regnode's offset into the regex engine program */ const regnode_offset ret = RExC_emit; - DECLARE_AND_GET_RE_DEBUG_FLAGS; - PERL_ARGS_ASSERT_REGNODE_GUTS; SIZE_ALIGN(RExC_size); change_engine_size(pRExC_state, (Ptrdiff_t) 1 + extra_size); NODE_ALIGN_FILL(REGNODE_p(ret)); -#ifndef RE_TRACK_PATTERN_OFFSETS - PERL_UNUSED_ARG(name); - PERL_UNUSED_ARG(op); -#else + return(ret); +} + +#ifdef DEBUGGING + +STATIC regnode_offset +S_regnode_guts_debug(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size) { + PERL_ARGS_ASSERT_REGNODE_GUTS_DEBUG; assert(extra_size >= regarglen[op] || PL_regkind[op] == ANYOF); + return S_regnode_guts(aTHX_ pRExC_state, extra_size); +} - if (RExC_offsets) { /* MJD */ - MJD_OFFSET_DEBUG( - ("%s:%d: (op %s) %s %" UVuf " (len %" UVuf ") (max %" UVuf ").\n", - name, __LINE__, - PL_reg_name[op], - (UV)(RExC_emit) > RExC_offsets[0] - ? "Overwriting end of array!\n" : "OK", - (UV)(RExC_emit), - (UV)(RExC_parse - RExC_start), - (UV)RExC_offsets[0])); - Set_Node_Offset(REGNODE_p(RExC_emit), RExC_parse + (op == END)); - } #endif - return(ret); -} + + /* - reg_node - emit a node @@ -20767,7 +20589,7 @@ S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_ STATIC regnode_offset /* Location. */ S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) { - const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node"); + const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, regarglen[op]); regnode_offset ptr = ret; PERL_ARGS_ASSERT_REG_NODE; @@ -20785,7 +20607,7 @@ S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) STATIC regnode_offset /* Location. 
*/ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) { - const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode"); + const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, regarglen[op]); regnode_offset ptr = ret; PERL_ARGS_ASSERT_REGANODE; @@ -20804,7 +20626,7 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) STATIC regnode_offset /* Location. */ S_regpnode(pTHX_ RExC_state_t *pRExC_state, U8 op, SV * arg) { - const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "regpnode"); + const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, regarglen[op]); regnode_offset ptr = ret; PERL_ARGS_ASSERT_REGPNODE; @@ -20819,7 +20641,7 @@ S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const { /* emit a node with U32 and I32 arguments */ - const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode"); + const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, regarglen[op]); regnode_offset ptr = ret; PERL_ARGS_ASSERT_REG2LANODE; @@ -20901,41 +20723,9 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, const U8 op, while (src > REGNODE_p(operand)) { StructCopy(--src, --dst, regnode); -#ifdef RE_TRACK_PATTERN_OFFSETS - if (RExC_offsets) { /* MJD 20010112 */ - MJD_OFFSET_DEBUG( - ("%s(%d): (op %s) %s copy %" UVuf " -> %" UVuf " (max %" UVuf ").\n", - "reginsert", - __LINE__, - PL_reg_name[op], - (UV)(REGNODE_OFFSET(dst)) > RExC_offsets[0] - ? "Overwriting end of array!\n" : "OK", - (UV)REGNODE_OFFSET(src), - (UV)REGNODE_OFFSET(dst), - (UV)RExC_offsets[0])); - Set_Node_Offset_To_R(REGNODE_OFFSET(dst), Node_Offset(src)); - Set_Node_Length_To_R(REGNODE_OFFSET(dst), Node_Length(src)); - } -#endif } place = REGNODE_p(operand); /* Op node, where operand used to be. 
*/ -#ifdef RE_TRACK_PATTERN_OFFSETS - if (RExC_offsets) { /* MJD */ - MJD_OFFSET_DEBUG( - ("%s(%d): (op %s) %s %" UVuf " <- %" UVuf " (max %" UVuf ").\n", - "reginsert", - __LINE__, - PL_reg_name[op], - (UV)REGNODE_OFFSET(place) > RExC_offsets[0] - ? "Overwriting end of array!\n" : "OK", - (UV)REGNODE_OFFSET(place), - (UV)(RExC_parse - RExC_start), - (UV)RExC_offsets[0])); - Set_Node_Offset(place, RExC_parse); - Set_Node_Length(place, 1); - } -#endif src = NEXTOPER(place); FLAGS(place) = 0; FILL_NODE(operand, op); @@ -22065,10 +21855,6 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) } }); -#ifdef RE_TRACK_PATTERN_OFFSETS - if (ri->u.offsets) - Safefree(ri->u.offsets); /* 20010421 MJD */ -#endif if (ri->code_blocks) S_free_codeblocks(aTHX_ ri->code_blocks); @@ -22393,14 +22179,7 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) reti->name_list_idx = ri->name_list_idx; -#ifdef RE_TRACK_PATTERN_OFFSETS - if (ri->u.offsets) { - Newx(reti->u.offsets, 2*len+1, U32); - Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32); - } -#else SetProgLen(reti, len); -#endif return (void*)reti; } diff --git a/regcomp.h b/regcomp.h index 552cd6ed65f4..2705463474e7 100644 --- a/regcomp.h +++ b/regcomp.h @@ -26,11 +26,6 @@ /* Not for production use: */ #define PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS 0 -/* Activate offsets code - set to if 1 to enable */ -#ifdef DEBUGGING -#define RE_TRACK_PATTERN_OFFSETS -#endif - /* * Structure for regexp "program". This is essentially a linear encoding * of a nondeterministic finite-state machine (aka syntax charts or @@ -65,24 +60,21 @@ /* This is the stuff that used to live in regexp.h that was truly private to the engine itself. It now lives here. 
*/ - typedef struct regexp_internal { - union { - U32 *offsets; /* offset annotations 20001228 MJD - data about mapping the program to the - string - - offsets[0] is proglen when this is used - */ - U32 proglen; - } u; - +typedef struct regexp_internal { regnode *regstclass; /* Optional startclass as identified or constructed by the optimiser */ struct reg_data *data; /* Additional miscellaneous data used by the program. Used to make it easier to clone and free arbitrary data that the regops need. Often the ARG field of - a regop is an index into this structure */ + a regop is an index into this structure. NOTE the + 0th element of this structure is NEVER used and is + strictly reserved for internal purposes. */ struct reg_code_blocks *code_blocks;/* positions of literal (?{}) */ - int name_list_idx; /* Optional data index of an array of paren names */ + U32 proglen; /* size of the compiled program in regnodes */ + U32 name_list_idx; /* Optional data index of an array of paren names, + only valid when RXp_PAREN_NAMES(prog) is true, + 0 means "no value" like any other index into the + data array.*/ regnode program[1]; /* Unwarranted chumminess with compiler. */ } regexp_internal; @@ -995,7 +987,6 @@ further group, as currently only the low three bytes are used. PEEP TRIE PROGRAM - OFFSETS Execute Options: @@ -1006,7 +997,6 @@ further group, as currently only the low three bytes are used. Extra Options TRIE - OFFSETS If you modify any of these make sure you make corresponding changes to re.pm, especially to the documentation. @@ -1032,8 +1022,6 @@ re.pm, especially to the documentation. /* Extra */ #define RE_DEBUG_EXTRA_MASK 0x3FF0000 #define RE_DEBUG_EXTRA_TRIE 0x0010000 -#define RE_DEBUG_EXTRA_OFFSETS 0x0020000 -#define RE_DEBUG_EXTRA_OFFDEBUG 0x0040000 #define RE_DEBUG_EXTRA_STATE 0x0080000 #define RE_DEBUG_EXTRA_OPTIMISE 0x0100000 #define RE_DEBUG_EXTRA_BUFFERS 0x0400000 @@ -1072,8 +1060,6 @@ re.pm, especially to the documentation. 
/* Extra */ #define DEBUG_EXTRA_r(x) DEBUG_r( \ if (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_EXTRA_MASK)) x ) -#define DEBUG_OFFSETS_r(x) DEBUG_r( \ - if (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_EXTRA_OFFSETS)) x ) #define DEBUG_STATE_r(x) DEBUG_r( \ if (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_EXTRA_STATE)) x ) #define DEBUG_STACK_r(x) DEBUG_r( \ @@ -1084,9 +1070,6 @@ re.pm, especially to the documentation. #define DEBUG_OPTIMISE_MORE_r(x) DEBUG_r( \ if (DEBUG_v_TEST || ((RE_DEBUG_EXTRA_OPTIMISE|RE_DEBUG_COMPILE_OPTIMISE) == \ RE_DEBUG_FLAG(RE_DEBUG_EXTRA_OPTIMISE|RE_DEBUG_COMPILE_OPTIMISE))) x ) -#define MJD_OFFSET_DEBUG(x) DEBUG_r( \ - if (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_EXTRA_OFFDEBUG)) \ - Perl_warn_nocontext x ) #define DEBUG_TRIE_COMPILE_MORE_r(x) DEBUG_TRIE_COMPILE_r( \ if (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_EXTRA_TRIE)) x ) #define DEBUG_TRIE_EXECUTE_MORE_r(x) DEBUG_TRIE_EXECUTE_r( \