--- gforth/regexp.fs 2006/12/31 13:39:13 1.7 +++ gforth/regexp.fs 2007/10/03 18:41:09 1.8 @@ -29,9 +29,12 @@ \ special control structure -: FORK ( compilation -- orig ; run-time f -- ) \ core +: FORK ( compilation -- orig ; run-time f -- ) \ gforth + \G AHEAD-like control structure: calls the code after JOIN. POSTPONE call >mark ; immediate restrict -: JOIN ( orig -- ) postpone THEN ; immediate restrict +: JOIN ( orig -- ) \ gforth + \G THEN-like control structure for FORK + postpone THEN ; immediate restrict \ Charclasses @@ -40,24 +43,40 @@ : @+ ( addr -- n addr' ) dup @ swap cell+ ; 0 Value cur-class -: charclass ( -- ) Create here dup to cur-class $100 dup allot erase ; -: +char ( char -- ) cur-class swap +bit ; -: -char ( char -- ) cur-class swap -bit ; -: ..char ( start end -- ) 1+ swap ?DO I +char LOOP ; +: charclass ( -- ) \ regexp-cg + \G Create a charclass + Create here dup to cur-class $100 dup allot erase ; +: +char ( char -- ) \ regexp-cg + \G add a char to the current charclass + cur-class swap +bit ; +: -char ( char -- ) \ regexp-cg + \G remove a char from the current charclass + cur-class swap -bit ; +: ..char ( start end -- ) \ regexp-cg + \G add a range of chars to the current charclass + 1+ swap ?DO I +char LOOP ; : or! ( n addr -- ) dup @ rot or swap ! ; : and! ( n addr -- ) dup @ rot and swap ! ; -: +class ( class -- ) $100 0 ?DO @+ swap - cur-class I + or! cell +LOOP drop ; -: -class ( class -- ) $100 0 ?DO @+ swap invert - cur-class I + and! cell +LOOP drop ; +: +class ( class -- ) \ regexp-cg + \G union of charclass @var{class} and the current charclass + $100 0 ?DO @+ swap + cur-class I + or! cell +LOOP drop ; +: -class ( class -- ) \ regexp-cg + \G subtract the charclass @var{class} from the current charclass + $100 0 ?DO @+ swap invert + cur-class I + and! cell +LOOP drop ; : char? ( addr class -- addr' flag ) >r count r> + c@ ; \ Charclass tests -: c? ( addr class -- ) ]] char? 0= ?LEAVE [[ ; immediate -: -c? ( addr class -- ) ]] char? ?LEAVE [[ ; immediate +: c? ( addr class -- ) \ regexp-pattern + \G check @var{addr} for membership in charclass @var{class} + ]] char? 0= ?LEAVE [[ ; immediate +: -c? ( addr class -- ) \ regexp-pattern + \G check @var{addr} for not membership in charclass @var{class} + ]] char? ?LEAVE [[ ; immediate charclass digit '0 '9 ..char charclass blanks 0 bl ..char @@ -65,19 +84,32 @@ charclass blanks 0 bl ..char charclass letter 'a 'z ..char 'A 'Z ..char charclass any 0 $FF ..char #lf -char -: \d ( addr -- addr' ) ]] digit c? [[ ; immediate -: \s ( addr -- addr' ) ]] blanks c? [[ ; immediate -: .? ( addr -- addr' ) ]] any c? [[ ; immediate -: -\d ( addr -- addr' ) ]] digit -c? [[ ; immediate -: -\s ( addr -- addr' ) ]] blanks -c? [[ ; immediate -: ` ( -- ) +: \d ( addr -- addr' ) \ regexp-pattern + \G check for digit + ]] digit c? [[ ; immediate +: \s ( addr -- addr' ) \ regexp-pattern + \G check for blanks + ]] blanks c? [[ ; immediate +: .? ( addr -- addr' ) \ regexp-pattern + \G check for any single charachter + ]] any c? [[ ; immediate +: -\d ( addr -- addr' ) \ regexp-pattern + \G check for not digit + ]] digit -c? [[ ; immediate +: -\s ( addr -- addr' ) \ regexp-pattern + \G check for not blank + ]] blanks -c? [[ ; immediate +: ` ( "char" -- ) \ regexp-pattern + \G check for particular char ]] count [[ char ]] Literal <> ?LEAVE [[ ; immediate \ A word for string comparison : $= ( addr1 addr2 u -- f ) tuck compare ; : ,=" ( addr u -- ) tuck ]] dup SLiteral $= ?LEAVE Literal + noop [[ ; -: =" ( " -- ) '" parse ,=" ; immediate +: =" ( " -- ) \ regexp-pattern + \G check for string + '" parse ,=" ; immediate \ loop stack @@ -112,9 +144,11 @@ Variable varsmax \ start and end -: \^ ( addr -- addr ) +: \^ ( addr -- addr ) \ regexp-pattern + \G check for string start ]] ^? ?LEAVE [[ ; immediate -: \$ ( addr -- addr ) +: \$ ( addr -- addr ) \ regexp-pattern + \G check for string end ]] $? ?LEAVE [[ ; immediate \ regexp block @@ -122,9 +156,12 @@ Variable varsmax \ FORK/JOIN are like AHEAD THEN, but producing a call on AHEAD \ instead of a jump. -: (( ( addr u -- ) vars off varsmax off loops off +: (( ( addr u -- ) \ regexp-pattern + \G start regexp block + vars off varsmax off loops off ]] FORK AHEAD BUT JOIN !end [[ BEGIN, ; immediate -: )) ( -- addr f ) +: )) ( -- addr f ) \ regexp-pattern + \G end regexp block ]] ?end drop true EXIT [[ DONE, ]] drop false EXIT THEN [[ ; immediate @@ -135,35 +172,50 @@ Variable varsmax : drops ( n -- ) 1+ cells sp@ + sp! ; -: {** ( addr -- addr addr ) +: {** ( addr -- addr addr ) \ regexp-pattern + \G greedy zero-or-more pattern 0 ]] Literal >r BEGIN dup [[ BEGIN, ; immediate -' {** Alias {++ ( addr -- addr addr ) immediate -: n*} ( sys n -- ) >r ]] r> 1+ >r $? 0= UNTIL dup [[ DONE, ]] drop [[ +' {** Alias {++ ( addr -- addr addr ) \ regexp-pattern + \G greedy one-or-more pattern + immediate +: n*} ( sys n -- ) \ regexp-pattern + \G At least @var{n} pattern + >r ]] r> 1+ >r $? 0= UNTIL dup [[ DONE, ]] drop [[ r@ IF r@ ]] r@ Literal u< IF r> 1+ drops false EXIT THEN [[ THEN r@ ]] r> 1+ Literal U+DO FORK BUT [[ ]] IF I' I - [[ r@ 1- ]] Literal + drops true UNLOOP EXIT THEN LOOP [[ r@ IF r@ ]] Literal drops [[ THEN rdrop ]] false EXIT JOIN [[ ; immediate -: **} 0 postpone n*} ; immediate -: ++} 1 postpone n*} ; immediate +: **} ( sys -- ) \ regexp-pattern + \G end of greedy zero-or-more pattern + 0 postpone n*} ; immediate +: ++} ( sys -- ) \ regexp-pattern + \G end of greedy zero-or-more pattern + 1 postpone n*} ; immediate \ non-greedy loops \ Idea: Try to match rest of the regexp, and if that fails, try match \ first expr and then try again rest of regexp. -: {+ ( addr -- addr addr ) +: {+ ( addr -- addr addr ) \ regexp-pattern + \G non-greedy one-or-more pattern ]] BEGIN [[ BEGIN, ; immediate -: {* ( addr -- addr addr ) +: {* ( addr -- addr addr ) \ regexp-pattern + \G non-greedy zero-or-more pattern ]] {+ dup FORK BUT IF drop true EXIT THEN [[ ; immediate -: *} ( addr addr' -- addr' ) +: *} ( addr addr' -- addr' ) \ regexp-pattern + \G end of non-greedy zero-or-more pattern ]] dup end$ u> UNTIL [[ DONE, ]] drop false EXIT JOIN [[ ; immediate -: +} ( addr addr' -- addr' ) +: +} ( addr addr' -- addr' ) \ regexp-pattern + \G end of non-greedy one-or-more pattern ]] dup FORK BUT IF drop true EXIT [[ DONE, ]] drop false EXIT THEN *} [[ ; immediate -: // ( -- ) ]] {* 1+ *} [[ ; immediate +: // ( -- ) \ regexp-pattern + \G search for string + ]] {* 1+ *} [[ ; immediate \ alternatives @@ -172,21 +224,33 @@ Variable varsmax : THENs ( sys -- ) BEGIN dup WHILE ]] THEN [[ REPEAT drop ; -: {{ ( addr -- addr addr ) 0 ]] dup BEGIN [[ vars @ ; immediate -: || ( addr addr -- addr addr ) vars @ varsmax @ max varsmax ! +: {{ ( addr -- addr addr ) \ regexp-pattern + \G Start of alternatives + 0 ]] dup BEGIN [[ vars @ ; immediate +: || ( addr addr -- addr addr ) \ regexp-pattern + \G separator between alternatives + vars @ varsmax @ max varsmax ! ]] nip AHEAD [[ >r >r >r vars ! ]] DONE drop dup [[ r> r> r> ]] BEGIN [[ vars @ ; immediate -: }} ( addr addr -- addr addr ) vars @ varsmax @ max vars ! +: }} ( addr addr -- addr addr ) \ regexp-pattern + \G end of alternatives + vars @ varsmax @ max vars ! ]] nip AHEAD [[ >r >r >r drop ]] DONE drop LEAVE [[ r> r> r> THENs ; immediate \ match variables -: \( ( addr -- addr ) ]] dup [[ +: \( ( addr -- addr ) \ regexp-pattern + \G start of matching variable; variables are referred as \\1--9 + ]] dup [[ >var ]] ALiteral ! [[ ; immediate -: \) ( addr -- addr ) ]] dup [[ +: \) ( addr -- addr ) \ regexp-pattern + \G end of matching variable + ]] dup [[ var> ]] ALiteral ! [[ ; immediate -: \0 ( -- addr u ) start$ end$ over - ; +: \0 ( -- addr u ) \ regexp-pattern + \G the whole string + start$ end$ over - ; : \: ( i -- ) Create 2* 1+ cells vars + , DOES> ( -- addr u ) @ 2@ tuck - ; @@ -200,17 +264,28 @@ require string.fs 0 Value >>ptr 0 Value <>string -: >> ( addr -- addr ) dup to >>ptr ; -: << ( run-addr addr u -- run-addr ) +: >> ( addr -- addr ) \ regexp-replace + \G Start replace pattern region + dup to >>ptr ; +: << ( run-addr addr u -- run-addr ) \ regexp-replace + \G Replace string from start of replace pattern region with + \G @var{addr} @var{u} <>string @ 0= IF s" " >>string $! THEN <>ptr over - >>string $+! >>string $+! dup to <" -- ) \ regexp-replace + \G Replace string from start of replace pattern region with + \G @var{string} + '" parse postpone SLiteral postpone << ; immediate : >>string@ ( -- addr u ) >>string $@ >>string off 0 to >>ptr 0 to <>next ( -- addr u ) <>next REPEAT end$ [[ +: s// ( -- sys ) \ regexp-replace + \G start search/replace loop + ]] BEGIN [[ ; immediate +: //g ( sys -- ) \ regexp-replace + \G end search/replace loop + ]] WHILE >>next REPEAT end$ [[ s" " ]] SLiteral << >>string@ rot drop [[ ; immediate