--- gforth/utf-8.fs 2006/12/31 13:39:14 1.25 +++ gforth/utf-8.fs 2007/10/03 16:58:15 1.31 @@ -71,8 +71,11 @@ $80 Value max-single-byte \ utf key and emit +Defer check-xy ' noop IS check-xy + : u8key ( -- u ) defers key dup max-single-byte u< ?EXIT \ special case ASCII + dup $FF = ?EXIT \ special resize character dup $C2 u< IF UTF-8-err throw THEN \ malformed character $7F and $40 >r BEGIN dup r@ and WHILE r@ xor @@ -91,11 +94,10 @@ $80 Value max-single-byte \ utf-8 stuff for xchars -: +u8/string ( c-addr1 u1 -- c-addr2 u2 ) +: +u8/string ( xc-addr1 u1 -- xc-addr2 u2 ) over dup u8>> swap - /string ; - -: -u8/string ( c-addr1 u1 -- c-addr2 u2 ) - over dup u8<< swap - /string ; +: u8\string- ( xcaddr u -- xcaddr u' ) + over + u8<< over - ; : u8@ ( c-addr -- u ) u8@+ nip ; @@ -108,7 +110,7 @@ $80 Value max-single-byte >r u8!+ r> r> swap - true then ; -: u8addrlen ( u8-addr -- u ) +: u8addrlen ( u8-addr u -- u ) drop \ length of UTF-8 char starting at u8-addr (accesses only u8-addr) c@ dup $80 u< if drop 1 exit endif @@ -122,7 +124,7 @@ $80 Value max-single-byte : -u8trailing-garbage ( addr u1 -- addr u2 ) 2dup + dup u8<< ( addr u1 end1 end2 ) - 2dup dup u8addrlen + = if \ last character ok + 2dup dup over over - u8addrlen + = if \ last character ok 2drop else nip nip over - @@ -287,8 +289,14 @@ here wc-table - Constant #wc-table ['] u8key is xkey ['] u8>> is xchar+ ['] u8<< is xchar- +[ [IFDEF] xstring+ ] + ['] u8\string- is xstring- + ['] +u8/string is +xstring +[ [THEN] ] +[ [IFDEF] x/string+ ] + ['] u8\string- is x\string- ['] +u8/string is +x/string - ['] -u8/string is -x/string +[ [THEN] ] ['] u8@ is xc@ ['] u8!+? is xc!+? ['] u8@+ is xc@+ @@ -296,6 +304,9 @@ here wc-table - Constant #wc-table [ [IFDEF] x-width ] ['] u8width is x-width [ [THEN] ] +[ [IFDEF] x-size ] + ['] u8addrlen is x-size +[ [THEN] ] ['] -u8trailing-garbage is -trailing-garbage ; @@ -307,6 +318,16 @@ here wc-table - Constant #wc-table s" UTF-8" search nip nip IF set-encoding-utf-8 ELSE set-encoding-fixed-width THEN ; +environment-wordlist set-current +: xchar-encoding ( -- addr u ) \ xchar-ext + \G Returns a printable ASCII string that reperesents the encoding, + \G and use the preferred MIME name (if any) or the name in + \G @url{http://www.iana.org/assignments/character-sets} like + \G ``ISO-LATIN-1'' or ``UTF-8'', with the exception of ``ASCII'', where + \G we prefer the alias ``ASCII''. + max-single-byte $80 = IF s" UTF-8" ELSE s" ISO-LATIN-1" THEN ; +forth definitions + :noname ( -- ) defers 'cold utf-8-cold