--- gforth/utf-8.fs 2008/02/12 16:44:58 1.35 +++ gforth/utf-8.fs 2011/09/01 20:09:23 1.43 @@ -1,6 +1,6 @@ \ UTF-8 handling 12dec04py -\ Copyright (C) 2004,2005,2006,2007 Free Software Foundation, Inc. +\ Copyright (C) 2004,2005,2006,2007,2008,2009,2010 Free Software Foundation, Inc. \ This file is part of Gforth. @@ -73,23 +73,23 @@ $80 Value max-single-byte Defer check-xy ' noop IS check-xy : u8key ( -- u ) - defers key dup max-single-byte u< ?EXIT \ special case ASCII + key dup max-single-byte u< ?EXIT \ special case ASCII dup $FF = ?EXIT \ special resize character dup $C2 u< IF UTF-8-err throw THEN \ malformed character $7F and $40 >r BEGIN dup r@ and WHILE r@ xor - 6 lshift r> 5 lshift >r >r defers key + 6 lshift r> 5 lshift >r >r key dup $C0 and $80 <> IF UTF-8-err throw THEN $3F and r> or REPEAT rdrop ; : u8emit ( u -- ) - dup max-single-byte u< IF defers emit EXIT THEN \ special case ASCII + dup max-single-byte u< IF emit EXIT THEN \ special case ASCII 0 swap $3F BEGIN 2dup u> WHILE 2/ >r dup $3F and $80 or swap 6 rshift r> REPEAT $7F xor 2* or - BEGIN dup $80 u>= WHILE defers emit REPEAT drop ; + BEGIN dup $80 u>= WHILE emit REPEAT drop ; \ utf-8 stuff for xchars @@ -119,7 +119,8 @@ Defer check-xy ' noop IS check-xy dup $f8 u< if drop 4 exit endif dup $fc u< if drop 5 exit endif dup $fe u< if drop 6 exit endif - UTF-8-err throw ; + dup $ff u< if drop 7 exit endif + drop 8 ; : -u8trailing-garbage ( addr u1 -- addr u2 ) 2dup + dup u8<< ( addr u1 end1 end2 ) @@ -272,15 +273,17 @@ here wc-table - Constant #wc-table \ inefficient table walk: -: wcwidth ( xc -- n ) +: xc-width ( xc -- n ) wc-table #wc-table over + swap ?DO dup I 2@ within IF I 2 cells + @ UNLOOP EXIT THEN 3 cells +LOOP 1 ; +[ELSE] + ' wcwidth Alias xc-width [THEN] : u8width ( xcaddr u -- n ) 0 rot rot over + swap ?DO - I xc@+ swap >r wcwidth + + I xc@+ swap >r xc-width + r> I - +LOOP ; : set-encoding-utf-8 ( -- ) @@ -297,6 +300,9 @@ here wc-table - Constant #wc-table ['] +u8/string is +x/string [ [THEN] ] ['] u8@ is xc@ +[ [IFDEF] xc!+ ] + ['] u8!+ is xc!+ +[ [THEN] ] ['] u8!+? is xc!+? ['] u8@+ is xc@+ ['] u8len is xc-size @@ -325,6 +331,9 @@ environment-wordlist set-current \G ``ISO-LATIN-1'' or ``UTF-8'', with the exception of ``ASCII'', where \G we prefer the alias ``ASCII''. max-single-byte $80 = IF s" UTF-8" ELSE s" ISO-LATIN-1" THEN ; +: max-xchar ( -- xchar ) + max-single-byte $80 = IF $7FFFFFFF ELSE $FF THEN ; +' noop Alias X:xchar forth definitions :noname ( -- )