--- gforth/utf-8.fs 2006/08/26 12:39:57 1.24 +++ gforth/utf-8.fs 2007/12/31 18:40:24 1.34 @@ -1,12 +1,12 @@ \ UTF-8 handling 12dec04py -\ Copyright (C) 2004,2005 Free Software Foundation, Inc. +\ Copyright (C) 2004,2005,2006,2007 Free Software Foundation, Inc. \ This file is part of Gforth. \ Gforth is free software; you can redistribute it and/or \ modify it under the terms of the GNU General Public License -\ as published by the Free Software Foundation; either version 2 +\ as published by the Free Software Foundation, either version 3 \ of the License, or (at your option) any later version. \ This program is distributed in the hope that it will be useful, @@ -15,8 +15,7 @@ \ GNU General Public License for more details. \ You should have received a copy of the GNU General Public License -\ along with this program; if not, write to the Free Software -\ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA. +\ along with this program. If not, see http://www.gnu.org/licenses/. \ short: u8 means utf-8 encoded address @@ -71,8 +70,11 @@ $80 Value max-single-byte \ utf key and emit +Defer check-xy ' noop IS check-xy + : u8key ( -- u ) defers key dup max-single-byte u< ?EXIT \ special case ASCII + dup $FF = ?EXIT \ special resize character dup $C2 u< IF UTF-8-err throw THEN \ malformed character $7F and $40 >r BEGIN dup r@ and WHILE r@ xor @@ -91,11 +93,10 @@ $80 Value max-single-byte \ utf-8 stuff for xchars -: +u8/string ( c-addr1 u1 -- c-addr2 u2 ) +: +u8/string ( xc-addr1 u1 -- xc-addr2 u2 ) over dup u8>> swap - /string ; - -: -u8/string ( c-addr1 u1 -- c-addr2 u2 ) - over dup u8<< swap - /string ; +: u8\string- ( xcaddr u -- xcaddr u' ) + over + u8<< over - ; : u8@ ( c-addr -- u ) u8@+ nip ; @@ -108,7 +109,7 @@ $80 Value max-single-byte >r u8!+ r> r> swap - true then ; -: u8addrlen ( u8-addr -- u ) +: u8addrlen ( u8-addr u -- u ) drop \ length of UTF-8 char starting at u8-addr (accesses only u8-addr) c@ dup $80 u< if drop 1 exit endif @@ -122,7 +123,7 @@ $80 Value max-single-byte : -u8trailing-garbage ( addr u1 -- addr u2 ) 2dup + dup u8<< ( addr u1 end1 end2 ) - 2dup dup u8addrlen + = if \ last character ok + 2dup dup over over - u8addrlen + = if \ last character ok 2drop else nip nip over - @@ -287,8 +288,14 @@ here wc-table - Constant #wc-table ['] u8key is xkey ['] u8>> is xchar+ ['] u8<< is xchar- +[ [IFDEF] xstring+ ] + ['] u8\string- is xstring- + ['] +u8/string is +xstring +[ [THEN] ] +[ [IFDEF] +x/string ] + ['] u8\string- is x\string- ['] +u8/string is +x/string - ['] -u8/string is -x/string +[ [THEN] ] ['] u8@ is xc@ ['] u8!+? is xc!+? ['] u8@+ is xc@+ @@ -296,6 +303,9 @@ here wc-table - Constant #wc-table [ [IFDEF] x-width ] ['] u8width is x-width [ [THEN] ] +[ [IFDEF] x-size ] + ['] u8addrlen is x-size +[ [THEN] ] ['] -u8trailing-garbage is -trailing-garbage ; @@ -307,6 +317,16 @@ here wc-table - Constant #wc-table s" UTF-8" search nip nip IF set-encoding-utf-8 ELSE set-encoding-fixed-width THEN ; +environment-wordlist set-current +: xchar-encoding ( -- addr u ) \ xchar-ext + \G Returns a printable ASCII string that reperesents the encoding, + \G and use the preferred MIME name (if any) or the name in + \G @url{http://www.iana.org/assignments/character-sets} like + \G ``ISO-LATIN-1'' or ``UTF-8'', with the exception of ``ASCII'', where + \G we prefer the alias ``ASCII''. + max-single-byte $80 = IF s" UTF-8" ELSE s" ISO-LATIN-1" THEN ; +forth definitions + :noname ( -- ) defers 'cold utf-8-cold