--- gforth/utf-8.fs	2006/08/26 12:39:57	1.24
+++ gforth/utf-8.fs	2007/12/31 18:40:24	1.34
@@ -1,12 +1,12 @@
 \ UTF-8 handling                                       12dec04py
 
-\ Copyright (C) 2004,2005 Free Software Foundation, Inc.
+\ Copyright (C) 2004,2005,2006,2007 Free Software Foundation, Inc.
 
 \ This file is part of Gforth.
 
 \ Gforth is free software; you can redistribute it and/or
 \ modify it under the terms of the GNU General Public License
-\ as published by the Free Software Foundation; either version 2
+\ as published by the Free Software Foundation, either version 3
 \ of the License, or (at your option) any later version.
 
 \ This program is distributed in the hope that it will be useful,
@@ -15,8 +15,7 @@
 \ GNU General Public License for more details.
 
 \ You should have received a copy of the GNU General Public License
-\ along with this program; if not, write to the Free Software
-\ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA.
+\ along with this program. If not, see http://www.gnu.org/licenses/.
 
 \ short: u8 means utf-8 encoded address
 
@@ -71,8 +70,11 @@ $80 Value max-single-byte
 
 \ utf key and emit
 
+Defer check-xy  ' noop IS check-xy
+
 : u8key ( -- u )
     defers key dup max-single-byte u< ?EXIT  \ special case ASCII
+    dup $FF = ?EXIT  \ special resize character
     dup $C2 u< IF  UTF-8-err throw  THEN  \ malformed character
     $7F and  $40 >r
     BEGIN  dup r@ and  WHILE  r@ xor
@@ -91,11 +93,10 @@ $80 Value max-single-byte
 
 \ utf-8 stuff for xchars
 
-: +u8/string ( c-addr1 u1 -- c-addr2 u2 )
+: +u8/string ( xc-addr1 u1 -- xc-addr2 u2 )
     over dup u8>> swap - /string ;
-
-: -u8/string ( c-addr1 u1 -- c-addr2 u2 )
-    over dup u8<< swap - /string ;
+: u8\string- ( xcaddr u -- xcaddr u' )
+    over + u8<< over - ;
 
 : u8@ ( c-addr -- u )
     u8@+ nip ;
@@ -108,7 +109,7 @@ $80 Value max-single-byte
 	>r u8!+ r> r> swap - true
     then ;
 
-: u8addrlen ( u8-addr -- u )
+: u8addrlen ( u8-addr u -- u )  drop
     \ length of UTF-8 char starting at u8-addr (accesses only u8-addr)
     c@
     dup $80 u< if drop 1 exit endif
@@ -122,7 +123,7 @@ $80 Value max-single-byte
 
 : -u8trailing-garbage ( addr u1 -- addr u2 )
     2dup + dup u8<< ( addr u1 end1 end2 )
-    2dup dup u8addrlen + = if \ last character ok
+    2dup dup over over - u8addrlen + = if \ last character ok
 	2drop
     else
 	nip nip over -
@@ -287,8 +288,14 @@ here wc-table - Constant #wc-table
     ['] u8key is xkey
     ['] u8>> is xchar+
     ['] u8<< is xchar-
+[ [IFDEF] xstring+ ]
+    ['] u8\string- is xstring-
+    ['] +u8/string is +xstring
+[ [THEN] ]
+[ [IFDEF] +x/string ]
+    ['] u8\string- is x\string-
     ['] +u8/string is +x/string
-    ['] -u8/string is -x/string
+[ [THEN] ]
     ['] u8@ is xc@
     ['] u8!+? is xc!+?
     ['] u8@+ is xc@+
@@ -296,6 +303,9 @@ here wc-table - Constant #wc-table
 [ [IFDEF] x-width ]
     ['] u8width is x-width
 [ [THEN] ]
+[ [IFDEF] x-size ]
+    ['] u8addrlen is x-size
+[ [THEN] ]
     ['] -u8trailing-garbage is -trailing-garbage
 ;
 
@@ -307,6 +317,16 @@ here wc-table - Constant #wc-table
     s" UTF-8" search nip nip
     IF  set-encoding-utf-8  ELSE  set-encoding-fixed-width  THEN ;
 
+environment-wordlist set-current
+: xchar-encoding ( -- addr u ) \ xchar-ext
+    \G Returns a printable ASCII string that reperesents the encoding,
+    \G and use the preferred MIME name (if any) or the name in
+    \G @url{http://www.iana.org/assignments/character-sets} like
+    \G ``ISO-LATIN-1'' or ``UTF-8'', with the exception of ``ASCII'', where
+    \G we prefer the alias ``ASCII''.
+    max-single-byte $80 = IF s" UTF-8" ELSE s" ISO-LATIN-1" THEN ;
+forth definitions
+
 :noname ( -- )
     defers 'cold
     utf-8-cold