Diff for /gforth/utf-8.fs between versions 1.24 and 1.34

version 1.24, 2006/08/26 12:39:57 version 1.34, 2007/12/31 18:40:24
Line 1 Line 1
 \ UTF-8 handling                                       12dec04py  \ UTF-8 handling                                       12dec04py
   
 \ Copyright (C) 2004,2005 Free Software Foundation, Inc.  \ Copyright (C) 2004,2005,2006,2007 Free Software Foundation, Inc.
   
 \ This file is part of Gforth.  \ This file is part of Gforth.
   
 \ Gforth is free software; you can redistribute it and/or  \ Gforth is free software; you can redistribute it and/or
 \ modify it under the terms of the GNU General Public License  \ modify it under the terms of the GNU General Public License
 \ as published by the Free Software Foundation; either version 2  \ as published by the Free Software Foundation, either version 3
 \ of the License, or (at your option) any later version.  \ of the License, or (at your option) any later version.
   
 \ This program is distributed in the hope that it will be useful,  \ This program is distributed in the hope that it will be useful,
Line 15 Line 15
 \ GNU General Public License for more details.  \ GNU General Public License for more details.
   
 \ You should have received a copy of the GNU General Public License  \ You should have received a copy of the GNU General Public License
 \ along with this program; if not, write to the Free Software  \ along with this program. If not, see http://www.gnu.org/licenses/.
 \ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA.  
   
 \ short: u8 means utf-8 encoded address  \ short: u8 means utf-8 encoded address
   
Line 71  $80 Value max-single-byte Line 70  $80 Value max-single-byte
   
 \ utf key and emit  \ utf key and emit
   
   Defer check-xy  ' noop IS check-xy
   
 : u8key ( -- u )  : u8key ( -- u )
     defers key dup max-single-byte u< ?EXIT  \ special case ASCII      defers key dup max-single-byte u< ?EXIT  \ special case ASCII
       dup $FF = ?EXIT  \ special resize character
     dup $C2 u< IF  UTF-8-err throw  THEN  \ malformed character      dup $C2 u< IF  UTF-8-err throw  THEN  \ malformed character
     $7F and  $40 >r      $7F and  $40 >r
     BEGIN  dup r@ and  WHILE  r@ xor      BEGIN  dup r@ and  WHILE  r@ xor
Line 91  $80 Value max-single-byte Line 93  $80 Value max-single-byte
   
 \ utf-8 stuff for xchars  \ utf-8 stuff for xchars
   
 : +u8/string ( c-addr1 u1 -- c-addr2 u2 )  : +u8/string ( xc-addr1 u1 -- xc-addr2 u2 )
     over dup u8>> swap - /string ;      over dup u8>> swap - /string ;
   : u8\string- ( xcaddr u -- xcaddr u' )
 : -u8/string ( c-addr1 u1 -- c-addr2 u2 )      over + u8<< over - ;
     over dup u8<< swap - /string ;  
   
 : u8@ ( c-addr -- u )  : u8@ ( c-addr -- u )
     u8@+ nip ;      u8@+ nip ;
Line 108  $80 Value max-single-byte Line 109  $80 Value max-single-byte
         >r u8!+ r> r> swap - true          >r u8!+ r> r> swap - true
     then ;      then ;
   
 : u8addrlen ( u8-addr -- u )  : u8addrlen ( u8-addr u -- u )  drop
     \ length of UTF-8 char starting at u8-addr (accesses only u8-addr)      \ length of UTF-8 char starting at u8-addr (accesses only u8-addr)
     c@      c@
     dup $80 u< if drop 1 exit endif      dup $80 u< if drop 1 exit endif
Line 122  $80 Value max-single-byte Line 123  $80 Value max-single-byte
   
 : -u8trailing-garbage ( addr u1 -- addr u2 )  : -u8trailing-garbage ( addr u1 -- addr u2 )
     2dup + dup u8<< ( addr u1 end1 end2 )      2dup + dup u8<< ( addr u1 end1 end2 )
     2dup dup u8addrlen + = if \ last character ok      2dup dup over over - u8addrlen + = if \ last character ok
         2drop          2drop
     else      else
         nip nip over -          nip nip over -
Line 287  here wc-table - Constant #wc-table Line 288  here wc-table - Constant #wc-table
     ['] u8key is xkey      ['] u8key is xkey
     ['] u8>> is xchar+      ['] u8>> is xchar+
     ['] u8<< is xchar-      ['] u8<< is xchar-
   [ [IFDEF] xstring+ ]
       ['] u8\string- is xstring-
       ['] +u8/string is +xstring
   [ [THEN] ]
   [ [IFDEF] +x/string ]
       ['] u8\string- is x\string-
     ['] +u8/string is +x/string      ['] +u8/string is +x/string
     ['] -u8/string is -x/string  [ [THEN] ]
     ['] u8@ is xc@      ['] u8@ is xc@
     ['] u8!+? is xc!+?      ['] u8!+? is xc!+?
     ['] u8@+ is xc@+      ['] u8@+ is xc@+
Line 296  here wc-table - Constant #wc-table Line 303  here wc-table - Constant #wc-table
 [ [IFDEF] x-width ]  [ [IFDEF] x-width ]
     ['] u8width is x-width      ['] u8width is x-width
 [ [THEN] ]  [ [THEN] ]
   [ [IFDEF] x-size ]
       ['] u8addrlen is x-size
   [ [THEN] ]
     ['] -u8trailing-garbage is -trailing-garbage      ['] -u8trailing-garbage is -trailing-garbage
 ;  ;
   
Line 307  here wc-table - Constant #wc-table Line 317  here wc-table - Constant #wc-table
     s" UTF-8" search nip nip      s" UTF-8" search nip nip
     IF  set-encoding-utf-8  ELSE  set-encoding-fixed-width  THEN ;      IF  set-encoding-utf-8  ELSE  set-encoding-fixed-width  THEN ;
   
   environment-wordlist set-current
   : xchar-encoding ( -- addr u ) \ xchar-ext
       \G Returns a printable ASCII string that reperesents the encoding,
       \G and use the preferred MIME name (if any) or the name in
       \G @url{http://www.iana.org/assignments/character-sets} like
       \G ``ISO-LATIN-1'' or ``UTF-8'', with the exception of ``ASCII'', where
       \G we prefer the alias ``ASCII''.
       max-single-byte $80 = IF s" UTF-8" ELSE s" ISO-LATIN-1" THEN ;
   forth definitions
   
 :noname ( -- )  :noname ( -- )
     defers 'cold      defers 'cold
     utf-8-cold      utf-8-cold

Removed from v.1.24  
changed lines
  Added in v.1.34


FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>