--- gforth/prim	2005/01/25 22:16:29	1.162
+++ gforth/prim	2006/01/28 17:55:31	1.186
@@ -1,6 +1,6 @@
 \ Gforth primitives
 
-\ Copyright (C) 1995,1996,1997,1998,2000,2003,2004 Free Software Foundation, Inc.
+\ Copyright (C) 1995,1996,1997,1998,2000,2003,2004,2005 Free Software Foundation, Inc.
 
 \ This file is part of Gforth.
 
@@ -100,6 +100,8 @@
 \E s" struct F83Name *"	single data-stack type-prefix f83name
 \E s" struct Longname *" single data-stack type-prefix longname
 \E 
+\E data-stack   stack-prefix S:
+\E fp-stack     stack-prefix F:
 \E return-stack stack-prefix R:
 \E inst-stream  stack-prefix #
 \E 
@@ -138,7 +140,7 @@
 
 \ Stack caching setup
 
-ifdef(`M4_ENGINE_FAST', `include(cache1.vmg)', `include(cache0.vmg)')
+ifdef(`STACK_CACHE_FILE', `include(STACK_CACHE_FILE)', `include(cache0.vmg)')
 
 \ these m4 macros would collide with identifiers
 undefine(`index')
@@ -249,7 +251,6 @@ execute	( xt -- )		core
 #ifndef NO_IP
 ip=IP;
 #endif
-IF_spTOS(spTOS = sp[0]); /* inst_tail would produce a NEXT_P1 */
 SUPER_END;
 VM_JUMP(EXEC1(xt));
 
@@ -259,7 +260,6 @@ perform	( a_addr -- )	gforth
 #ifndef NO_IP
 ip=IP;
 #endif
-IF_spTOS(spTOS = sp[0]); /* inst_tail would produce a NEXT_P1 */
 SUPER_END;
 VM_JUMP(EXEC1(*(Xt *)a_addr));
 :
@@ -293,7 +293,6 @@ assert(0);
 #else
 a_pfa = PFA(a_cfa);
 nest = (Cell)IP;
-IF_spTOS(spTOS = sp[0]);
 #ifdef DEBUG
     {
       CFA_TO_NAME(a_cfa);
@@ -410,37 +409,31 @@ condbranch(?branch,f --,f83	question_bra
 
 \+xconds
 
-?dup-?branch	( #a_target f -- f )	new	question_dupe_question_branch
+?dup-?branch	( #a_target f -- S:... )	new	question_dupe_question_branch
 ""The run-time procedure compiled by @code{?DUP-IF}.""
 if (f==0) {
-  sp++;
-  IF_spTOS(spTOS = sp[0]);
 #ifdef NO_IP
 INST_TAIL;
 JUMP(a_target);
 #else
 SET_IP((Xt *)a_target);
-  INST_TAIL; NEXT_P2;
 #endif
+} else {
+sp--; /* f is nonzero: reopen the data stack slot that held f */
+sp[0]=f; /* and put f back, so a nonzero f stays on the stack */
 }
-SUPER_CONTINUE;
 
-?dup-0=-?branch ( #a_target f -- ) new	question_dupe_zero_equals_question_branch
+?dup-0=-?branch ( #a_target f -- S:... ) new	question_dupe_zero_equals_question_branch
 ""The run-time procedure compiled by @code{?DUP-0=-IF}.""
-/* the approach taken here of declaring the word as having the stack
-effect ( f -- ) and correcting for it in the branch-taken case costs a
-few cycles in that case, but is easy to convert to a CONDBRANCH
-invocation */
 if (f!=0) {
   sp--;
+  sp[0]=f; /* the nonzero f is stored back and stays on the data stack when the branch is taken */
 #ifdef NO_IP
   JUMP(a_target);
 #else
   SET_IP((Xt *)a_target);
-  NEXT;
 #endif
 }
-SUPER_CONTINUE;
 
 \+
 \fhas? skiploopprims 0= [IF]
@@ -521,10 +514,8 @@ if (nstart == nlimit) {
     JUMP(a_target);
 #else
     SET_IP((Xt *)a_target);
-    INST_TAIL; NEXT_P2;
 #endif
 }
-SUPER_CONTINUE;
 :
   2dup =
   IF   r> swap rot >r >r
@@ -544,10 +535,8 @@ if (nstart >= nlimit) {
     JUMP(a_target);
 #else
     SET_IP((Xt *)a_target);
-    INST_TAIL; NEXT_P2;
 #endif
 }
-SUPER_CONTINUE;
 :
  swap 2dup
  r> swap >r swap >r
@@ -567,10 +556,8 @@ if (ustart >= ulimit) {
 JUMP(a_target);
 #else
 SET_IP((Xt *)a_target);
-INST_TAIL; NEXT_P2;
 #endif
 }
-SUPER_CONTINUE;
 :
  swap 2dup
  r> swap >r swap >r
@@ -590,10 +577,8 @@ if (nstart <= nlimit) {
 JUMP(a_target);
 #else
 SET_IP((Xt *)a_target);
-INST_TAIL; NEXT_P2;
 #endif
 }
-SUPER_CONTINUE;
 :
  swap 2dup
  r> swap >r swap >r
@@ -613,10 +598,8 @@ if (ustart <= ulimit) {
 JUMP(a_target);
 #else
 SET_IP((Xt *)a_target);
-INST_TAIL; NEXT_P2;
 #endif
 }
-SUPER_CONTINUE;
 :
  swap 2dup
  r> swap >r swap >r
@@ -817,20 +800,20 @@ n = n1*n2;
 
 /	( n1 n2 -- n )		core	slash
 n = n1/n2;
-if(FLOORED_DIV && (n1 < 0) != (n2 < 0) && (n1%n2 != 0)) n--;
+if(FLOORED_DIV && ((n1^n2) < 0) && (n1%n2 != 0)) n--;
 :
  /mod nip ;
 
 mod	( n1 n2 -- n )		core
 n = n1%n2;
-if(FLOORED_DIV && (n1 < 0) != (n2 < 0) && n!=0) n += n2;
+if(FLOORED_DIV && ((n1^n2) < 0) && n!=0) n += n2;
 :
  /mod drop ;
 
 /mod	( n1 n2 -- n3 n4 )		core		slash_mod
 n4 = n1/n2;
 n3 = n1%n2; /* !! is this correct? look into C standard! */
-if (FLOORED_DIV && (n1<0) != (n2<0) && n3!=0) {
+if (FLOORED_DIV && ((n1^n2) < 0) && n3!=0) {
   n4--;
   n3+=n2;
 }
@@ -852,7 +835,7 @@ n5=DLO(r);
 /* assumes that the processor uses either floored or symmetric division */
 n5 = d/n3;
 n4 = d%n3;
-if (FLOORED_DIV && (d<0) != (n3<0) && n4!=0) {
+if (FLOORED_DIV && ((DHI(d)^n3)<0) && n4!=0) {
   n5--;
   n4+=n3;
 }
@@ -869,11 +852,11 @@ DCell d = (DCell)n1 * (DCell)n2;
 #endif
 #ifdef BUGGY_LL_DIV
 DCell r = fmdiv(d,n3);
-n4=DHI(r);
+n4=DLO(r);
 #else
 /* assumes that the processor uses either floored or symmetric division */
 n4 = d/n3;
-if (FLOORED_DIV && (d<0) != (n3<0) && (d%n3)!=0) n4--;
+if (FLOORED_DIV && ((DHI(d)^n3)<0) && (d%n3)!=0) n4--;
 #endif
 :
  */mod nip ;
@@ -898,18 +881,34 @@ n2 = n1>>1;
 fm/mod	( d1 n1 -- n2 n3 )		core		f_m_slash_mod
 ""Floored division: @i{d1} = @i{n3}*@i{n1}+@i{n2}, @i{n1}>@i{n2}>=0 or 0>=@i{n2}>@i{n1}.""
 #ifdef BUGGY_LL_DIV
+#ifdef ASM_SM_SLASH_REM
+ASM_SM_SLASH_REM(d1.lo, d1.hi, n1, n2, n3);
+if (((DHI(d1)^n1)<0) && n2!=0) {
+  n3--;
+  n2+=n1;
+}
+#else /* !defined(ASM_SM_SLASH_REM) */
 DCell r = fmdiv(d1,n1);
 n2=DHI(r);
 n3=DLO(r);
+#endif /* !defined(ASM_SM_SLASH_REM) */
 #else
+#ifdef ASM_SM_SLASH_REM4
+ASM_SM_SLASH_REM4(d1, n1, n2, n3);
+if (((DHI(d1)^n1)<0) && n2!=0) {
+  n3--;
+  n2+=n1;
+}
+#else /* !defined(ASM_SM_SLASH_REM4) */
 /* assumes that the processor uses either floored or symmetric division */
 n3 = d1/n1;
 n2 = d1%n1;
 /* note that this 1%-3>0 is optimized by the compiler */
-if (1%-3>0 && (d1<0) != (n1<0) && n2!=0) {
+if (1%-3>0 && ((DHI(d1)^n1)<0) && n2!=0) {
   n3--;
   n2+=n1;
 }
+#endif /* !defined(ASM_SM_SLASH_REM4) */
 #endif
 :
  dup >r dup 0< IF  negate >r dnegate r>  THEN
@@ -920,18 +919,26 @@ if (1%-3>0 && (d1<0) != (n1<0) && n2!=0)
 sm/rem	( d1 n1 -- n2 n3 )		core		s_m_slash_rem
 ""Symmetric division: @i{d1} = @i{n3}*@i{n1}+@i{n2}, sign(@i{n2})=sign(@i{d1}) or 0.""
 #ifdef BUGGY_LL_DIV
+#ifdef ASM_SM_SLASH_REM
+ASM_SM_SLASH_REM(d1.lo, d1.hi, n1, n2, n3);
+#else /* !defined(ASM_SM_SLASH_REM) */
 DCell r = smdiv(d1,n1);
 n2=DHI(r);
 n3=DLO(r);
+#endif /* !defined(ASM_SM_SLASH_REM) */
 #else
+#ifdef ASM_SM_SLASH_REM4
+ASM_SM_SLASH_REM4(d1, n1, n2, n3);
+#else /* !defined(ASM_SM_SLASH_REM4) */
 /* assumes that the processor uses either floored or symmetric division */
 n3 = d1/n1;
 n2 = d1%n1;
 /* note that this 1%-3<0 is optimized by the compiler */
-if (1%-3<0 && (d1<0) != (n1<0) && n2!=0) {
+if (1%-3<0 && ((DHI(d1)^n1)<0) && n2!=0) {
   n3++;
   n2-=n1;
 }
+#endif /* !defined(ASM_SM_SLASH_REM4) */
 #endif
 :
  over >r dup >r abs -rot
@@ -969,12 +976,20 @@ ud = (UDCell)u1 * (UDCell)u2;
 um/mod	( ud u1 -- u2 u3 )		core	u_m_slash_mod
 ""ud=u3*u1+u2, u1>u2>=0""
 #ifdef BUGGY_LL_DIV
+#ifdef ASM_UM_SLASH_MOD
+ASM_UM_SLASH_MOD(ud.lo, ud.hi, u1, u2, u3);
+#else /* !defined(ASM_UM_SLASH_MOD) */
 UDCell r = umdiv(ud,u1);
 u2=DHI(r);
 u3=DLO(r);
+#endif /* !defined(ASM_UM_SLASH_MOD) */
 #else
+#ifdef ASM_UM_SLASH_MOD4
+ASM_UM_SLASH_MOD4(ud, u1, u2, u3);
+#else /* !defined(ASM_UM_SLASH_MOD4) */
 u3 = ud/u1;
 u2 = ud%u1;
+#endif /* !defined(ASM_UM_SLASH_MOD4) */
 #endif
 :
    0 swap [ 8 cells 1 + ] literal 0
@@ -1214,17 +1229,16 @@ useraddr	( #u -- a_addr )	new
 a_addr = (Cell *)(up+u);
 
 up!	( a_addr -- )	gforth	up_store
-UP=up=(char *)a_addr;
+gforth_UP=up=(char *)a_addr;
 :
  up ! ;
 Variable UP
 
-sp@	( -- a_addr )		gforth		sp_fetch
-a_addr = sp+1;
+sp@	( S:... -- a_addr )		gforth		sp_fetch
+a_addr = sp;
 
-sp!	( a_addr -- )		gforth		sp_store
+sp!	( a_addr -- S:... )		gforth		sp_store
 sp = a_addr;
-/* works with and without spTOS caching */
 
 rp@	( -- a_addr )		gforth		rp_fetch
 a_addr = rp;
@@ -1234,10 +1248,10 @@ rp = a_addr;
 
 \+floating
 
-fp@	( -- f_addr )	gforth	fp_fetch
+fp@	( f:... -- f_addr )	gforth	fp_fetch
 f_addr = fp;
 
-fp!	( f_addr -- )	gforth	fp_store
+fp!	( f_addr -- f:... )	gforth	fp_store
 fp = f_addr;
 
 \+
@@ -1310,21 +1324,18 @@ tuck	( w1 w2 -- w2 w1 w2 )	core-ext
 :
  swap over ;
 
-?dup	( w -- w )			core	question_dupe
+?dup	( w -- S:... w )	core	question_dupe
 ""Actually the stack effect is: @code{( w -- 0 | w w )}.  It performs a
 @code{dup} if w is nonzero.""
 if (w!=0) {
-  IF_spTOS(*sp-- = w;)
-#ifndef USE_TOS
   *--sp = w;
-#endif
 }
 :
  dup IF dup THEN ;
 
-pick	( u -- w )			core-ext
+pick	( S:... u -- S:... w )		core-ext
 ""Actually the stack effect is @code{ x0 ... xu u -- x0 ... xu x0 }.""
-w = sp[u+1];
+w = sp[u];
 :
  1+ cells sp@ + @ ;
 
@@ -1634,7 +1645,7 @@ n = key((FILE*)wfileid);
 n = key(stdin);
 #endif
 
-key?-file	( wfileid -- n )		facility	key_q_file
+key?-file	( wfileid -- n )	        gforth	key_q_file
 #ifdef HAS_FILE
 fflush(stdout);
 n = key_query((FILE*)wfileid);
@@ -1771,20 +1782,17 @@ strsignal	( n -- c_addr u )	gforth
 c_addr = (Address)strsignal(n);
 u = strlen(c_addr);
 
-call-c	( w -- )	gforth	call_c
+call-c	( ... w -- ... )	gforth	call_c
 ""Call the C function pointed to by @i{w}. The C function has to
 access the stack itself. The stack pointers are exported in the global
 variables @code{SP} and @code{FP}.""
 /* This is a first attempt at support for calls to C. This may change in
    the future */
-IF_fpTOS(fp[0]=fpTOS);
-FP=fp;
-SP=sp;
+gforth_FP=fp; /* publish both stack pointers in globals before the call */
+gforth_SP=sp; /* NOTE(review): the description above still says SP and FP; the globals written here are gforth_SP and gforth_FP */
 ((void (*)())w)();
-sp=SP;
-fp=FP;
-IF_spTOS(spTOS=sp[0]);
-IF_fpTOS(fpTOS=fp[0]);
+sp=gforth_SP; /* after the call, reload both pointers from the globals */
+fp=gforth_FP;
 
 \+
 \+file
@@ -1994,6 +2002,9 @@ dsystem = DZERO;
 comparisons(f, r1 r2, f_, r1, r2, gforth, gforth, float, gforth)
 comparisons(f0, r, f_zero_, r, 0., float, gforth, float, gforth)
 
+s>f	( n -- r )		float	s_to_f
+r = (Float)n; /* convert the single-cell integer n to a float */
+
 d>f	( d -- r )		float	d_to_f
 #ifdef BUGGY_LL_D2F
 extern double ldexp(double x, int exp);
@@ -2014,6 +2025,9 @@ f>d	( r -- d )		float	f_to_d
 extern DCell double2ll(Float r);
 d = double2ll(r);
 
+f>s	( r -- n )		float	f_to_s
+n = (Cell)r; /* plain C conversion: the fractional part of r is discarded; no range check is made here */
+
 f!	( r f_addr -- )	float	f_store
 ""Store @i{r} into the float at address @i{f-addr}.""
 *f_addr = r;
@@ -2072,6 +2086,18 @@ f**	( r1 r2 -- r3 )	float-ext	f_star_sta
 ""@i{r3} is @i{r1} raised to the @i{r2}th power.""
 r3 = pow(r1,r2);
 
+fm*	( r1 n -- r2 )	gforth	fm_star
+r2 = r1 * (Float)n; /* scale the float r1 by the integer n */
+
+fm/	( r1 n -- r2 )	gforth	fm_slash
+r2 = r1 / (Float)n; /* divide the float r1 by the integer n */
+
+fm*/	( r1 n1 n2 -- r2 )	gforth	fm_star_slash
+r2 = r1 * (Float)n1 / (Float)n2; /* multiply by n1 first, then divide the product by n2 */
+
+f**2	( r1 -- r2 )	gforth	fm_square
+r2 = r1*r1; /* r2 is r1 multiplied by itself */
+
 fnegate	( r1 -- r2 )	float	f_negate
 r2 = - r1;
 
@@ -2130,10 +2156,13 @@ f2=FLAG(isdigit((unsigned)(sig[0]))!=0);
 siglen=strlen(sig);
 if (siglen>u) /* happens in glibc-2.1.3 if 999.. is rounded up */
   siglen=u;
+if (!f2) /* workaround Cygwin trailing 0s for Inf and Nan */
+  for (; siglen>0 && sig[siglen-1]=='0'; siglen--) /* strip trailing 0s, but stop at siglen 0 so sig[-1] is never read */
+    ;
 memcpy(c_addr,sig,siglen);
 memset(c_addr+siglen,f2?'0':' ',u-siglen);
 
->float	( c_addr u -- flag )	float	to_float
+>float	( c_addr u -- f:... flag )	float	to_float
 ""Actual stack effect: ( c_addr u -- r t | f ).  Attempt to convert the
 character string @i{c-addr u} to internal floating-point
 representation. If the string represents a valid floating-point number
@@ -2143,9 +2172,8 @@ case and represents the floating-point n
 Float r;
 flag = to_float(c_addr, u, &r);
 if (flag) {
-  IF_fpTOS(fp[0] = fpTOS);
-  fp += -1;
-  fpTOS = r;
+  fp--;
+  fp[0]=r;
 }
 
 fabs	( r1 -- r2 )	float-ext	f_abs
@@ -2371,9 +2399,9 @@ f>l	( r -- )	gforth	f_to_l
 lp -= sizeof(Float);
 *(Float *)lp = r;
 
-fpick	( u -- r )		gforth
+fpick	( f:... u -- f:... r )		gforth
 ""Actually the stack effect is @code{ r0 ... ru u -- r0 ... ru r0 }.""
-r = fp[u+1]; /* +1, because update of fp happens before this fragment */
+r = fp[u];
 :
  floats fp@ + f@ ;
 
@@ -2411,13 +2439,34 @@ u3 = 0;
 #  endif
 #endif
 
-wcall	( u -- )	gforth
-IF_fpTOS(fp[0]=fpTOS);
-FP=fp;
-sp=(Cell*)(SYSCALL(Cell*(*)(Cell *, void *))u)(sp, &FP);
-fp=FP;
-IF_spTOS(spTOS=sp[0];)
-IF_fpTOS(fpTOS=fp[0]);
+wcall	( ... u -- ... )	gforth
+gforth_FP=fp; /* publish the float stack pointer in a global before the call */
+sp=(Cell*)(SYSCALL(Cell*(*)(Cell *, void *))u)(sp, &gforth_FP); /* u is called as a function taking sp and the address of that global; its result becomes the new sp */
+fp=gforth_FP; /* reload the float stack pointer from the global after the call */
+
+uw@ ( c_addr -- u )	gforth u_w_fetch
+""@i{u} is the zero-extended 16-bit value stored at @i{c_addr}.""
+u = ((UWyde *)c_addr)[0]; /* load through a UWyde pointer, then widen to u */
+
+sw@ ( c_addr -- n )	gforth s_w_fetch
+""@i{n} is the sign-extended 16-bit value stored at @i{c_addr}.""
+n = ((Wyde *)c_addr)[0]; /* load through a Wyde pointer, then widen to n */
+
+w! ( w c_addr -- )	gforth w_store
+""Store the bottom 16 bits of @i{w} at @i{c_addr}.""
+((Wyde *)c_addr)[0] = w; /* the assignment narrows w to the Wyde type */
+
+ul@ ( c_addr -- u )	gforth u_l_fetch
+""@i{u} is the zero-extended 32-bit value stored at @i{c_addr}.""
+u = ((UTetrabyte *)c_addr)[0]; /* load through a UTetrabyte pointer, then widen to u */
+
+sl@ ( c_addr -- n )	gforth s_l_fetch
+""@i{n} is the sign-extended 32-bit value stored at @i{c_addr}.""
+n = ((Tetrabyte *)c_addr)[0]; /* load through a Tetrabyte pointer, then widen to n */
+
+l! ( w c_addr -- )	gforth l_store
+""Store the bottom 32 bits of @i{w} at @i{c_addr}.""
+((Tetrabyte *)c_addr)[0] = w; /* the assignment narrows w to the Tetrabyte type */
 
 \+FFCALL
 
@@ -2481,30 +2530,30 @@ av_longlong(alist, d);
 av-ptr-r	( R:c_addr -- )	gforth  av_ptr_r
 av_ptr(alist, void*, c_addr);
 
-av-call-void	( -- )	gforth  av_call_void
+av-call-void	( ... -- ... )	gforth  av_call_void
 SAVE_REGS
 av_call(alist);
 REST_REGS
 
-av-call-int	( -- w )	gforth  av_call_int
+av-call-int	( ... -- ... w )	gforth  av_call_int
 SAVE_REGS
 av_call(alist);
 REST_REGS
 w = irv;
 
-av-call-float	( -- r )	gforth  av_call_float
+av-call-float	( ... -- ... r )	gforth  av_call_float
 SAVE_REGS
 av_call(alist);
 REST_REGS
 r = frv;
 
-av-call-double	( -- r )	gforth  av_call_double
+av-call-double	( ... -- ... r )	gforth  av_call_double
 SAVE_REGS
 av_call(alist);
 REST_REGS
 r = drv;
 
-av-call-longlong	( -- d )	gforth  av_call_longlong
+av-call-longlong	( ... -- ... d )	gforth  av_call_longlong
 SAVE_REGS
 av_call(alist);
 REST_REGS
@@ -2515,14 +2564,14 @@ DHI_IS(d, 0);
 d = llrv;
 #endif
 
-av-call-ptr	( -- c_addr )	gforth  av_call_ptr
+av-call-ptr	( ... -- ... c_addr )	gforth  av_call_ptr
 SAVE_REGS
 av_call(alist);
 REST_REGS
 c_addr = prv;
 
 alloc-callback	( a_ip -- c_addr )	gforth	alloc_callback
-c_addr = (char *)alloc_callback(engine_callback, (Xt *)a_ip);
+c_addr = (char *)alloc_callback(gforth_callback, (Xt *)a_ip);
 
 va-start-void	( -- )	gforth	va_start_void
 va_start_void(clist);
@@ -2592,6 +2641,99 @@ return 0;
 
 \+
 
+\+LIBFFI
+
+ffi-type ( n -- a_type )	gforth ffi_type
+static void* ffi_types[] = /* maps a small type number n to the address of a libffi type descriptor */
+    { &ffi_type_void, /* entry 0 */
+      &ffi_type_uint8, &ffi_type_sint8, /* entries 1 and 2 */
+      &ffi_type_uint16, &ffi_type_sint16, /* entries 3 and 4 */
+      &ffi_type_uint32, &ffi_type_sint32, /* entries 5 and 6 */
+      &ffi_type_uint64, &ffi_type_sint64, /* entries 7 and 8 */
+      &ffi_type_float, &ffi_type_double, &ffi_type_longdouble, /* entries 9 to 11 */
+      &ffi_type_pointer }; /* entry 12 */
+a_type = ffi_types[n]; /* n is used unchecked; it must lie in 0..12 */
+
+ffi-size ( n1 -- n2 )	gforth ffi_size
+static int ffi_sizes[] = /* byte sizes of the two libffi structures */
+    { sizeof(ffi_cif), sizeof(ffi_closure) }; /* entry 0: ffi_cif, entry 1: ffi_closure */
+n2 = ffi_sizes[n1]; /* n1 is used unchecked; it must be 0 or 1 */
+
+ffi-prep-cif ( a_atypes n a_rtype a_cif -- w )	gforth ffi_prep_cif
+w = ffi_prep_cif((ffi_cif *)a_cif, FFI_DEFAULT_ABI, n, (ffi_type *)a_rtype, (ffi_type **)a_atypes); /* w is the libffi status; the casts name the pointer types libffi expects (assumes the a_ items are cell pointers; confirm) */
+
+ffi-call ( a_avalues a_rvalue a_ip a_cif -- )	gforth ffi_call
+SAVE_REGS
+ffi_call((ffi_cif *)a_cif, (void (*)(void))a_ip, (void *)a_rvalue, (void **)a_avalues); /* call the function at a_ip as described by a_cif; the casts name the pointer types libffi expects (assumes the a_ items are cell pointers; confirm) */
+REST_REGS
+
+ffi-prep-closure ( a_ip a_cif a_closure -- w )	gforth ffi_prep_closure
+w = ffi_prep_closure((ffi_closure *)a_closure, (ffi_cif *)a_cif, gforth_callback, (void *)a_ip); /* w is the libffi status; a_ip is handed over as the user data of the closure; casts name the pointer types libffi expects (assumes cell pointers; confirm) */
+
+ffi-2@ ( a_addr -- d )	gforth ffi_2fetch
+#ifdef BUGGY_LONG_LONG
+DLO_IS(d, *(Cell*)(a_addr)); /* low cell: the cell stored at a_addr, read as a Cell value (mirrors the store in ffi-2!) */
+DHI_IS(d, 0); /* high cell is cleared, not sign-extended */
+#else
+d = *(DCell*)(a_addr); /* read a whole double cell from a_addr */
+#endif
+
+ffi-2! ( d a_addr -- )	gforth ffi_2store
+#ifdef BUGGY_LONG_LONG
+*(Cell*)(a_addr) = DLO(d); /* only the low cell of d is written in this configuration */
+#else
+*(DCell*)(a_addr) = d; /* write the whole double cell to a_addr */
+#endif
+
+ffi-arg-int ( -- w )	gforth ffi_arg_int
+w = *(int *)(*clist++); /* read the int that the current clist entry points to, then step clist to the next entry */
+
+ffi-arg-longlong ( -- d )	gforth ffi_arg_longlong
+#ifdef BUGGY_LONG_LONG
+DLO_IS(d, *(Cell*)(*clist++)); /* low cell: the value the current clist entry points to (not the entry itself), then step clist */
+DHI_IS(d, 0); /* high cell is cleared, not sign-extended */
+#else
+d = *(DCell*)(*clist++); /* read a whole double cell through the current clist entry, then step clist */
+#endif
+
+ffi-arg-ptr ( -- c_addr )	gforth ffi_arg_ptr
+c_addr = *(char **)(*clist++); /* read the pointer that the current clist entry points to, then step clist */
+
+ffi-arg-float ( -- r )	gforth ffi_arg_float
+r = *(float*)(*clist++); /* read a C float through the current clist entry (converted to the type of r on assignment), then step clist */
+
+ffi-arg-double ( -- r )	gforth ffi_arg_double
+r = *(double*)(*clist++); /* read a C double through the current clist entry, then step clist */
+
+ffi-ret-void ( -- )	gforth ffi_ret_void
+return 0; /* leaves the enclosing C function with result 0 (presumably the engine run started for a callback; confirm) */
+
+ffi-ret-int ( w -- )	gforth ffi_ret_int
+*(int*)(ritem) = w; /* store w, narrowed to int, where ritem points */
+return 0; /* then leave the enclosing C function with result 0 */
+
+ffi-ret-longlong ( d -- )	gforth ffi_ret_longlong
+#ifdef BUGGY_LONG_LONG
+*(Cell*)(ritem) = DLO(d); /* only the low cell of d is stored where ritem points; NOTE(review): anything beyond one cell in the result slot is left untouched */
+#else
+*(DCell*)(ritem) = d; /* store the whole double cell where ritem points */
+#endif
+return 0; /* then leave the enclosing C function with result 0 */
+
+ffi-ret-ptr ( c_addr -- )	gforth ffi_ret_ptr
+*(char **)(ritem) = c_addr; /* store the pointer where ritem points */
+return 0; /* then leave the enclosing C function with result 0 */
+
+ffi-ret-float ( r -- )	gforth ffi_ret_float
+*(float*)(ritem) = r; /* store r, converted to a C float, where ritem points */
+return 0; /* then leave the enclosing C function with result 0 */
+
+ffi-ret-double ( r -- )	gforth ffi_ret_double
+*(double*)(ritem) = r; /* store r as a C double where ritem points */
+return 0; /* then leave the enclosing C function with result 0 */
+
+\+
+
 \+OLDCALL
 
 define(`uploop',
@@ -2641,13 +2783,12 @@ compile-prim1 ( a_prim -- ) gforth compi
 ""compile prim (incl. immargs) at @var{a_prim}""
 compile_prim1(a_prim);
 
-finish-code ( -- ) gforth finish_code
+finish-code ( ... -- ... ) gforth finish_code
 ""Perform delayed steps in code generation (branch resolution, I-cache
 flushing).""
-IF_spTOS(sp[0]=spTOS); /* workaround for failing to save spTOS
-			  (gcc-2.95.1, gforth-fast --enable-force-reg) */
+/* The ... above are a workaround for a bug in gcc-2.95, which fails
+   to save spTOS (gforth-fast --enable-force-reg) */
 finish_code();
-IF_spTOS(spTOS=sp[0]);
 
 forget-dyncode ( c_code -- f ) gforth-internal forget_dyncode
 f = forget_dyncode(c_code);
@@ -2682,7 +2823,7 @@ a_addr = groups;
 
 \g static_super
 
-ifdef(`M4_ENGINE_FAST',
+ifdef(`STACK_CACHE_FILE',
 `include(peeprules.vmg)')
 
 \g end