/* This is a version of generate specialized for ordinary life (i.e.,
   the 23/3 rule). It uses an idea from: "Smalltalk-80, the Language
   and its Implementation".  It adds the cells and its neighbours in
   parallel (32 at a time), representing the sum in several words,
   where every word contains one bit of the 32 sums. This addition is
   performed using bitwise operations (&,|,^,~).

   The book proposed using bitblt operations on a direct
   representation. This limits the size and is slow. I implemented
   this on a C64 (1MHz 6502) and got 1/2 generation/s for 256*200
   cells.

   Fortunately, this idea can also be applied to the data structure of
   xlife. However, for the highest speed we need to specialize it for
   every rule. For not implemented rules one can fall back on xlifes
   original algorithm, which is quite fast in itself.

   - anton
   anton@mips.complang.tuwien.ac.at */

#define NDEBUG
/* turns off assertions. also turns off visual pollution caused by
   premature killing */
#include <assert.h>
#include "defs.h"
#include "cellbox.h"

void generate_23_3()
{
  cellbox *cptr,*tptr,*cptrup,*cptrdn,*cptrlf,*cptrrt;
#ifdef PROF
  static int count_pass2=0;
  static int count_half1=0;
  static int count_half2=0;
  static int count_up=0;
  static int count_lf=0;
  static int count_rt=0;
  static int count_dn=0;
#endif PROF

#ifdef PROF    
  link_called = link_search = 0;
  create_called = create_null = 0;
#endif PROF
  generations++;
  numcells=0;
  for (cptr = head; cptr!=NULL; cptr = cptr->next) {
    /* first pass: just copy to olive and create neighbours, if necessary */
    u_long live1 = cptr->live1;
    u_long live2 = cptr->live2;
      
    cptr->olive1 = live1;
    cptr->olive2 = live2;
    cptrup=cptr->up;
    cptrdn=cptr->dn;
    cptrlf=cptr->lf;
    cptrrt=cptr->rt;
    if (cptrup==NULL && (live1&0xff)!=0) {
      cptrup=cptr->up=link1(cptr->x,cptr->y - 8);
      cptrup->dn=cptr;
    }
    if (cptrdn==NULL && (live2&0xff000000)!=0) {
      cptrdn=cptr->dn=link1(cptr->x,cptr->y + 8);
      cptrdn->up=cptr;
    }
    if ((live1&0x01010101)!=0) {
      if (cptrlf==NULL) {
	cptrlf=cptr->lf=link1(cptr->x - 8,cptr->y);
	cptrlf->rt=cptr;
      }
      if ((live1&0x00000001)!=0 && cptrlf->up==NULL) {
	cellbox *cptrul=link1(cptr->x - 8,cptr->y - 8);
	cptrlf->up=cptrul;
	cptrul->dn=cptrlf;
	cptrup->lf=cptrul;
	cptrul->rt=cptrup;
      }
    }  
    if ((live2&0x01010101)!=0) {
      if (cptrlf==NULL) {
	cptrlf=cptr->lf=link1(cptr->x - 8,cptr->y);
	cptrlf->rt=cptr;
      }
      if ((live2&0x01000000)!=0 && cptrlf->dn==NULL){
	cellbox *cptrdl=link1(cptr->x - 8,cptr->y + 8);
	cptrlf->dn=cptrdl;
	cptrdl->up=cptrlf;
	cptrdn->lf=cptrdl;
	cptrdl->rt=cptrdn;
      }
    }
    if ((live1&0x80808080)!=0) {
      if(cptrrt == NULL){
	cptrrt=cptr->rt=link1(cptr->x + 8,cptr->y);
	cptrrt->lf=cptr;
      }
      if ((live1&0x00000080)!=0 && cptrrt->up==NULL){
	cellbox *cptrur=link1(cptr->x + 8,cptr->y - 8);
	cptrrt->up=cptrur;
	cptrur->dn=cptrrt;
	cptrup->rt=cptrur;
	cptrur->lf=cptrup;
      }
    }
    if ((live2&0x80808080)!=0) {
      if(cptrrt == NULL){
	cptrrt=cptr->rt=link1(cptr->x + 8,cptr->y);
	cptrrt->lf=cptr;
      }
      if((live2&0x80000000)!=0 && cptrrt->dn==NULL) {
	cellbox *cptrdr=link1(cptr->x + 8,cptr->y + 8);
	cptrrt->dn=cptrdr;
	cptrdr->up=cptrrt;
	cptrdn->rt=cptrdr;
	cptrdr->lf=cptrdn;
      }
    }
  }
  for (cptr = head; cptr!=NULL; cptr=cptr->next) {
    /* the two half-blocks and their neighbouring half-blocks: */
    u_long live1, live2, ul2, u2, ur2, l1, r1, l2, r2, dl1, d1, dr1;
    /* the cell-neighbours of the current half-block. I.e., the first
       bit of uln is the upper left neighbour cell of the first bit of
       the present half-block */
    u_long upn, dnn, lfn, rtn, uln, urn, dln, drn;
    /* the bits of the sum of the neighbours; bits 2 and 3 are ored,
       since this is all we need for normal life */
    u_long bit0, bit1, bit23;
    /* second pass: now the real fun starts */
    /* these assertions just check that the first pass did its job we
       rely on that in this pass. */
    assert(cptr->up!=NULL || 
	   (link1(cptr->x,cptr->y - 8)->olive2 & 0xff000000)==0);
    assert(cptr->dn!=NULL || 
	   (link1(cptr->x,cptr->y + 8)->olive1 & 0x000000ff)==0);
    assert(cptr->lf!=NULL || 
	   ((link1(cptr->x - 8,cptr->y)->olive1 & 0x80808080)==0 &&
	    (link1(cptr->x - 8,cptr->y)->olive2 & 0x80808080)==0));
    assert(cptr->rt!=NULL || 
	   ((link1(cptr->x + 8,cptr->y)->olive1 & 0x01010101)==0 &&
	    (link1(cptr->x + 8,cptr->y)->olive2 & 0x01010101)==0));
    assert((cptr->up!=NULL && cptr->up->lf!=NULL) || 
	   (link1(cptr->x-8,cptr->y-8)->olive2 & 0x80000000)==0);
    assert((cptr->up!=NULL && cptr->up->rt!=NULL) || 
	   (link1(cptr->x+8,cptr->y-8)->olive2 & 0x01000000)==0);
    assert((cptr->dn!=NULL && cptr->dn->lf!=NULL) || 
	   (link1(cptr->x-8,cptr->y+8)->olive1 & 0x00000080)==0);
    assert((cptr->dn!=NULL && cptr->dn->rt!=NULL) || 
	   (link1(cptr->x+8,cptr->y+8)->olive1 & 0x00000001)==0);
    /* some statistical information from runs using breeder, primes,
       oscspn with maxdead=2: count_up/dn/lf/rt etc. is pretty stable
       at 60-64% of pass2 each. And half1 and half2 are at 74-79% */
    /* compute live1 and live2 of the neighbours; mask everything away
       that is not needed for computing the current block; maybe even
       do a shift that may be needed later */
    /* the following code is a bit convoluted to help the register
       allocator */
#ifdef PROF
    count_pass2++;
#endif PROF
    live1=cptr->olive1;
    live2=cptr->olive2;
    if (cptr->lf!=NULL) {
#ifdef PROF
      count_lf++;
#endif PROF
      l1=cptr->lf->olive1 & 0x80808080;
      l2=cptr->lf->olive2 & 0x80808080;
    } else {
      l1=0;
      l2=0;
    }
    if (cptr->rt!=NULL) {
#ifdef PROF
      count_rt++;
#endif PROF
      r1=cptr->rt->olive1 & 0x01010101;
      r2=cptr->rt->olive2 & 0x01010101;
    } else {
      r1=0;
      r2=0;
    }

    /* compute neighbours of live1 */
    /* if everything is 0, no need to compute. The corners need not be
       checked, because they not make a difference if they are the
       only live thing. This check is only profitable if there is a
       significant number of useless halfblocks. */
    if (live1!=0 || (live2&0xff)!=0 || l1!=0 || r1!=0 || 
	(cptr->up!=NULL && (cptr->up->olive2&0xff000000)!=0)) {
#ifdef PROF
      count_half1++;
#endif PROF
      if (cptr->up!=NULL) {
#ifdef PROF
	count_up++;
#endif PROF
	u2=cptr->up->olive2;
	ul2= cptr->up->lf!=NULL ? cptr->up->lf->olive2>>31 : 0;
	ur2= cptr->up->rt!=NULL ? (cptr->up->rt->olive2&0x01000000)>>17 : 0;
      } else {
	u2=0;
	/* the following is not quite obvious. But if the upper left
	   block had its lower right cell set, there would be an up
	   block (also expressed in the assertion for
	   cptr->up->lf). Conversely, since there is no up block, there
	   cannot be a cell set there. */
	ul2=0;
	ur2=0;
      }
      /* compute the sum in lockstep with the neighbouring
         cells. bit23 represents an or of bit 2 and 3 of the sum. For
         normal life we don't need to know more than that one of these
         bits is set */
      lfn=(l1>>7) | ((live1&0x7f7f7f7f)<<1);
      uln= ul2 | ((u2&0x7f7f7f7f)>>23) | (lfn<<8);
      bit1=uln&lfn; bit0=uln^lfn;
      dln=(lfn>>8) | (l2<<17) | (live2<<25);
      bit1^=bit0&dln;  bit0^=dln;
      rtn=(r1<<7) | ((live1&0xfefefefe)>>1);
      bit23=bit1&(bit0&rtn); bit1^=bit0&rtn; bit0^=rtn;
      urn= ur2 | (u2>>25) | (rtn<<8);
      bit23|=bit1&(bit0&urn); bit1^=bit0&urn; bit0^=urn;
      drn=(rtn>>8) | ((live2&0xfefefefe)<<23) | (r2<<31);
      bit23|=bit1&(bit0&drn); bit1^=bit0&drn; bit0^=drn;
      upn=(live1<<8)|(u2>>24);
      bit23|=bit1&(bit0&upn); bit1^=bit0&upn; bit0^=upn;
      dnn=(live1>>8)|(live2<<24);
      bit23|=bit1&(bit0&dnn); bit1^=bit0&dnn; bit0^=dnn;
      /* now compute the new live cells. This is a little tricky, but
         you see that it works when you build the truth table. */
      cptr->live1 = (~bit23)&bit1&(live1|bit0);
      /* we can look at the whole thing as a boolean function with 9
         arguments. Currently it is evaluated using 13 xors, 1 not, 5
         ors and (after common subexpression elimination) 13 ands. Can
         it be done cheaper? */
    }
    /* this is analogous to the first half-box */
    if (live2!=0 || (live1>>24)!=0 || l2!=0 || r2!=0 || 
	(cptr->dn!=NULL && (cptr->dn->olive1<<24)!=0)) {
#ifdef PROF
      count_half2++;
#endif PROF
      if (cptr->dn!=NULL) {
#ifdef PROF
	count_dn++;
#endif PROF
	d1=cptr->dn->olive1;
	dl1=cptr->dn->lf!=NULL ? (cptr->dn->lf->olive1&0x80808080)<<17 : 0;
	dr1=cptr->dn->rt!=NULL ? cptr->dn->rt->olive1<<31 : 0;
      } else {
	d1=0;
	dl1=0;
	dr1=0;
      }
      lfn=(l2>>7) | ((live2&0x7f7f7f7f)<<1);
      dln=(lfn>>8) | dl1 | (d1<<25);
      bit1=dln&lfn; bit0=dln^lfn;
      uln=(l1>>31) | ((live1&0x7f7f7f7f)>>23) | (lfn<<8);
      bit1^=bit0&uln;  bit0^=uln;
      rtn=(r2<<7) | ((live2&0xfefefefe)>>1);
      bit23=bit1&(bit0&rtn); bit1^=bit0&rtn; bit0^=rtn;
      drn=(rtn>>8) | ((d1&0xfefefefe)<<23) | dr1;
      bit23|=bit1&(bit0&drn); bit1^=bit0&drn; bit0^=drn;
      urn=(r1>>17) | (live1>>25) | (rtn<<8);
      bit23|=bit1&(bit0&urn); bit1^=bit0&urn; bit0^=urn;
      dnn=(live2>>8)|(d1<<24);
      bit23|=bit1&(bit0&dnn); bit1^=bit0&dnn; bit0^=dnn;
      upn=(live2<<8)|(live1>>24);
      bit23|=bit1&(bit0&upn); bit1^=bit0&upn; bit0^=upn;
      cptr->live2 = (~bit23)&bit1&(live2|bit0);
    }

    if(dispcells){
      u_long bits;
      bits=cptr->live1;
      bits = (bits & 0x55555555) + ((bits>>1)&0x55555555);
      bits = (bits & 0x33333333) + ((bits>>2)&0x33333333);
      bits = (bits & 0x0f0f0f0f) + ((bits>>4)&0x0f0f0f0f);
      bits = (bits & 0x00ff00ff) + ((bits>>8)&0x00ff00ff);
      bits = (bits & 0x0000ffff) + (bits>>16);
      numcells+=bits;
      bits=cptr->live2;
      bits = (bits & 0x55555555) + ((bits>>1)&0x55555555);
      bits = (bits & 0x33333333) + ((bits>>2)&0x33333333);
      bits = (bits & 0x0f0f0f0f) + ((bits>>4)&0x0f0f0f0f);
      bits = (bits & 0x00ff00ff) + ((bits>>8)&0x00ff00ff);
      bits = (bits & 0x0000ffff) + (bits>>16);
      numcells+=bits;
    }	
  }
  /* we cannot do this in the second pass because the kills would
     destroy the assertions. however, we need not do it every
     generation (at least the killing) */
  for (cptr = head; cptr!=NULL;) {
    if(cptr->live1 || cptr->live2){
      cptr->dead=0;
      cptr=cptr->next;
    } else {
#ifdef NDEBUG
      cptr->dead++;
      if(cptr->dead > maxdead){
	tptr=cptr->next;
	kill(cptr);
	cptr=tptr;
      }
      else{
	cptr=cptr->next;
      }
#else
      /* kill everything dead, otherwise the number of blocks explodes
         due to the assertions */
      tptr=cptr->next;
      kill(cptr);
      cptr=tptr;
#endif
    }
  }
#ifdef PROF
  printf("num=%d ",numboxes);
  if(link_called){
    printf("l_c=%d ",link_called);
    if(link_search){
      printf(" l_ave=%f ",link_search/(float)link_called);
    }
  }
  if(create_called){
    printf("c_c=%d ",create_called);
    if(create_null){
      printf(" c_ave=%f ",create_null/(float)create_called);
    }
  }
  printf("pass2: %d, half1: %d, half2: %d, up:%d dn:%d lf:%d rt:%d",count_pass2, count_half1, count_half2, count_up, count_dn, count_lf, count_rt);
  printf("\n");
#endif PROF
}      

