/* Simple implementation where every malloc()ed object has exactly one reference */ #include #include "vectors.h" #ifdef DEBUG #define DBG(x) x #else #define DBG(x) #endif //Optimizations #define OPT__VECT_STACK #define OPT__NODE_STACK #define OPT__ADD_FAC_SCAL #define OPT__VECT_REUSE_ADD_FAC_SCAL #define OPT__VECT_REUSE_VSUM #define OPT__VECT_REUSE_VPROD #define OPT__NO_N_CHECK_LAZY #define OPT__NO_N_CHECK_REAL #define OPT__NO_N_CHECK_ADD_FAC_SCAL #define OPT__MOD4IS0_VSUM #define OPT__MOD4IS0_VPROD #define OPT__MOD4IS0_SVPROD #define OPT__MOD4IS0_ADD_FAC_SCAL #define OPT__ASSUME_NO_VECTOR_OVERWRITE static VPriv vcopy_real(Vector v); static VPriv vsum_real(VPriv v1, VPriv v2); static VPriv vprod_real(VPriv v1, VPriv v2); static VPriv svprod_real(double d, VPriv v1); static VPriv vcreate(size_t n); static VPriv veval(VPriv v); static inline struct vect_private* vect_malloc(size_t n); static inline void vect_free(struct vect_private* p); static inline struct vect_private* node_malloc(void); static inline void node_free(struct vect_private* p); enum operand_t { OP_FAIL=0, OP_VECT, OP_VSUM, OP_VPROD, OP_SVPROD }; struct vect_private { size_t n; //vector elemets in d[] size_t memsize; //malloced memsize size_t references; //reference counter size_t size_t_pad[1]; //padding to 32-byte alignment enum operand_t op; //information type identifier enum operand_t operand_t_pad[7]; //padding to 32-byte alignment struct vect_private* a; //vector operand a struct vect_private* b; //vector operand b struct vect_private* vect_private_ptr_pad[2]; //padding to 32-byte alignment double c; //scalar operand c double double_pad[3]; //padding to 32-byte alignment double d[4]; //vector data } __attribute__((aligned(32))); DBG( /*Performance Counters*/ //vector allocation size_t pc_vect_malloc=0; size_t pc_vect_malloc_stack=0; size_t pc_vect_malloc_nostack=0; size_t pc_vect_free=0; size_t pc_vect_free_zeroref=0; size_t pc_vect_free_stack=0; size_t pc_vect_free_nostack=0; //node allocation size_t pc_node_malloc=0; size_t pc_node_malloc_stack=0; size_t pc_node_malloc_nostack=0; size_t pc_node_free=0; size_t pc_node_free_zeroref=0; size_t pc_node_free_stack=0; size_t pc_node_free_nostack=0; ) VPriv vcopy(Vector v) { VPriv v1 = (VPriv)v; v1->references++; return (VPriv)v; } /* static VPriv vcopy_real(Vector v) { VPriv v1 = (VPriv)v; size_t vbytes = sizeof(struct vect_private)+v1->n*sizeof(double); VPriv r = vect_malloc(vbytes); memcpy(r,v1,vbytes); r->references = 1; return r; } */ VPriv vconsume1(Vector *vp) { VPriv r = (VPriv)*vp; *vp = NULL; return r; } void vdelete1(Vector *vp) { VPriv v1 = (VPriv)*vp; if (v1 != NULL) { vect_free(v1); vp = NULL; } } void vassign1(Vector *vp, VPriv v) { VPriv v1 = (VPriv)*vp; #ifndef OPT__ASSUME_NO_VECTOR_OVERWRITE if (v1 != NULL) vect_free(v1); #endif v = veval(v); *vp = (Vector)v; } /* Vector sum and product */ VPriv vsum(VPriv v1, VPriv v2) { #ifndef OPT_NO_N_CHECK_LAZY if (v1->n != v2->n) { fprintf(stderr,"vsum (lazy): vector size mismatch\n"); exit(1); } #endif VPriv v = node_malloc(); v->op = OP_VSUM; v->a = v1; v->b = v2; v->n = v1->n; return v; } static inline void vsum_real_h(double* const __restrict d1, double* const __restrict d2, const size_t n) { #ifdef OPT__MOD4IS0_VSUM for (size_t i=0; i<(n/4+1)*4; i++) { #else for (size_t i=0; in != v2->n) { fprintf(stderr,"vsum: vector size mismatch\n"); exit(1); } #endif /* We know that this operation consumes v1 and v2, so we can reuse the memory of v1 for the result */ #ifdef OPT__VECT_REUSE_VSUM if (v1->references==1) { vsum_real_h(v1->d,v2->d,v1->n); //for (i=0; in; i++) // v1->d[i] += v2->d[i]; vect_free(v2); return v1; } else if (v2->references==1) { vsum_real_h(v2->d,v1->d,v1->n); //for (i=0; in; i++) // v2->d[i] += v2->d[i]; vect_free(v1); return v2; } else #endif { VPriv v3 = vcreate(v1->n); for (i=0; in; i++) v3->d[i] = v1->d[i] + v2->d[i]; vect_free(v1); vect_free(v2); return v3; } } VPriv vprod(VPriv v1, VPriv v2) { #ifndef OPT__NO_N_CHECK_LAZY if (v1->n != v2->n) { fprintf(stderr,"vprod (lazy): vector size mismatch\n"); exit(1); } #endif VPriv v = node_malloc(); v->op = OP_VPROD; v->a = v1; v->b = v2; v->n = v1->n; return v; } static inline void vprod_real_h(double* const __restrict d1, double* const __restrict d2, const size_t n) { #ifdef OPT__MOD4IS0_VPROD for (size_t i=0; i<(n/4+1)*4; i++) { #else for (size_t i=0; in != v2->n) { fprintf(stderr,"vprod: vector size mismatch\n"); exit(1); } #endif /* We know that this operation consumes v1 and v2, so we can reuse the memory of v1 for the result */ #ifdef OPT__VECT_REUSE_VPROD if (v1->references==1) { vprod_real_h(v1->d,v2->d,v1->n); //for (i=0; in; i++) // v1->d[i] *= v2->d[i]; vect_free(v2); return v1; } else if (v2->references==1) { vprod_real_h(v2->d,v1->d,v1->n); //for (i=0; in; i++) // v2->d[i] *= v1->d[i]; vect_free(v1); return v2; } else #endif { VPriv v3 = vcreate(v1->n); for (i=0; in; i++) v3->d[i] = v1->d[i] * v2->d[i]; vect_free(v1); vect_free(v2); return v3; } } VPriv svprod(double d, VPriv v1) { VPriv v = node_malloc(); v->op = OP_SVPROD; v->a = v1; v->c = d; v->n = v1->n; return v; } static inline void svprod_real_h(const double d, double* const __restrict d1, const size_t n) { #ifdef OPT__MOD4IS0_SVPROD for (size_t i=0; i<(n/4+1)*4; i++) { #else for (size_t i=0; ireferences==1) { svprod_real_h(d,v1->d,v1->n); //for (i=0; in; i++) // v1->d[i] *= d; return v1; } else { VPriv v3 = vcreate(v1->n); for (i=0; in; i++) v3->d[i] = v1->d[i] * d; vect_free(v1); return v3; } } static VPriv vcreate(size_t n) { size_t vbytes = sizeof(struct vect_private)+n*sizeof(double); VPriv v = vect_malloc(vbytes); v->op = OP_VECT; v->n = n; v->memsize = vbytes; v->references = 1; return v; } /* Conversion between abstract vectors and concrete memory arrays */ /* Create a vector with n elements starting at dp */ VPriv vnew(double *dp, size_t n) { size_t vbytes = sizeof(struct vect_private)+n*sizeof(double); VPriv v = vect_malloc(vbytes); v->n = n; memcpy(v->d,dp,n*sizeof(double)); return v; } /* Store the elements from v at sp; The number of elements in v must be n */ void vstore(double *dp, size_t n, VPriv v) { if (v->n != n) { fprintf(stderr,"vstore: vector size mismatch\n"); exit(1); } memcpy(dp,v->d,n*sizeof(double)); vect_free(v); } /* =================================================================================== */ /* ==== ==== ==== ==== START of vect allocation functions ==== ==== ==== ==== */ /* =================================================================================== */ #ifndef OPT__VECT_STACK static inline struct vect_private* vect_malloc(size_t n) { DBG(pc_vect_malloc++;) struct vect_private* p; p = memalign(32,n); //p = malloc(n); p->memsize=n; p->references=1; p->op = OP_VECT; return p; } static inline void vect_free(struct vect_private* p) { DBG(pc_vect_free++;) p->references--; if (p->references==0) { DBG(pc_vect_free_zeroref++;) free(p); } } #else #define VECT_STACK_MAX (4096) struct vect_private* vect_stack[VECT_STACK_MAX]; struct vect_private** vect_stack_ptr = vect_stack; static inline struct vect_private* vect_malloc(size_t n) { DBG(pc_vect_malloc++;) struct vect_private* p; if (vect_stack_ptr>vect_stack && (*(vect_stack_ptr-1))->memsize>=n) { DBG(pc_vect_malloc_stack++); vect_stack_ptr--; p=*vect_stack_ptr; } else { DBG(pc_vect_malloc_nostack++;) p = memalign(32,n); //p = malloc(n); p->memsize=n; } p->references=1; p->op = OP_VECT; return p; } static inline void vect_free(struct vect_private* p) { DBG(pc_vect_free++;) p->references--; if (p->references==0) { DBG(pc_vect_free_zeroref++;) if (vect_stack_ptr-vect_stackmemsize=sizeof(struct vect_private); p->references=1; return p; } static inline void node_free(struct vect_private* p) { DBG(pc_node_free++;) p->references--; if (p->references==0) { DBG(pc_node_free_nostack++;) free(p); } } #else #define NODE_STACK_MAX (4096) struct vect_private* node_stack[NODE_STACK_MAX]; struct vect_private** node_stack_ptr = node_stack; static inline struct vect_private* node_malloc(void) { DBG(pc_node_malloc++;) struct vect_private* p; if (node_stack_ptr>node_stack) { DBG(pc_node_malloc_stack++;) node_stack_ptr--; p=*node_stack_ptr; } else { DBG(pc_node_malloc_nostack++;) p = malloc(sizeof(struct vect_private)); p->memsize=sizeof(struct vect_private); } p->references=1; return p; } static inline void node_free(struct vect_private* p) { DBG(pc_node_free++;) p->references--; if (p->references==0) { DBG(pc_node_free_zeroref++;) if (node_stack_ptr-node_stackop==OP_VSUM && v->a->op==OP_VECT && v->b->op==OP_SVPROD && v->b->a->op==OP_VECT) { VPriv r; #ifndef OPT__NO_N_CHECK_ADD_FAC_SCAL if (v->a->n != v->b->a->n) { fprintf(stderr, "add_fac_scal: vector size mistmatch\n"); exit(EXIT_FAILURE); } #endif #ifdef OPT__VECT_REUSE_ADD_FAC_SCAL if (v->a->references==1) { r = v->a; sum_fac_scal(v->a->d, v->b->a->d, v->b->c, v->a->n); vect_free(v->b->a); node_free(v->b); } else if (v->b->a->references==1) { r = v->b->a; sum_fac_scal(v->b->a->d, v->a->d, v->b->c, v->a->n); node_free(v->b); vect_free(v->a); } else #endif { r = vcreate(v->a->n); for (size_t i=0; ia->n; i++) { r->d[i]= v->a->d[i] + v->b->a->d[i] * v->b->c; } vect_free(v->b->a); vect_free(v->a); node_free(v->b); } node_free(v); r->op = OP_VECT; return r; } #endif VPriv vr = NULL; switch (v->op) { case OP_FAIL: fprintf(stderr, "Failed to evaluate Fail.\n"); exit(EXIT_FAILURE); case OP_VECT: return v; case OP_VSUM: vr = vsum_real(veval(v->a),veval(v->b)); node_free(v); vr->op = OP_VECT; return vr; case OP_VPROD: vr = vprod_real(veval(v->a),veval(v->b)); node_free(v); vr->op = OP_VECT; return vr; case OP_SVPROD: vr = svprod_real(v->c,veval(v->a)); node_free(v); vr->op = OP_VECT; return vr; default: fprintf(stderr,"Unrecognized Operation.\n"); exit(EXIT_FAILURE); } } DBG( void print_perf(void) { printf("List of Enabled Optimizations:\n"); #ifdef OPT__VECT_STACK printf("\tUse vector memory allocation stack\n"); #endif #ifdef OPT__NODE_STACK printf("\tUse lazy evaluation node allocation stack\n"); #endif #ifdef OPT__ADD_FAC_SCAL printf("\tAllow pattern evaluation for Va+Vb*c\n"); #endif #ifdef OPT__VECT_REUSE_ADD_FAC_SCAL printf("\tPossible re-use of operand vector memory in pattern evaluation for Va+Vb*c\n"); #endif #ifdef OPT__VECT_REUSE_VSUM printf("\tPossible re-use of operand vector memory in vector sum operation\n"); #endif #ifdef OPT__VECT_REUSE_VPROD printf("\tPossible re-use of operand vector memory in vector product operation\n"); #endif #ifdef OPT__NO_N_CHECK_LAZY printf("\tNo vector size comparison on lazy evaluated methods.\n"); #endif #ifdef OPT__NO_N_CHECK_REAL printf("\tNo vector size comparison on real evaluated methods.\n"); #endif #ifdef OPT__NO_N_CHECK_ADD_FAC_SCAL printf("\tNo vector size comparision in pattern evaluation for Va+Vb*c\n"); #endif #ifdef OPT__MOD4IS0_VSUM printf("\tPerform computations as multiple of 4 in vsum evaluation.\n"); #endif #ifdef OPT__MOD4IS0_VPROD printf("\tPerform computations as multiple of 4 in vprod evaluation.\n"); #endif #ifdef OPT__MOD4IS0_SVPROD printf("\tPerform computations as multiple of 4 in svprod evaluation.\n"); #endif #ifdef OPT__MOD4IS0_ADD_FAC_SCAL printf("\tPerform computations as multiple of 4 in pattern evaluation for Va+Vb*c.\n"); #endif #ifdef OPT__ASSUME_NO_VECTOR_OVERWRITE printf("\tNo check and free of overwritten vectors.\n"); #endif printf("\nPerformance Counters:\n"); printf("==== vect_malloc(): ====\n"); printf("\tvect_malloc(): called: %zu\n",pc_vect_malloc); printf("\t\tvect_malloc(): served from stack: %zu\n",pc_vect_malloc_stack); printf("\t\tvect_malloc(): true malloc: %zu\n", pc_vect_malloc_nostack); printf("==== vect_free(): ====\n"); printf("\tvect_free(): called: %zu\n",pc_vect_free); printf("\t\tvect_free(): zero references: %zu\n",pc_vect_free_zeroref); printf("\t\t\tvect_free(): deliverd to stack: %zu\n",pc_vect_free_stack); printf("\t\t\tvect_free(): true free: %zu\n",pc_vect_free_nostack); printf("==== node_malloc(): ====\n"); printf("\tnode_malloc(): called: %zu\n",pc_node_malloc); printf("\t\tnode_malloc(): served from stack: %zu\n",pc_node_malloc_stack); printf("\t\tnode_malloc(): true malloc: %zu\n", pc_node_malloc_nostack); printf("==== node_free(): ====\n"); printf("\tnode_free(): called: %zu\n",pc_node_free); printf("\t\tnode_free(): zero references: %zu\n",pc_node_free_zeroref); printf("\t\t\tnode_free(): deliverd to stack: %zu\n",pc_node_free_stack); printf("\t\t\tnode_free(): true free: %zu\n",pc_node_free_nostack); } )