-: 0:Source:tspi4.c -: 0:Graph:tspi4.gcno -: 0:Data:tspi4.gcda -: 0:Runs:1 -: 1:#include -: 2:#include -: 3:#include -: 4:#include -: 5:#include -: 6:#include -: 7:#include -: 8:#include -: 9: -: 10:typedef struct { -: 11: double x; -: 12: double y; -: 13:} point; -: 14: 19998: 15:double sqr(double x) -: 16:{ 19998: 17: return x*x; -: 18:} -: 19: 9999: 20:double dist(double x[], double y[], int i, int j) { 9999: 21: return sqrt(sqr(x[i]-x[j])+ 9999: 22: sqr(y[i]-y[j])); -: 23:} -: 24: #####: 25:double DistSqrd(point cities[], int i, int j) { #####: 26: return (sqr(cities[i].x-cities[j].x)+ #####: 27: sqr(cities[i].y-cities[j].y)); -: 28:} -: 29: 19998: 30:void swap(double *p, double *q) -: 31:{ 19998: 32: double tmp=*p; 19998: 33: *p = *q; 19998: 34: *q = tmp; 19998: 35:} -: 36: -: 37:static __inline -: 38:double _mm256_castpd256_sd(__m256d x) { -: 39: double y; -: 40: _mm_store_sd(&y, _mm256_castpd256_pd128(x)); -: 41: return y; -: 42:} -: 43: -: 44:// tourx and toury aligned on 32-byte boundary. -: 45:// Alignment is not necessary for correctness, but very important for execution speed -: 46:// Both tourx[] and toury[] should have space for 4 guard values at the end 1: 47:void tsp(point cities[], double tourx[], double toury[], int ncities) -: 48:{ -: 49: int i; 1: 50: tourx[0] = cities[ncities-1].x; 1: 51: toury[0] = cities[ncities-1].y; 10000: 52: for (i=1; i= ncities) -: 88: break; -: 89: min_k = k; -: 90: } -: 91: } 7500: 92: dist_va[0] = minDist; -: 93: } -: 94: 9999: 95: double *px = &tourx[k]; 9999: 96: double *py = &toury[k]; -: 97: __m256d minDist_v = _mm256_broadcast_sd(&dist_va[0]); 90182: 98: while (_mm256_castpd256_sd(minDist_v) != 0) // finish when found two cities in the same place -: 99: { -: 100: int le_msk; -: 101: do { -: 102: __m256d x = _mm256_loadu_pd(px); -: 103: __m256d y = _mm256_loadu_pd(py); -: 104: __m256d dx_v = _mm256_sub_pd(x, thisx_v); -: 105: __m256d dy_v = _mm256_sub_pd(y, thisy_v); 12504999: 106: _mm_prefetch(&px[40], 0); 12504999: 107: _mm_prefetch(&py[40], 0); 12504999: 108: px += 4; 12504999: 109: py += 4; -: 110: dist_v = _mm256_add_pd(_mm256_mul_pd(dx_v, dx_v),_mm256_mul_pd(dy_v, dy_v)); -: 111: __m256d le_v = _mm256_cmp_pd(dist_v, minDist_v, _CMP_LE_OQ); -: 112: le_msk = _mm256_movemask_pd(le_v); 12504999: 113: } while (le_msk == 0); -: 114: -: 115: _mm256_storeu_pd(dist_va, dist_v); 80183: 116: int prev_k = px - tourx - 4; -: 117: do { -: 118: #ifdef MSVC -: 119: long unsigned ki; -: 120: _BitScanForward(&ki, le_msk); -: 121: #else 87608: 122: int ki = __builtin_ctz(le_msk); -: 123: #endif 87608: 124: minDist_v = _mm256_broadcast_sd(&dist_va[ki]); -: 125: 87608: 126: int nxt_k = prev_k + ki; 87608: 127: if (nxt_k < ncities) -: 128: min_k = nxt_k; -: 129: -: 130: __m256d le_v = _mm256_cmp_pd(dist_v, minDist_v, _CMP_LT_OQ); // LT rather than LE, to guarantee a forward progress -: 131: le_msk = _mm256_movemask_pd(le_v); 87608: 132: } while (le_msk != 0); -: 133: } -: 134: 9999: 135: swap(&tourx[i],&tourx[min_k]); 9999: 136: swap(&toury[i],&toury[min_k]); -: 137: } 1: 138:} -: 139: 1: 140:int main(int argc, char *argv[]) -: 141:{ -: 142: int i, ncities; -: 143: point *cities; -: 144: double *tourx; -: 145: double *toury; -: 146: FILE *psfile; -: 147: double sumdist = 0.0; -: 148: 1: 149: if (argc!=2) { #####: 150: fprintf(stderr, "usage: %s ", argv[0]); #####: 151: exit(1); -: 152: } 1: 153: ncities = atoi(argv[1]); 1: 154: cities = alloca(ncities*sizeof(point)); 1: 155: tourx=alloca((ncities+4)*sizeof(double))+4*sizeof(double); 1: 156: toury=alloca(ncities*sizeof(double)); 10001: 157: for (i=0; i