#include #include #include #include #include #include #include #include typedef struct { double x; double y; } point; double sqr(double x) { return x*x; } double dist(double x[], double y[], int i, int j) { return sqrt(sqr(x[i]-x[j])+ sqr(y[i]-y[j])); } double DistSqrd(point cities[], int i, int j) { return (sqr(cities[i].x-cities[j].x)+ sqr(cities[i].y-cities[j].y)); } void swap(double *p, double *q) { double tmp=*p; *p = *q; *q = tmp; } // tourx and toury aligned on 32-byte boundary void tsp(point cities[], double tourx[], double toury[], int ncities) { int i,kk; tourx[0] = cities[ncities-1].x; toury[0] = cities[ncities-1].y; for (i=1; i= 8) { // scalar prologue int nlast = (k + 3) & (-4); for (; k < nlast; ++k) { double ThisDist = sqr(tourx[k]-ThisX)+sqr(toury[k]-ThisY); if (ThisDist <= CloseDist) { CloseDist = ThisDist; min_k = k; } } // main loop int ni = (ncities - k) / 4; __m256d thisx_v = _mm256_set1_pd(ThisX); __m256d thisy_v = _mm256_set1_pd(ThisY); __m256d minDist_v = _mm256_set1_pd(CloseDist); __m256d minK_v = _mm256_castsi256_pd(_mm256_set1_epi64x(min_k)); // __m256i really, just defined as __m256d to reduce # of casts __m256i k_v = _mm256_setr_epi64x((k+0), (k+1), (k+2), (k+3)); __m256i kInc_v = _mm256_set1_epi64x(4); const __m256d* tourx_v = (const __m256d*)&tourx[k]; const __m256d* toury_v = (const __m256d*)&toury[k]; k += ni*4; for (kk = 0; kk < ni; ++kk) { __m256d dx_v = _mm256_sub_pd(tourx_v[kk], thisx_v); __m256d dy_v = _mm256_sub_pd(toury_v[kk], thisy_v); _mm_prefetch(&tourx_v[kk+10], 0); _mm_prefetch(&toury_v[kk+10], 0); __m256d dist_v = _mm256_fmadd_pd(dy_v, dy_v, _mm256_mul_pd(dx_v, dx_v)); __m256d le_v = _mm256_cmp_pd(dist_v, minDist_v, _CMP_LE_OQ); minDist_v = _mm256_min_pd(minDist_v, dist_v); minK_v = _mm256_blendv_pd(minK_v, _mm256_castsi256_pd(k_v), le_v); k_v = _mm256_add_epi64(k_v, kInc_v); } // fold __m256d minDist_vh = _mm256_permute2f128_pd(minDist_v, minDist_v, 1); __m256d minK_vh = _mm256_permute2f128_pd(minK_v, minK_v, 1); __m256d le_v = _mm256_cmp_pd(minDist_v, minDist_vh, _CMP_LE_OQ); minDist_v = _mm256_blendv_pd(minDist_vh, minDist_v, le_v); minK_v = _mm256_blendv_pd(minK_vh, minK_v, le_v); minDist_vh = _mm256_permute_pd(minDist_v, 1); minK_vh = _mm256_permute_pd(minK_v, 1); le_v = _mm256_cmp_pd(minDist_v, minDist_vh, _CMP_LE_OQ); minDist_v = _mm256_blendv_pd(minDist_vh, minDist_v, le_v); minK_v = _mm256_blendv_pd(minK_vh, minK_v, le_v); _mm_store_sd(&CloseDist, _mm256_castpd256_pd128(minDist_v)); uint64_t min_kd; memcpy(&min_kd, &minK_v, sizeof(uint64_t)); min_k = (int)min_kd; } // scalar epilogue for (; k < ncities; ++k) { double ThisDist = sqr(tourx[k]-ThisX)+sqr(toury[k]-ThisY); if (ThisDist <= CloseDist) { CloseDist = ThisDist; min_k = k; } } swap(&tourx[i],&tourx[min_k]); swap(&toury[i],&toury[min_k]); } } int main(int argc, char *argv[]) { int i, ncities; point *cities; double *tourx; double *toury; FILE *psfile; double sumdist = 0.0; if (argc!=2) { fprintf(stderr, "usage: %s ", argv[0]); exit(1); } ncities = atoi(argv[1]); cities = alloca(ncities*sizeof(point)); tourx=alloca((ncities+4)*sizeof(double))+4*sizeof(double); toury=alloca(ncities*sizeof(double)); for (i=0; i