#include #include #include #include #include #include #include typedef struct { double x; double y; } point; double sqr(double x) { return x*x; } double dist(double x[], double y[], int i, int j) { return sqrt(sqr(x[i]-x[j])+ sqr(y[i]-y[j])); } double DistSqrd(point cities[], int i, int j) { return (sqr(cities[i].x-cities[j].x)+ sqr(cities[i].y-cities[j].y)); } void swap(double *p, double *q) { double tmp=*p; *p = *q; *q = tmp; } // tourx and toury aligned on 32-byte boundaries void tsp(point cities[], double tourx[], double toury[], int ncities) { int i,kk; tourx[0] = cities[ncities-1].x; toury[0] = cities[ncities-1].y; for (i=1; i 0) { // main loop __m256d thisx_v = _mm256_set1_pd(ThisX); __m256d thisy_v = _mm256_set1_pd(ThisY); __m256d minDist_v = _mm256_set1_pd(CloseDist); __m256d minK_v = _mm256_set1_pd((double)min_k); __m256d k_v = _mm256_setr_pd((double)(k+0), (double)(k+1), (double)(k+2), (double)(k+3)); __m256d kInc_v = _mm256_set1_pd((double)4); const __m256d* tourx_v = (const __m256d*)&tourx[k]; const __m256d* toury_v = (const __m256d*)&toury[k]; k += ni*4; for (kk = 0; kk < ni; ++kk) { __m256d dx_v = _mm256_sub_pd(tourx_v[kk], thisx_v); __m256d dy_v = _mm256_sub_pd(toury_v[kk], thisy_v); __m256d dist_v = _mm256_add_pd(_mm256_mul_pd(dx_v, dx_v),_mm256_mul_pd(dy_v, dy_v)); __m256d le_v = _mm256_cmp_pd(dist_v, minDist_v, _CMP_LE_OQ); minDist_v = _mm256_blendv_pd(minDist_v, dist_v, le_v); minK_v = _mm256_blendv_pd(minK_v, k_v, le_v); k_v = _mm256_add_pd(k_v, kInc_v); } // fold __m256d minDist_vh = _mm256_permute2f128_pd(minDist_v, minDist_v, 1); __m256d minK_vh = _mm256_permute2f128_pd(minK_v, minK_v, 1); __m256d le_v = _mm256_cmp_pd(minDist_v, minDist_vh, _CMP_LE_OQ); minDist_v = _mm256_blendv_pd(minDist_vh, minDist_v, le_v); minK_v = _mm256_blendv_pd(minK_vh, minK_v, le_v); minDist_vh = _mm256_permute_pd(minDist_v, 1); minK_vh = _mm256_permute_pd(minK_v, 1); le_v = _mm256_cmp_pd(minDist_v, minDist_vh, _CMP_LE_OQ); minDist_v = _mm256_blendv_pd(minDist_vh, minDist_v, le_v); minK_v = _mm256_blendv_pd(minK_vh, minK_v, le_v); // CloseDist = (double)_mm256_castpd256_pd128(minDist_v); // min_k = (double)_mm256_castpd256_pd128(minK_v); memcpy(&CloseDist, &minDist_v, sizeof(double)); double min_kd; memcpy(&min_kd, &minK_v, sizeof(double)); min_k = (int)min_kd; } // scalar epilog for (; k < ncities; ++k) { double ThisDist = sqr(tourx[k]-ThisX)+sqr(toury[k]-ThisY); if (ThisDist <= CloseDist) { CloseDist = ThisDist; min_k = k; } } swap(&tourx[i],&tourx[min_k]); swap(&toury[i],&toury[min_k]); } } // void tsp(point cities[], double tourx[], double toury[], int ncities) // { // int i; // double *ClosePt=tourx; // double CloseDist; // double *p; // // for (i=1; iCloseDist); // // ThisDist += sqr(toury[p-tourx]-ThisY); // if (p < tourx+i) // break; // CloseDist = sqr(*p-ThisX)+sqr(toury[p-tourx]-ThisY); // ClosePt = p; // } // swap(&tourx[i],ClosePt); // swap(&toury[i],&toury[ClosePt-tourx]); // } // } int main(int argc, char *argv[]) { int i, ncities; point *cities; double *tourx; double *toury; FILE *psfile; double sumdist = 0.0; if (argc!=2) { fprintf(stderr, "usage: %s ", argv[0]); exit(1); } ncities = atoi(argv[1]); cities = alloca(ncities*sizeof(point)); tourx=alloca((ncities+4)*sizeof(double))+4*sizeof(double); toury=alloca(ncities*sizeof(double)); for (i=0; i