// Distributed objective function (master/slave over MPI).
//
// The master (rank 0) broadcasts an op code and the current parameter
// vector to all ranks; every rank copies the parameters to its GPU and
// evaluates its local partial error with a Thrust transform_reduce; the
// partials are summed onto the master with MPI_Reduce.
//
//   p: parameter vector, used by the master only — slaves ignore it and
//      receive parameters via MPI_Bcast into a pinned host buffer.
//
// Returns the globally reduced error sum on rank 0. A slave loops until
// the master broadcasts op==0, then frees its buffer and returns 0.
//
// NOTE(review): return codes of cudaSetDevice/cudaHostAlloc and the MPI
// calls are not checked — consider wrapping them in error-check macros.
Real objFunc(Real *p)
{
  int rank, op;
  Real sum = 0.;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  cudaSetDevice(rank % nGPUperNode); // one GPU per rank, round-robin per node
  if(nExamples == 0) {
    cerr << "data not set " << endl; exit(1);
  }

  // Select the MPI datatype matching Real once, instead of branching
  // with sizeof(Real) around every broadcast/reduce call.
  MPI_Datatype mpiReal =
      (sizeof(Real) == sizeof(float)) ? MPI_FLOAT : MPI_DOUBLE;

  CalcError getError(thrust::raw_pointer_cast(&d_data[0]),
                     thrust::raw_pointer_cast(&d_param[0]),
                     nInput, exLen);

  if(rank > 0) { // slave objective function
    Real *param;
    // Pinned, portable host buffer so the parameter broadcast lands in
    // page-locked memory for the subsequent host->device copy.
    // BUG FIX: was "cudaHostAlloc(¶m, ...)" — '&param' had been
    // mangled into a pilcrow character and could not compile.
    cudaHostAlloc((void**)&param, sizeof(Real)*nParam, cudaHostAllocPortable);
    for(;;) { // loop until the master says I am done.
      MPI_Bcast(&op, 1, MPI_INT, 0, MPI_COMM_WORLD);
      if(op == 0) { // shutdown code from the master
        cudaFreeHost(param);
        return(0);
      }
      // BUG FIX: was "MPI_Bcast(¶m[0], ...)" — mangled '&param[0]'.
      MPI_Bcast(&param[0], nParam, mpiReal, 0, MPI_COMM_WORLD);
      thrust::copy(param, param+nParam, d_param.begin());
      // Sum this rank's per-example errors on the GPU.
      Real mySum = thrust::transform_reduce(
          thrust::counting_iterator<unsigned int>(0),
          thrust::counting_iterator<unsigned int>(nExamples),
          getError,
          (Real) 0.,
          thrust::plus<Real>());
      MPI_Reduce(&mySum, &sum, 1, mpiReal, MPI_SUM, 0, MPI_COMM_WORLD);
    }
  } else { // master process
    double startTime = omp_get_wtime();
    op = 1; // tell the slaves another evaluation is coming
    MPI_Bcast(&op, 1, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Bcast(&p[0], nParam, mpiReal, 0, MPI_COMM_WORLD);
    thrust::copy(p, p+nParam, d_param.begin());
    // Master also evaluates its own share of the examples.
    Real mySum = thrust::transform_reduce(
        thrust::counting_iterator<unsigned int>(0),
        thrust::counting_iterator<unsigned int>(nExamples),
        getError,
        (Real) 0.,
        thrust::plus<Real>());
    MPI_Reduce(&mySum, &sum, 1, mpiReal, MPI_SUM, 0, MPI_COMM_WORLD);
    objFuncCallTime += (omp_get_wtime() - startTime);
    objFuncCallCount++;
  }
  return(sum);
}