-
Notifications
You must be signed in to change notification settings - Fork 0
Home
Jeff Hammond edited this page Jun 29, 2014
·
1 revision
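See the Wikipedia article on the Kahan summation algorithm: http://en.wikipedia.org/wiki/Kahan_summation_algorithm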
See https://github.com/jeffhammond/KahanMPI
#include <cstdio> /* declares printf(), fprintf(), fflush() -- used even when DEBUG is not defined */
#include <cstdlib> /* declares NULL */
#include <cstring> /* declares memcpy() */
#include <cmath> /* declares ldexp() */
#include <new> /* declares new[] */
#include <mpi.h>
/* Plain left-to-right accumulation of the first n elements of input.
 * Returns T(0) when n is zero or negative. No rounding-error compensation
 * is performed. */
template <class T>
T BasicSum(int n, T * input)
{
    T total = T(0);
    const T * cursor = input;
    for (int remaining = n; remaining > 0; --remaining) {
        total += *cursor;
        ++cursor;
    }
    return total;
}
/* Adapted from http://en.wikipedia.org/wiki/Kahan_summation_algorithm */
/* Compensated (Kahan) summation of the first n elements of input.
 * Returns T(0) when n is zero or negative.
 * The order of the floating-point operations below is what makes the
 * algorithm work; do not re-associate them. */
template <class T>
T KahanSum(int n, T * input)
{
    T total = T(0);
    T lost  = T(0); // Low-order bits dropped by the previous addition (with opposite sign).
    int idx = 0;
    while (idx < n) {
        const T corrected = input[idx] - lost;   // Feed the previously lost bits back in.
        const T updated   = total + corrected;   // Low-order bits of 'corrected' may be rounded away here.
        lost  = (updated - total) - corrected;   // What was actually added minus what should have been added.
        total = updated;                         // Algebraically 'lost' is zero; in floating point it is not.
        ++idx;
    }
    return total;
}
#ifdef TEST_KAHAN_SUM
/* Stand-alone driver (built with -DTEST_KAHAN_SUM) that compares BasicSum and
 * KahanSum on the same data summed in two opposite orders.
 *
 * argv[1] (optional): number of elements n, default 100. Values below 100 are
 * multiplied by 100. Non-positive values are rejected, because they would
 * otherwise be passed to new[] as a negative array length.
 *
 * Returns 0 on success and 1 on invalid input. */
int main(int argc, char * argv[])
{
    int n = (argc>1) ? atoi(argv[1]) : 100;
    if (n<=0) {
        fprintf(stderr, "n must be a positive integer (got %d)\n", n);
        return 1;
    }
    if (n<100) n*=100;

    printf("Testing KahanSum for n=%d\n", n);

    double * buf1 = new double[n];
    double * buf2 = new double[n];

    /* buf1 cycles through the powers of two 2^-32 ... 2^31; buf2 is buf1 reversed. */
    for (int i=0; i<n; ++i)
        buf1[i] = ldexp(1.0, i%64 - 32);
    for (int i=0; i<n; ++i)
        buf2[i] = buf1[n-i-1];

#ifdef DEBUG
    for (int i=0; i<n; ++i)
        printf("buf1[%d] = %40.20lf buf2[%d] = %40.20lf \n", i, buf1[i], i, buf2[i] );
#endif

    double b1 = BasicSum<double>(n,buf1);
    double k1 = KahanSum<double>(n,buf1);
    double b2 = BasicSum<double>(n,buf2);
    double k2 = KahanSum<double>(n,buf2);

    delete[] buf2;
    delete[] buf1;

    printf("BasicSum result 1 = %40.20lf \n", b1);
    printf("KahanSum result 1 = %40.20lf \n", k1);
    printf("difference 1 = %40.20lf \n", b1-k1);
    printf("BasicSum result 2 = %40.20lf \n", b2);
    printf("KahanSum result 2 = %40.20lf \n", k2);
    printf("difference 2 = %40.20lf \n", b2-k2);
    fflush(stdout);

    return 0;
}
#endif
/* Reduce the output of an (all)gather to one Kahan-compensated sum per element.
 *
 * commsize : number of contributing ranks (blocks in tempbuf)
 * count    : number of elements contributed by each rank
 * tempbuf  : gathered data, commsize blocks of count elements each, so that
 *            element j of rank i is tempbuf[i*count + j]
 * result   : output, count elements; result[j] is the sum over all ranks of
 *            element j, accumulated in rank order
 *
 * The values that make up one result are not contiguous in tempbuf (they are
 * count elements apart), so the same recurrence as KahanSum is applied here
 * with a stride. That avoids a scratch buffer, the copy into it, and any
 * allocation failure inside the reduction. */
template <class T>
void KahanWrapper(int commsize, int count, T * tempbuf, T * result)
{
    for (int j=0; j<count; ++j) {
        T sum = T(0);
        T c   = T(0); // Running compensation for lost low-order bits.
        for (int i=0; i<commsize; ++i) {
            /* Rank i's block starts at i*count (not i*commsize); widen before
             * multiplying so large buffers do not overflow int. */
            const long long idx = static_cast<long long>(i) * count + j;
            const T y = tempbuf[idx] - c;
            const T t = sum + y;
            c = (t - sum) - y; // Do not re-associate: this recovers the rounding error of sum + y.
            sum = t;
        }
        result[j] = sum;
    }
}
#ifdef TEST_KAHAN_WRAPPER
/* Placeholder driver selected by -DTEST_KAHAN_WRAPPER: it exercises nothing
 * and simply reports success. */
int main(int argc, char * argv[])
{
    (void)argc;
    (void)argv;
    return EXIT_SUCCESS;
}
#endif
#ifdef __cplusplus
extern "C" {
#endif
int My_Allreduce_kahan(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm )
{
int rc = MPI_SUCCESS;
#ifdef DEBUG
fprintf(stderr, "Using deterministic MPI_Allreduce \n");
#endif
int commrank = 0;
rc = MPI_Comm_rank(comm, &commrank);
if (rc!=MPI_SUCCESS) return rc;
int commsize = 0;
rc = MPI_Comm_size(comm, &commsize);
if (rc!=MPI_SUCCESS) return rc;
int typesize = 0;
rc = MPI_Type_size(datatype, &typesize);
if (rc!=MPI_SUCCESS) return rc;
void * tempbuf = NULL;
rc = MPI_Alloc_mem( (MPI_Aint)(commsize*count*typesize), MPI_INFO_NULL, &tempbuf);
if (rc!=MPI_SUCCESS) return rc;
#ifdef USE_LOG_N_ALGORITHM
int root = 0;
rc = MPI_Gather(sendbuf, count, datatype, tempbuf, count, datatype, root, comm);
#else
rc = MPI_Allgather(sendbuf, count, datatype, tempbuf, count, datatype, comm);
#endif
if (rc!=MPI_SUCCESS) return rc;
#ifdef USE_LOG_N_ALGORITHM
if (commrank==root)
#endif
{
/* reduce tempbuf into recvbuf */
#if defined(USE_REDUCE_LOCAL)
for (int i=0; i<commsize; ++i) {
rc = MPI_Reduce_local( &(tempbuf[i*count]), recvbuf, count, datatype, op);
if (rc!=MPI_SUCCESS) return rc;
}
#elif defined(USE_KAHAN_SUM)
if (datatype==MPI_DOUBLE)
KahanWrapper<double>(commsize, count, (double*)tempbuf, (double*)recvbuf);
else {
MPI_Abort(MPI_COMM_WORLD, 1);
printf("KahanWrapper not instantiated for this type! \n");
}
#else
# error You need to define either USE_REDUCE_LOCAL or USE_KAHAN_SUM!
#endif
}
#ifdef USE_KAHAN_SUM
#endif
#ifdef USE_LOG_N_ALGORITHM
rc = MPI_Bcast(recvbuf, count, datatype, root, comm);
if (rc!=MPI_SUCCESS) return rc;
#endif
return MPI_SUCCESS;
}
/* MPI 3.0 added const to input i.e. read-only arguments... */
/* Interposed MPI_Allreduce: sums that the deterministic implementation can
 * handle are routed to My_Allreduce_kahan; everything else is forwarded
 * unchanged to the library through the profiling entry point PMPI_Allreduce.
 *
 * When only USE_KAHAN_SUM is enabled, My_Allreduce_kahan supports MPI_DOUBLE
 * exclusively and aborts the job for other types, so sums over other
 * datatypes are forwarded to the library instead of being intercepted.
 *
 * Returns the error code of whichever implementation was called. */
#if MPI_VERSION >= 3
int MPI_Allreduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm )
#else
int MPI_Allreduce(void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm )
#endif
{
#if defined(USE_KAHAN_SUM) && !defined(USE_REDUCE_LOCAL)
    const bool deterministic = (op==MPI_SUM) && (datatype==MPI_DOUBLE);
#else
    const bool deterministic = (op==MPI_SUM);
#endif

    if (deterministic)
        return My_Allreduce_kahan(sendbuf, recvbuf, count, datatype, op, comm);

    return PMPI_Allreduce(sendbuf, recvbuf, count, datatype, op, comm);
}
#ifdef __cplusplus
}
#endif
mpicxx -g -Wall -O2 -DUSE_KAHAN_SUM -DTEST_KAHAN_SUM allreduce-stable.cc -lm -o allreduce-stable.x
./allreduce-stable.x 10000000