Home

Table of Contents Algorithm Code Old Compile and Run

Algorithm

See Wikipedia.

Code

See https://github.com/jeffhammond/KahanMPI

Old

#ifdef DEBUG
#  include <cstdio>
#endif

#include <cstdlib>  /* declares NULL    */
#include <cmath>    /* declares ldexp() */
#include <new>      /* declares new[]   */
#include <mpi.h>

template <class T>
T BasicSum(int n, T * input) 
{
    T sum = T(0);
    for (int i=0; i<n; ++i) 
        sum += input[i];
    return sum;
}

/* Adapted from http://en.wikipedia.org/wiki/Kahan_summation_algorithm */
template <class T>
T KahanSum(int n, T * input) 
{
    T sum = T(0);
    T c   = T(0);                // A running compensation for lost low-order bits.
    for (int i=0; i<n; ++i) {
        T y = input[i] - c;      // So far, so good: c is zero.
        T t = sum + y;           // Alas, sum is big, y small, so low-order digits of y are lost.
        c = (t - sum) - y;       // (t - sum) recovers the high-order part of y; subtracting y recovers -(low part of y)
        sum = t;                 // Algebraically, c should always be zero. Beware overly-aggressive optimising compilers!
    }                            // Next time around, the lost low part will be added to y in a fresh attempt.
    return sum;
}

#ifdef TEST_KAHAN_SUM
int main(int argc, char * argv[])
{
    int n = (argc>1) ? atoi(argv[1]) : 100;
    if (n<100) n*=100;

    printf("Testing KahanSum for n=%d\n", n);

    double * buf1 = new double[n];
    double * buf2 = new double[n];

    for (int i=0; i<n; ++i)
        buf1[i] = ldexp(1.0, i%64 - 32);
    for (int i=0; i<n; ++i)
        buf2[i] = buf1[n-i-1];
#ifdef DEBUG
    for (int i=0; i<n; ++i)
        printf("buf1[%d] = %40.20lf buf2[%d] = %40.20lf \n", i, buf1[i], i, buf2[i] );
#endif

    double b1 = BasicSum<double>(n,buf1);
    double k1 = KahanSum<double>(n,buf1);
    double b2 = BasicSum<double>(n,buf2);
    double k2 = KahanSum<double>(n,buf2);

    delete[] buf2;
    delete[] buf1;

    printf("BasicSum result 1 = %40.20lf \n", b1);
    printf("KahanSum result 1 = %40.20lf \n", k1);
    printf("difference      1 = %40.20lf \n", b1-k1);
    printf("BasicSum result 2 = %40.20lf \n", b2);
    printf("KahanSum result 2 = %40.20lf \n", k2);
    printf("difference      2 = %40.20lf \n", b2-k2);
    fflush(stdout);

    return 0;
}
#endif

template <class T>
void KahanWrapper(int commsize, int count, T * tempbuf, T * result) 
{
        /* The Kahan algorithm reduces a contiguous vector to a scalar.
         * Since we do not get a contiguous vector from (all)gather unless count=1,
         * we need to transpose the data, which is either hard in the case of in-place
         * or uses twice the memory in the case of out-of-place, or we need to form
         * each contiguous vector as needed.  We chose the latter. */

        T * kahanbuf = new T[commsize];
        //rc = MPI_Alloc_mem( (MPI_Aint)(commsize*sizeof(T)), MPI_INFO_NULL, (void**) &kahanbuf);
        //if (rc!=MPI_SUCCESS) return rc;

        for (int j=0; j<count; ++j) {
            for (int i=0; i<commsize; ++i) {
                kahanbuf[i] = tempbuf[i*commsize+j];
            }
            result[j] = KahanSum<T>(commsize, kahanbuf);
        }

        delete[] kahanbuf;
        //rc = MPI_Free_mem((void*)kahanbuf);
        //if (rc!=MPI_SUCCESS) return rc;

}

#ifdef TEST_KAHAN_WRAPPER
int main(int argc, char * argv[])
{
    return 0;
}
#endif

#ifdef __cplusplus
extern "C" {
#endif

int My_Allreduce_kahan(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm ) 
{
    int rc = MPI_SUCCESS;

#ifdef DEBUG
    fprintf(stderr, "Using deterministic MPI_Allreduce \n");
#endif

    int commrank = 0;
    rc = MPI_Comm_rank(comm, &commrank);
    if (rc!=MPI_SUCCESS) return rc;

    int commsize = 0;
    rc = MPI_Comm_size(comm, &commsize);
    if (rc!=MPI_SUCCESS) return rc;

    int typesize = 0;
    rc = MPI_Type_size(datatype, &typesize);
    if (rc!=MPI_SUCCESS) return rc;

    void * tempbuf = NULL;
    rc = MPI_Alloc_mem( (MPI_Aint)(commsize*count*typesize), MPI_INFO_NULL, &tempbuf);
    if (rc!=MPI_SUCCESS) return rc;

#ifdef USE_LOG_N_ALGORITHM
    int root = 0;
    rc = MPI_Gather(sendbuf, count, datatype, tempbuf, count, datatype, root, comm);
#else
    rc = MPI_Allgather(sendbuf, count, datatype, tempbuf, count, datatype, comm);
#endif
    if (rc!=MPI_SUCCESS) return rc;

#ifdef USE_LOG_N_ALGORITHM
    if (commrank==root)
#endif
    {
        /* reduce tempbuf into recvbuf */
#if defined(USE_REDUCE_LOCAL)
        for (int i=0; i<commsize; ++i) {
            rc = MPI_Reduce_local( &(tempbuf[i*count]), recvbuf, count, datatype, op);
            if (rc!=MPI_SUCCESS) return rc;
        }
#elif defined(USE_KAHAN_SUM)
        if (datatype==MPI_DOUBLE)
            KahanWrapper<double>(commsize, count, (double*)tempbuf, (double*)recvbuf);
        else {
            MPI_Abort(MPI_COMM_WORLD, 1);
            printf("KahanWrapper not instantiated for this type! \n");
        }
#else
#  error You need to define either USE_REDUCE_LOCAL or USE_KAHAN_SUM!
#endif
    }

#ifdef USE_KAHAN_SUM
#endif

#ifdef USE_LOG_N_ALGORITHM
    rc = MPI_Bcast(recvbuf, count, datatype, root, comm);
    if (rc!=MPI_SUCCESS) return rc;
#endif

    return MPI_SUCCESS;
}

/* MPI 3.0 added const to input i.e. read-only arguments... */
#if MPI_VERSION >= 3
int MPI_Allreduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm ) 
#else
int MPI_Allreduce(void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm ) 
#endif
{
    int rc = MPI_SUCCESS;

    if (op==MPI_SUM)
        rc = My_Allreduce_kahan(sendbuf, recvbuf, count, datatype, op, comm); 
    else
        rc = PMPI_Allreduce(sendbuf, recvbuf, count, datatype, op, comm);
    if (rc!=MPI_SUCCESS) return rc;

    return MPI_SUCCESS;
}

#ifdef __cplusplus
}
#endif

Compile and Run

mpicxx -g -Wall -O2 -DUSE_KAHAN_SUM -DTEST_KAHAN_SUM allreduce-stable.cc -lm -o allreduce-stable.x
./allreduce-stable.x 10000000

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Home

Table of Contents

Algorithm

Code

Old

Compile and Run

Clone this wiki locally