BROWSER/doc_surf81/order__independent__summation__mod_8F90_source.html

 MODULE order_independent_summation_mod

 !**** ORDER_INDEPENDENT_SUMMATION_MOD

 !     Purpose.
 !     --------
 !     Functions to perform global (over all processors) and
 !     local (per-processor) order-independent accurate summation
 !     and order-independent inner products.

 !**   Interface.
 !     ----------

 !        result = ORDER_INDEP_GLOBAL_SUM (P1)
 !        result = ORDER_INDEP_LOCAL_SUM (P1)
 !        result = ORDER_INDEP_DOT_PRODUCT (P1,P2,PW)

 !        Input required arguments :
 !        -------------------------
 !           P1       -  A 1d array of KIND=JPRB reals
 !           P2       -  A 1d array of KIND=JPRB reals. Same length as P1.

 !        Input optional arguments :
 !        -------------------------
 !           PW       -  A 1d array of KIND=JPRB reals. Same length as P1.
 !                       If specified, ORDER_INDEP_DOT_PRODUCT returns
 !                       a weighted inner product. PW defines the weights.
 !                       If not specified, ORDER_INDEP_DOT_PRODUCT returns
 !                       an unweighted dot product of P1 and P2.
 !
 !           KNG (global sum only) - Global length of input array.
 !
 !           LD_ABORT_IFNOT_REPROD - Abort if results are not guaranteed
 !                                   to be bit-reproducible. (Default=T)
 !
 !           LD_OPENMP             - .TRUE. => use openMP (Default is
 !                                   .TRUE. for global sum, .FALSE. for
 !                                   local sum.)

 !        Output required arguments :
 !        -------------------------
 !           none

 !     Author.
 !     -------
 !        Mike Fisher  ECMWF

 !     Modifications.
 !     --------------
 !        Original: 2006-20-22

 !     ------------------------------------------------------------------

 USE parkind1  ,ONLY : jpim     ,jprb
 USE compensated_summation_mod, ONLY : compensated_sum, compensated_sum_omp, &
                                 & compensated_dot_product, &
                                 & compensated_dot_product_omp
 #ifdef SFX_MPI
 USE mpl_module, ONLY : mpl_allreduce, mpl_allgatherv, mpl_myrank, mpl_nproc, &
                      & mpl_message, mpl_send, mpl_recv, mpl_wait, &
                      & jp_non_blocking_standard
 #endif
 USE yomhook   , ONLY : lhook,   dr_hook

 !--- Module-wide defaults: everything is private unless it is listed
 !--- in the PUBLIC statement below.

 SAVE
 PRIVATE
 PUBLIC order_indep_local_sum, &
      & order_indep_global_sum, &
      & order_indep_global_sum2, &
      & order_indep_allreduce, &
      & order_indep_dot_product

 !--- Generic interfaces. Each generic name currently maps onto the single
 !--- module procedure of the same name defined after CONTAINS.

 INTERFACE order_indep_local_sum
   MODULE PROCEDURE order_indep_local_sum
 END INTERFACE

 INTERFACE order_indep_global_sum
   MODULE PROCEDURE order_indep_global_sum
 END INTERFACE

 INTERFACE order_indep_global_sum2
   MODULE PROCEDURE order_indep_global_sum2
 END INTERFACE

 INTERFACE order_indep_allreduce
   MODULE PROCEDURE order_indep_allreduce
 END INTERFACE

 INTERFACE order_indep_dot_product
   MODULE PROCEDURE order_indep_dot_product
 END INTERFACE

 CONTAINS

 FUNCTION order_indep_local_sum (PIN,LD_ABORT_IFNOT_REPROD,LD_OPENMP)

 !-----------------------------------------------------------------
 !  Returns an accurate local (i.e. on a single processor) sum of
 !  the elements of PIN. The sum is intended to be bit-reproducible
 !  for any ordering of the elements of PIN.
 !
 !  NB: PIN is unmodified on return (the work is done on a copy).
 !
 ! Arguments:
 ! ----------
 !
 ! Required:
 !
 !   PIN                   - INTENT(IN) - The array to be summed.
 !
 ! Optional:
 !
 !   LD_ABORT_IFNOT_REPROD - INTENT(IN) - If .TRUE. (the default), request
 !                                        an abort through MPL_MESSAGE when
 !                                        the error bound cannot be refined.
 !                                        MPL_MESSAGE is only compiled in
 !                                        when SFX_MPI is defined.
 !
 !   LD_OPENMP             - INTENT(IN) - If .TRUE., use COMPENSATED_SUM_OMP
 !                                        rather than COMPENSATED_SUM.
 !                                        Default is .FALSE.
 !
 ! Result:
 ! -------
 !
 !   ROUND(ZRES) where ZRES is the refined sum, or zero if PIN is empty.
 !
 ! Algorithm:
 ! ----------
 !
 !  The algorithm is based on Ogita et al. (2005) SIAM J. Sci. Computing,
 !  Vol.26, No.6, pp1955-1988. This is based in turn on an algorithm
 !  by Knuth (1969, seminumerical algorithms).
 !
 !  This version iterates the compensated sum algorithm until the
 !  computed error bound is below 4*SPACING(result). It then rounds
 !  the result to the nearest floating-point number whose last three
 !  bits are zero, thereby guaranteeing an order-independent result.
 !
 !  If the error bound stops decreasing (or is not a number), the
 !  iteration is abandoned: a warning is issued, an abort is requested
 !  if LD_ABORT_IFNOT_REPROD is set, and otherwise the best available
 !  estimate is returned. In that case reproducibility is NOT guaranteed.
 !
 !  Author: Mike Fisher ECMWF 2006/02/08
 !
 !-----------------------------------------------------------------

 IMPLICIT NONE

 REAL(KIND=JPRB) :: ORDER_INDEP_LOCAL_SUM
 REAL(KIND=JPRB), INTENT(IN) :: PIN(:)
 LOGICAL,OPTIONAL,INTENT(IN) :: LD_ABORT_IFNOT_REPROD, LD_OPENMP

 INTEGER(KIND=JPIM) :: IN
 REAL(KIND=JPRB) :: ZCORR,ZERR,ZOLDERR,ZBETA,ZRES,ZTWON_EPS
 REAL(KIND=JPRB), ALLOCATABLE :: ZP(:)
 LOGICAL :: LLABORT, LL_OPENMP
 REAL(KIND=JPRB) :: ZHOOK_HANDLE

 !--- Number of "failed to refine" events so far; kept across calls so
 !--- that the warning output can be capped.

 INTEGER(KIND=JPIM), SAVE :: INMSG=0

 IF (lhook) CALL dr_hook ('ORDER_INDEPENDENT_SUMMATION_MOD:ORDER_INDEP_LOCAL_SUM', &
                       &  0,zhook_handle)

 IF (PRESENT(ld_abort_ifnot_reprod)) THEN
   llabort = ld_abort_ifnot_reprod
 ELSE
   llabort = .true.
 ENDIF

 IF (PRESENT(ld_openmp)) THEN
   ll_openmp = ld_openmp
 ELSE
   ll_openmp = .false.
 ENDIF

 in = SIZE(pin)

 !--- 2*n*eps, evaluated in real arithmetic. Forming 2*IN as an integer
 !--- first would overflow KIND=JPIM for very long arrays and silently
 !--- corrupt both the check below and the error bound.

 ztwon_eps = 2.0_jprb*REAL(in,jprb)*EPSILON(zres)

 IF (ztwon_eps >= 1.0_jprb) THEN
 #ifdef SFX_MPI
   CALL mpl_message (cdmessage='n is too large to guarantee error bounds', &
                     & cdstring='ORDER_INDEP_LOCAL_SUM',ldabort=.true.)
 #endif
 ENDIF

 test_array_length: IF (in>0) THEN
   zolderr = huge(zerr)

 !--- Copy the input array. This avoids some tricky indexing, at the
 !--- expense of some inefficency.

   ALLOCATE (zp(in))
   zp(:) = pin(:)

   k_loop: DO

 !--- transform local arrays

     IF (ll_openmp) THEN
       CALL compensated_sum_omp (zp,in,zcorr,zerr)
     ELSE
       CALL compensated_sum (zp,in,zcorr,zerr)
     ENDIF

 !--- Calculate final result

     zres = zp(in) + zcorr

 !--- Calculate error bound. This is corollary 4.7 from Ogita et al. (2005)

     zbeta = (zerr*ztwon_eps)/(1.0_jprb - ztwon_eps)

     zerr = epsilon(zres)*abs(zres) &
        & +(zbeta + ( 2.0_jprb*epsilon(zres)*epsilon(zres)*abs(zres) &
        &            +3.0_jprb*tiny(zres)))

 !--- exit if the error is small enough

     IF (zerr<4.0_jprb*spacing(zres)) EXIT k_loop

 !--- Take appropriate action if ZRES cannot be sufficiently refined.
 !--- The test is written as .NOT.(a<b) so that a NaN error bound
 !--- (e.g. NaN in the input) is also treated as a failure; with
 !--- (a>=b) a NaN would make both tests false and loop forever.

     IF (.NOT.(zerr < zolderr)) THEN
       inmsg=inmsg+1

       IF (inmsg<=100) THEN
 #ifdef SFX_MPI
         CALL mpl_message ( &
             & cdmessage= 'ORDER_INDEP_LOCAL_SUM: FALIED TO REFINE SUM', &
             & cdstring='ORDER_INDEP_LOCAL_SUM')
         CALL mpl_message ( &
             & cdmessage='WARNING: POSSIBLITY OF NON-REPRODUCIBLE RESULTS',&
             & cdstring='ORDER_INDEP_LOCAL_SUM')
 #endif
       ENDIF

       IF (inmsg==100) THEN
 #ifdef SFX_MPI
         CALL mpl_message ( &
             & cdmessage='ORDER_INDEP_LOCAL_SUM: INMSG>100. OUTPUT SUPPRESSED',&
             & cdstring='ORDER_INDEP_LOCAL_SUM')
 #endif
       ENDIF

       IF (llabort) THEN
 #ifdef SFX_MPI
         CALL mpl_message (cdmessage= &
                         & 'ABORT BECAUSE LD_ABORT_IFNOT_REPROD WAS SET', &
                         & cdstring='ORDER_INDEP_LOCAL_SUM',ldabort=.true.)
 #endif
       ENDIF

 !--- No abort took place: further passes are not making progress, so
 !--- stop iterating and return the best estimate instead of spinning.

       EXIT k_loop
     ENDIF

     zolderr = zerr

   ENDDO k_loop

 !--- On normal exit the error bound is below 4*SPACING(ZRES), so only
 !--- a handful of floating point numbers can lie between ZRES and the
 !--- exact sum. Rounding to the nearest number that has its last three
 !--- bits zero then gives a reproducible result.

   order_indep_local_sum = round(zres)

   DEALLOCATE (zp)
 ELSE test_array_length

   order_indep_local_sum = 0.0_jprb

 ENDIF test_array_length

 IF (lhook) CALL dr_hook ('ORDER_INDEPENDENT_SUMMATION_MOD:ORDER_INDEP_LOCAL_SUM', &
                       &  1,zhook_handle)

 END FUNCTION order_indep_local_sum

 FUNCTION order_indep_global_sum (PIN,KNG,LD_ABORT_IFNOT_REPROD, LD_OPENMP)

 !-----------------------------------------------------------------
 !
 !  Returns an accurate global sum of the elements of PIN. The
 !  sum is intended to be bit-reproducible for any distribution of
 !  PIN over threads and tasks, and independent of the ordering of
 !  the elements of PIN.
 !
 !  NB: PIN is unmodified on return (the work is done on a copy).
 !
 ! Arguments:
 ! ----------
 !
 ! Required:
 !
 !   PIN                  - INTENT(IN) - The array to be summed.
 !
 !
 ! Optional:
 !
 !   KNG                  - INTENT(IN) - Global length of array. If absent
 !                                       it is obtained by summing SIZE(PIN)
 !                                       over tasks.
 !
 !   LD_ABORT_IFNOT_REPROD - INTENT(IN) - If .TRUE. (the default), request
 !                                        an abort through MPL_MESSAGE when
 !                                        the error bound cannot be refined.
 !                                        MPL_MESSAGE is only compiled in
 !                                        when SFX_MPI is defined.
 !
 !   LD_OPENMP            - INTENT(IN) - Use OpenMP parallelization
 !                                       (COMPENSATED_SUM_OMP). Default is
 !                                       .TRUE.
 !
 !
 ! Algorithm:
 ! ----------
 !
 !  The algorithm is based on Ogita et al. (2005) SIAM J. Sci. Computing,
 !  Vol.26, No.6, pp1955-1988. This is based in turn on an algorithm
 !  by Knuth (1969, seminumerical algorithms).
 !
 !  This version adds a second layer of parallelism on top of that
 !  provided by COMPENSATED_SUM_OMP. It iterates the compensated
 !  summation until the computed error bound is below
 !  4*SPACING(result). It then rounds the result to the nearest
 !  floating-point number whose last three bits are zero, thereby
 !  guaranteeing an order-independent result.
 !
 !  If the error bound stops decreasing (or is not a number), the
 !  iteration is abandoned: a warning is issued, an abort is requested
 !  if LD_ABORT_IFNOT_REPROD is set, and otherwise the best available
 !  estimate is returned. In that case reproducibility is NOT guaranteed.
 !  The decision is taken on gathered quantities, so every task takes
 !  the same branch.
 !
 !  Author: Mike Fisher ECMWF 2006/02/08
 !
 !-----------------------------------------------------------------

 IMPLICIT NONE

 REAL(KIND=JPRB) :: ORDER_INDEP_GLOBAL_SUM

 REAL(KIND=JPRB),             INTENT(IN) :: PIN(:)
 INTEGER(KIND=JPIM),OPTIONAL, INTENT(IN) :: KNG
 LOGICAL,           OPTIONAL, INTENT(IN) :: LD_ABORT_IFNOT_REPROD, LD_OPENMP

 INTEGER(KIND=JPIM) :: J,IN,ING,INPROC
 REAL(KIND=JPRB) :: ZCORR,ZERR,ZOLDERR,ZBUFFL(3),ZBETA,ZRES,ZTWON_EPS
 REAL(KIND=JPRB), ALLOCATABLE :: ZPSUMS(:),ZPERRS(:),ZPCORS(:), &
                               & ZBUFFG(:),ZP(:)
 INTEGER(KIND=JPIM), ALLOCATABLE :: IRECVCOUNTS(:)
 LOGICAL :: LLABORT, LL_OPENMP
 REAL(KIND=JPRB) :: ZHOOK_HANDLE

 !--- Number of "failed to refine" events so far; kept across calls so
 !--- that the warning output can be capped.

 INTEGER(KIND=JPIM), SAVE :: INMSG=0

 IF (lhook) CALL dr_hook ('ORDER_INDEPENDENT_SUMMATION_MOD:ORDER_INDEP_GLOBAL_SUM', &
                       &  0,zhook_handle)

 #ifdef SFX_MPI
 inproc = mpl_nproc()
 #else
 inproc = 1
 #endif
 IF (PRESENT(ld_abort_ifnot_reprod)) THEN
   llabort = ld_abort_ifnot_reprod
 ELSE
   llabort = .true.
 ENDIF

 IF (PRESENT(ld_openmp)) THEN
   ll_openmp = ld_openmp
 ELSE
   ll_openmp = .true.
 ENDIF

 in = SIZE(pin)

 !--- global length of vector (needed for error bound calculation)

 IF (.NOT.PRESENT(kng)) THEN
   ing = in
   IF (inproc>1) THEN
 #ifdef SFX_MPI
     CALL mpl_allreduce (ing,'SUM',cdstring='ORDER_INDEP_GLOBAL_SUM')
 #endif
   ENDIF
 ELSE
   ing = kng
   IF (kng<in) THEN
 #ifdef SFX_MPI
     CALL mpl_message (cdmessage='Specified KNG < SIZE(PIN)', &
                     & cdstring='ORDER_INDEP_GLOBAL_SUM',ldabort=.true.)
 #endif
   ENDIF
 ENDIF

 !--- 2*n*eps, evaluated in real arithmetic. Forming 2*ING as an integer
 !--- first would overflow KIND=JPIM for very long global arrays and
 !--- silently corrupt both the check below and the error bound.

 ztwon_eps = 2.0_jprb*REAL(ing,jprb)*EPSILON(zres)

 IF (ztwon_eps >= 1.0_jprb) THEN
 #ifdef SFX_MPI
   CALL mpl_message (cdmessage='n is too large to guarantee error bounds', &
                   & cdstring='ORDER_INDEP_GLOBAL_SUM',ldabort=.true.)
 #endif
 ENDIF

 !--- ZP always has at least one element so that a task holding no data
 !--- can still contribute a (zero) partial sum.

 ALLOCATE (zp(max(in,1_jpim)))
 ALLOCATE (zbuffg(inproc*SIZE(zbuffl)))
 ALLOCATE (zpsums(inproc))
 ALLOCATE (zperrs(inproc))
 ALLOCATE (zpcors(inproc))
 ALLOCATE (irecvcounts(inproc))

 zolderr = huge(zerr)

 !--- Copy the input array. This avoids some tricky indexing, at the
 !--- expense of some inefficency.

 IF (in>0) THEN
   zp(:) = pin(:)
 ELSE
   zp(1) = 0.0_jprb
 ENDIF

 k_loop: DO

 !--- transform local arrays

   IF (in>0) THEN
     IF (ll_openmp) THEN
       CALL compensated_sum_omp (zp,in,zcorr,zerr)
     ELSE
       CALL compensated_sum (zp,in,zcorr,zerr)
     ENDIF
   ENDIF

 !--- gather partial sums and error bounds to all processors
 !--- (buffer layout per task: partial sum, error, correction)

   zbuffl(1) = zp(max(in,1_jpim))

   IF (in>0) THEN
     zbuffl(2) = zerr
     zbuffl(3) = zcorr
   ELSE
     zbuffl(2) = 0.0_jprb
     zbuffl(3) = 0.0_jprb
   ENDIF

   IF (inproc>1) THEN
 !-- could use MPL_ALLGATHER here, if it existed!

     irecvcounts(:) = SIZE(zbuffl)
 #ifdef SFX_MPI
     CALL mpl_allgatherv (zbuffl,zbuffg,irecvcounts, &
                        & cdstring='ORDER_INDEP_GLOBAL_SUM')
 #endif
     DO j=1,inproc
       zpsums(j) = zbuffg(1+(j-1)*SIZE(zbuffl))
       zperrs(j) = zbuffg(2+(j-1)*SIZE(zbuffl))
       zpcors(j) = zbuffg(3+(j-1)*SIZE(zbuffl))
     ENDDO
   ELSE
     zpsums(1) = zbuffl(1)
     zperrs(1) = zbuffl(2)
     zpcors(1) = zbuffl(3)
   ENDIF

 !--- transform partial sums

   CALL compensated_sum (zpsums,inproc,zcorr,zerr)
   zerr  = zerr  + sum(zperrs)
   zcorr = zcorr + sum(zpcors)

 !--- Calculate final result

   zres = zpsums(inproc) + zcorr

 !--- Calculate error bound. This is corollary 4.7 from Ogita et al. (2005)

   zbeta = (zerr*ztwon_eps)/(1.0_jprb - ztwon_eps)

   zerr = epsilon(zres)*abs(zres) &
      & +(zbeta + ( 2.0_jprb*epsilon(zres)*epsilon(zres)*abs(zres) &
      &            +3.0_jprb*tiny(zres)))

 !--- update the last element of the local array
 !--- (only compiled in when SFX_MPI is defined)

 #ifdef SFX_MPI
   zp(max(in,1_jpim)) = zpsums(mpl_myrank())
 #endif

 !--- exit if the global error is small enough

   IF (zerr<4.0_jprb*spacing(zres)) EXIT k_loop

 !--- Take appropriate action if ZRES cannot be sufficiently refined.
 !--- The test is written as .NOT.(a<b) so that a NaN error bound
 !--- (e.g. NaN in the input) is also treated as a failure; with
 !--- (a>=b) a NaN would make both tests false and loop forever.

   IF (.NOT.(zerr < zolderr)) THEN
     inmsg=inmsg+1

     IF (inmsg<=100) THEN
 #ifdef SFX_MPI
       CALL mpl_message ( &
           & cdmessage= 'ORDER_INDEP_GLOBAL_SUM: FALIED TO REFINE SUM', &
           & cdstring='ORDER_INDEP_GLOBAL_SUM')
       CALL mpl_message ( &
           & cdmessage='WARNING: POSSIBLITY OF NON-REPRODUCIBLE RESULTS',&
           & cdstring='ORDER_INDEP_GLOBAL_SUM')
 #endif
     ENDIF

     IF (inmsg==100) THEN
 #ifdef SFX_MPI
       CALL mpl_message ( &
           & cdmessage='ORDER_INDEP_GLOBAL_SUM: INMSG>100. OUTPUT SUPPRESSED',&
           & cdstring='ORDER_INDEP_GLOBAL_SUM')
 #endif
     ENDIF

     IF (llabort) THEN
 #ifdef SFX_MPI
       CALL mpl_message (cdmessage= &
                       & 'ABORT BECAUSE LD_ABORT_IFNOT_REPROD WAS SET', &
                       & cdstring='ORDER_INDEP_GLOBAL_SUM',ldabort=.true.)
 #endif
     ENDIF

 !--- No abort took place: further passes are not making progress, so
 !--- stop iterating and return the best estimate instead of spinning.
 !--- ZERR and ZOLDERR are derived from gathered data, so all tasks
 !--- leave the loop together.

     EXIT k_loop
   ENDIF

   zolderr = zerr

 ENDDO k_loop

 !--- On normal exit the error bound is below 4*SPACING(ZRES), so only
 !--- a handful of floating point numbers can lie between ZRES and the
 !--- exact sum. Rounding to the nearest number that has its last three
 !--- bits zero then gives a reproducible result.

 order_indep_global_sum = round(zres)

 DEALLOCATE (irecvcounts)
 DEALLOCATE (zpcors)
 DEALLOCATE (zperrs)
 DEALLOCATE (zpsums)
 DEALLOCATE (zbuffg)
 DEALLOCATE (zp)

 IF (lhook) CALL dr_hook ('ORDER_INDEPENDENT_SUMMATION_MOD:ORDER_INDEP_GLOBAL_SUM', &
                       &  1,zhook_handle)

 END FUNCTION order_indep_global_sum

 SUBROUTINE order_indep_global_sum2 (PIN,POUT,KNVEC,KDIM,KNL,LD_ABORT_IFNOT_REPROD,LD_OPENMP)

 !-----------------------------------------------------------------
 !
 !  This is a vector version of ORDER_INDEP_GLOBAL_SUM, which
 !  returns a vector of accurate global sums of the elements of matrix
 !  PIN along dimension KDIM. The sum is intended to be bit-reproducible
 !  for any distribution of PIN over threads and tasks, and independent
 !  of the ordering of the elements of PIN.
 !
 !  NB: PIN is unmodified on return (the work is done on a copy).
 !
 ! Arguments:
 ! ----------
 !
 ! Required:
 !
 !   PIN                  - INTENT(IN)  - The array to be summed.
 !
 !   POUT                 - INTENT(OUT) - The vector with sums
 !
 !   KNVEC                - INTENT(IN)  - Number of vectors to be summed
 !                                        (length of POUT and KNL).
 !
 !   KDIM                 - INTENT(IN)  - The dimension to sum along.
 !                                        1: vector J is PIN(1:KNL(J),J)
 !                                        2: vector J is PIN(J,1:KNL(J))
 !
 !   KNL                  - INTENT(IN)  - Local lengths of vectors.
 !
 !
 ! Optional:
 !
 !   LD_ABORT_IFNOT_REPROD - INTENT(IN) - If .TRUE. (the default), request
 !                                        an abort through MPL_MESSAGE when
 !                                        an error bound cannot be refined.
 !                                        MPL_MESSAGE is only compiled in
 !                                        when SFX_MPI is defined.
 !
 !   LD_OPENMP            - INTENT(IN)  - Use OpenMP parallelization
 !                                        (COMPENSATED_SUM_OMP). Default is
 !                                        .TRUE.
 !
 !
 ! Algorithm:
 ! ----------
 !
 !  The algorithm is based on Ogita et al. (2005) SIAM J. Sci. Computing,
 !  Vol.26, No.6, pp1955-1988. This is based in turn on an algorithm
 !  by Knuth (1969, seminumerical algorithms).
 !
 !  This version adds a second layer of parallelism on top of that
 !  provided by COMPENSATED_SUM_OMP. It iterates the compensated
 !  summation until, for every vector, the computed error bound is
 !  below 4*SPACING(result). It then rounds each result to the nearest
 !  floating-point number whose last three bits are zero, thereby
 !  guaranteeing an order-independent result.
 !
 !  If the error bound of a vector stops decreasing (or is not a
 !  number), a warning is issued, an abort is requested if
 !  LD_ABORT_IFNOT_REPROD is set, and otherwise that vector is no longer
 !  refined and its best available estimate is returned. In that case
 !  reproducibility is NOT guaranteed for that vector.
 !
 !  Author: Tomas Wilhelmsson ECMWF 2010/03/30
 !
 !-----------------------------------------------------------------

 IMPLICIT NONE

 INTEGER(KIND=JPIM),           INTENT(IN)  :: KNVEC
 REAL(KIND=JPRB),              INTENT(IN)  :: PIN(:,:)
 REAL(KIND=JPRB),              INTENT(OUT) :: POUT(knvec)
 INTEGER(KIND=JPIM),           INTENT(IN)  :: KDIM
 INTEGER(KIND=JPIM),           INTENT(IN)  :: KNL(knvec)
 LOGICAL,            OPTIONAL, INTENT(IN)  :: LD_ABORT_IFNOT_REPROD, LD_OPENMP

 INTEGER(KIND=JPIM) :: J,JP,IOFF,IBUFLEN,INPROC,ING(knvec)
 REAL(KIND=JPRB), DIMENSION(KNVEC) :: ZCORR,ZERR,ZOLDERR,ZBETA,ZRES,ZTWON_EPS
 REAL(KIND=JPRB), ALLOCATABLE :: ZPSUMS(:,:),ZPERRS(:,:),ZPCORS(:,:), &
                               & ZBUFFL(:),ZBUFFG(:),ZP(:,:)
 INTEGER(KIND=JPIM), ALLOCATABLE :: IRECVCOUNTS(:)
 LOGICAL :: LLABORT, LL_OPENMP, LLDONE(knvec)
 REAL(KIND=JPRB) :: ZHOOK_HANDLE

 !--- Number of "failed to refine" events so far; kept across calls so
 !--- that the warning output can be capped.

 INTEGER(KIND=JPIM), SAVE :: INMSG=0

 IF (lhook) CALL dr_hook ('ORDER_INDEPENDENT_SUMMATION_MOD:ORDER_INDEP_GLOBAL_SUM2', &
                       &  0,zhook_handle)

 IF (kdim<1 .OR. kdim>2) THEN
 #ifdef SFX_MPI
   CALL mpl_message (cdmessage='Invalid KDIM value', &
                   & cdstring='ORDER_INDEP_GLOBAL_SUM2',ldabort=.true.)
 #endif
 ENDIF
 #ifdef SFX_MPI
 inproc = mpl_nproc()
 #else
 inproc = 1
 #endif
 IF (PRESENT(ld_abort_ifnot_reprod)) THEN
   llabort = ld_abort_ifnot_reprod
 ELSE
   llabort = .true.
 ENDIF

 IF (PRESENT(ld_openmp)) THEN
   ll_openmp = ld_openmp
 ELSE
   ll_openmp = .true.
 ENDIF

 !--- global lengths of vectors (needed for error bound calculation)

 ing(:) = knl(:)
 IF (inproc>1) THEN
 #ifdef SFX_MPI
   CALL mpl_allreduce (ing,'SUM',cdstring='ORDER_INDEP_GLOBAL_SUM2')
 #endif
 ENDIF

 !--- 2*n*eps per vector, evaluated in real arithmetic. Forming 2*ING as
 !--- an integer first would overflow KIND=JPIM for very long global
 !--- vectors and silently corrupt both the check below and the bound.

 ztwon_eps(:) = 2.0_jprb*REAL(ing(:),jprb)*EPSILON(zres)

 IF (any(ztwon_eps(:) >= 1.0_jprb)) THEN
 #ifdef SFX_MPI
   CALL mpl_message (cdmessage='n is too large to guarantee error bounds', &
                   & cdstring='ORDER_INDEP_GLOBAL_SUM2',ldabort=.true.)
 #endif
 ENDIF

 !--- Per vector the exchange buffer holds: partial sum, error, correction

 ibuflen=3

 !--- ZP always has at least one row so that a task holding no data for
 !--- a vector can still contribute a (zero) partial sum.

 ALLOCATE (zp(max(maxval(knl),1_jpim),knvec))
 ALLOCATE (zbuffl(ibuflen*knvec))
 ALLOCATE (zbuffg(inproc*ibuflen*knvec))
 ALLOCATE (zpsums(inproc,knvec))
 ALLOCATE (zperrs(inproc,knvec))
 ALLOCATE (zpcors(inproc,knvec))
 ALLOCATE (irecvcounts(inproc))

 zolderr(:) = huge(zerr)
 lldone(:)  = .false.

 !--- Copy the input array. This avoids some tricky indexing, at the
 !--- expense of some inefficency.


 DO j=1,knvec
   IF (knl(j)>0) THEN
     IF (kdim==1) zp(1:knl(j),j) = pin(1:knl(j),j)
     IF (kdim==2) zp(1:knl(j),j) = pin(j,1:knl(j))
   ELSE
     zp(1,j) = 0.0_jprb
   ENDIF
 ENDDO

 k_loop: DO

 !--- transform local arrays (only for vectors still being refined)

   DO j=1,knvec
     IF (knl(j)>0 .AND. .NOT. lldone(j)) THEN
       IF (ll_openmp) THEN
         CALL compensated_sum_omp (zp(:,j),knl(j),zcorr(j),zerr(j))
       ELSE
         CALL compensated_sum     (zp(:,j),knl(j),zcorr(j),zerr(j))
       ENDIF
     ENDIF

 !--- gather partial sums and error bounds to all processors

     ioff = (j-1)*ibuflen

     zbuffl(ioff+1) = zp(max(knl(j),1_jpim),j)

     IF (knl(j)>0) THEN
       zbuffl(ioff+2) = zerr(j)
       zbuffl(ioff+3) = zcorr(j)
     ELSE
       zbuffl(ioff+2) = 0.0_jprb
       zbuffl(ioff+3) = 0.0_jprb
     ENDIF
   ENDDO

   IF (inproc>1) THEN
 !-- could use MPL_ALLGATHER here, if it existed!

     irecvcounts(:) = SIZE(zbuffl)
 #ifdef SFX_MPI
     CALL mpl_allgatherv (zbuffl,zbuffg,irecvcounts, &
                        & cdstring='ORDER_INDEP_GLOBAL_SUM2')
 #endif
     DO jp=1,inproc
       DO j = 1,knvec
         ioff = (j-1)*ibuflen + (jp-1)*SIZE(zbuffl)
         zpsums(jp,j) = zbuffg(ioff+1)
         zperrs(jp,j) = zbuffg(ioff+2)
         zpcors(jp,j) = zbuffg(ioff+3)
       ENDDO
     ENDDO
   ELSE

 !--- Single task: unpack straight from the local buffer. The offset
 !--- must advance with J; a fixed offset would give every vector the
 !--- values belonging to vector 1.

     DO j = 1,knvec
       ioff = (j-1)*ibuflen
       zpsums(1,j) = zbuffl(ioff+1)
       zperrs(1,j) = zbuffl(ioff+2)
       zpcors(1,j) = zbuffl(ioff+3)
     ENDDO
   ENDIF

 !--- transform partial sums

   DO j = 1,knvec
     IF (lldone(j)) cycle

     CALL compensated_sum (zpsums(:,j),inproc,zcorr(j),zerr(j))
     zerr(j)  = zerr(j)  + sum(zperrs(:,j))
     zcorr(j) = zcorr(j) + sum(zpcors(:,j))

 !--- Calculate final result

     zres(j) = zpsums(inproc,j) + zcorr(j)

 !--- Calculate error bound. This is corollary 4.7 from Ogita et al. (2005)

     zbeta(j) = (zerr(j)*ztwon_eps(j))/(1.0_jprb - ztwon_eps(j))

     zerr(j) = epsilon(zres(j))*abs(zres(j)) &
      & +(zbeta(j) + ( 2.0_jprb*epsilon(zres(j))*epsilon(zres(j))*abs(zres(j)) &
      &            +3.0_jprb*tiny(zres(j))))

 !--- update the last element of the local array
 !--- (only compiled in when SFX_MPI is defined)
 #ifdef SFX_MPI
     zp(max(knl(j),1_jpim),j) = zpsums(mpl_myrank(),j)
 #endif
   ENDDO

 !--- exit if the global error is small enough

   lldone(:) = (zerr(:)<4.0_jprb*spacing(zres(:))) .OR. lldone(:)

   IF (all(lldone(:))) EXIT k_loop

 !--- Take appropriate action if ZRES cannot be sufficiently refined.
 !--- The test is written as .NOT.(a<b) so that a NaN error bound
 !--- (e.g. NaN in the input) is also treated as a failure; with
 !--- (a>=b) a NaN would never be flagged and the loop would not end.

   DO j = 1,knvec
     IF (lldone(j)) CYCLE

     IF (.NOT.(zerr(j) < zolderr(j))) THEN
       inmsg=inmsg+1

       IF (inmsg<=100) THEN
 #ifdef SFX_MPI
         CALL mpl_message ( &
           & cdmessage= 'ORDER_INDEP_GLOBAL_SUM2: FALIED TO REFINE SUM', &
           & cdstring='ORDER_INDEP_GLOBAL_SUM2')
         CALL mpl_message ( &
           & cdmessage='WARNING: POSSIBLITY OF NON-REPRODUCIBLE RESULTS',&
           & cdstring='ORDER_INDEP_GLOBAL_SUM2')
 #endif
       ENDIF

       IF (inmsg==100) THEN
 #ifdef SFX_MPI
         CALL mpl_message ( &
           & cdmessage='ORDER_INDEP_GLOBAL_SUM2: INMSG>100. OUTPUT SUPPRESSED',&
           & cdstring='ORDER_INDEP_GLOBAL_SUM2')
 #endif
       ENDIF

       IF (llabort) THEN
 #ifdef SFX_MPI
         CALL mpl_message (cdmessage= &
           & 'ABORT BECAUSE LD_ABORT_IFNOT_REPROD WAS SET', &
           & cdstring='ORDER_INDEP_GLOBAL_SUM2',ldabort=.true.)
 #endif
       ENDIF

 !--- No abort took place: stop refining this vector and keep its best
 !--- estimate. ZERR and ZOLDERR are derived from gathered data, so all
 !--- tasks take the same decision.

       lldone(j) = .true.
     ENDIF

     zolderr(j) = zerr(j)
   ENDDO

 !--- Avoid a further (useless) communication round if giving up on the
 !--- stalled vectors left nothing to refine.

   IF (all(lldone(:))) EXIT k_loop

 ENDDO k_loop

 !--- For every vector that converged, the error bound is below
 !--- 4*SPACING(ZRES), so only a handful of floating point numbers can
 !--- lie between ZRES and the exact sum. Rounding to the nearest number
 !--- that has its last three bits zero then gives a reproducible result.

 DO j=1,knvec
   pout(j) = round(zres(j))
 ENDDO

 DEALLOCATE (irecvcounts)
 DEALLOCATE (zpcors)
 DEALLOCATE (zperrs)
 DEALLOCATE (zpsums)
 DEALLOCATE (zbuffg)
 DEALLOCATE (zbuffl)
 DEALLOCATE (zp)

 IF (lhook) CALL dr_hook ('ORDER_INDEPENDENT_SUMMATION_MOD:ORDER_INDEP_GLOBAL_SUM2', &
                       &  1,zhook_handle)

 END SUBROUTINE order_indep_global_sum2

 SUBROUTINE order_indep_allreduce (PIN,POUT,LD_ABORT_IFNOT_REPROD,LD_OPENMP)

 !-----------------------------------------------------------------
 !
 !  Element-wise sum of PIN across tasks, returned in POUT on every
 !  task. This is the counterpart of an MPL_allreduce 'SUM' on an
 !  array: element K of POUT is the sum over tasks of element K of
 !  PIN. It differs from ORDER_INDEP_GLOBAL_SUM, which treats PIN as
 !  one piece of a single distributed array.
 !
 !  With a single task POUT is simply a copy of PIN.
 !
 ! Arguments:
 ! ----------
 !
 ! Required:
 !
 !   PIN                  - INTENT(IN) - input array to be summed.
 !   POUT                 - INTENT(OUT) - output array of same size
 !
 !
 ! Optional:
 !
 !   LD_ABORT_IFNOT_REPROD - INTENT(IN) - Passed on to
 !                                        ORDER_INDEP_LOCAL_SUM.
 !                                        Default is .TRUE.
 !
 !   LD_OPENMP            - INTENT(IN) - Passed on to
 !                                       ORDER_INDEP_LOCAL_SUM.
 !                                       Default is .TRUE.
 !
 ! Method:
 ! -------
 !
 !  1. The elements are shared out between tasks in contiguous blocks.
 !  2. Every task sends each block to its owner and receives the
 !     matching block from every task.
 !  3. The owner sums each of its elements over tasks with
 !     ORDER_INDEP_LOCAL_SUM.
 !  4. The summed blocks are gathered onto all tasks.
 !
 !  Author: George Mozdzynski ECMWF June 2009
 !
 !-----------------------------------------------------------------

 IMPLICIT NONE

 REAL(KIND=JPRB),             INTENT(IN) :: PIN(:)
 REAL(KIND=JPRB),             INTENT(OUT):: POUT(:)
 LOGICAL,           OPTIONAL, INTENT(IN) :: LD_ABORT_IFNOT_REPROD, LD_OPENMP

 INTEGER(KIND=JPIM) :: INPROC,MYPROC,ILEN,ITAG,JP,JE,INREQ,IMINE,IBASE,IEXTRA
 INTEGER(KIND=JPIM), ALLOCATABLE :: ICOUNT(:),ISTART(:),IREQ(:)
 REAL(KIND=JPRB), ALLOCATABLE :: ZBUFF(:),ZCONTRIB(:),ZSUMS(:)
 REAL(KIND=JPRB) :: ZDUM(2)

 LOGICAL :: LLABORT, LL_OPENMP
 REAL(KIND=JPRB) :: ZHOOK_HANDLE

 IF (lhook) CALL dr_hook ('ORDER_INDEPENDENT_SUMMATION_MOD:ORDER_INDEP_ALLREDUCE', &
                       &  0,zhook_handle)

 !--- Defaults for the optional switches

 llabort = .true.
 IF (PRESENT(ld_abort_ifnot_reprod)) llabort = ld_abort_ifnot_reprod

 ll_openmp = .true.
 IF (PRESENT(ld_openmp)) ll_openmp = ld_openmp

 #ifdef SFX_MPI
 inproc = mpl_nproc()
 #else
 inproc = 1
 #endif

 IF( SIZE(pin) /= SIZE(pout) )THEN
 #ifdef SFX_MPI
   CALL mpl_message (cdmessage='SIZE(PIN) /= SIZE(POUT)', &
                   & cdstring='ORDER_INDEP_ALLREDUCE',ldabort=.true.)
 #endif
 ENDIF

 ilen = SIZE(pin)

 single_task: IF (inproc==1) THEN

   pout(:)=pin(:)

 ELSE single_task

   itag=1234

 !--- Share the elements out over tasks: every task owns ILEN/INPROC
 !--- elements and the first MOD(ILEN,INPROC) tasks own one more.

   ALLOCATE(icount(inproc))
   ibase  = ilen/inproc
   iextra = MOD(ilen,inproc)
   DO jp=1,inproc
     icount(jp) = ibase
     IF (jp<=iextra) icount(jp) = ibase+1
   ENDDO

 !--- First element of each task's block

   ALLOCATE(istart(inproc))
   istart(1)=1
   DO jp=2,inproc
     istart(jp)=istart(jp-1)+icount(jp-1)
   ENDDO

 #ifdef SFX_MPI
   myproc = mpl_myrank()
 #else
   myproc = 0
 #endif
   imine = icount(myproc)

   ALLOCATE(zbuff(imine*inproc))
   ALLOCATE(ireq(2*inproc))
   ALLOCATE(zcontrib(inproc))

 !--- Post the receives for my block from every task (task JP's copy
 !--- lands in the JP-th slice of ZBUFF) ...

   inreq=0
   IF(imine /= 0)THEN
     DO jp=1,inproc
       inreq=inreq+1
 #ifdef SFX_MPI
       CALL mpl_recv (zbuff((jp-1)*imine+1:jp*imine),&
                     &ksource=jp,&
                     &ktag=itag,&
                     &kmp_type=jp_non_blocking_standard,&
                     &krequest=ireq(inreq),&
                     &cdstring='ORDER_INDEP_ALLREDUCE')
 #endif
     ENDDO
   ENDIF

 !--- ... then send every non-empty block to its owner ...

   DO jp=1,inproc
     IF(icount(jp) /= 0)THEN
       inreq=inreq+1
 #ifdef SFX_MPI
       CALL mpl_send(pin(istart(jp):istart(jp)+icount(jp)-1),&
                    &kdest=jp,&
                    &ktag=itag,&
                    &kmp_type=jp_non_blocking_standard,&
                    &krequest=ireq(inreq),&
                    &cdstring='ORDER_INDEP_ALLREDUCE')
 #endif
     ENDIF
   ENDDO

 !--- ... and wait for all outstanding requests.

   IF(inreq > 0)THEN
 #ifdef SFX_MPI
     CALL mpl_wait(zdum,krequest=ireq(1:inreq),&
                  &cdstring='ORDER_INDEP_ALLREDUCE')
 #endif
   ENDIF

 !--- Order independent sum over tasks for each element I own. The
 !--- contributions to element JE sit IMINE apart in ZBUFF.

   ALLOCATE(zsums(ilen))
   DO je=1,imine
     zcontrib(:)=zbuff(je:je+(inproc-1)*imine:imine)
     zsums(je)=order_indep_local_sum(zcontrib,llabort,ll_openmp)
   ENDDO

 !--- Gather the summed blocks onto all tasks
 #ifdef SFX_MPI
   CALL mpl_allgatherv (zsums(1:imine),pout,icount, &
                      & cdstring='ORDER_INDEP_ALLREDUCE')
 #endif

   DEALLOCATE (zsums)
   DEALLOCATE (zcontrib)
   DEALLOCATE (zbuff)
   DEALLOCATE (ireq)
   DEALLOCATE (istart)
   DEALLOCATE (icount)

 ENDIF single_task

 IF (lhook) CALL dr_hook ('ORDER_INDEPENDENT_SUMMATION_MOD:ORDER_INDEP_ALLREDUCE', &
                       &  1,zhook_handle)

 END SUBROUTINE order_indep_allreduce

 FUNCTION order_indep_dot_product (P1,P2,PW,KNG,LD_ABORT_IFNOT_REPROD, &
                                 & LD_OPENMP)

 !-----------------------------------------------------------------
 !
 !  Returns an accurate global sum of the elements of P1*P2, or
 !  P1*P2*PW. The result is identical to the result that would be
 !  obtained by the following:
 !
 !  IF (PRESENT(PW)) THEN
 !    PTEMP(:) = P1(:)*P2(:)*PW(:)
 !  ELSE
 !    PTEMP(:) = P1(:)*P2(:)
 !  ENDIF
 !  CALL ORDER_INDEP_GLOBAL_SUM (PTEMP,KNG,LD_ABORT_IFNOT_REPROD, &
 !                              & LD_OPENMP)
 !
 !   This routine is provided only because the above is not very
 !   cache-friendly.
 !
 !  NB: P1, P2 and PW are unmodified on return
 !
 ! Arguments:
 ! ----------
 !
 ! Required:
 !
 !   P1,P2                - INTENT(IN) - Arrays whose inner product is
 !                                       to be calculated
 !
 !
 ! Optional:
 !
 !   PW                   - INTENT(IN) - Weight array defining the
 !                                       metric for the inner product.
 !
 !   KNG                  - INTENT(IN) - Global length of array.
 !
 !   LD_ABORT_IFNOT_REPRO - INTENT(IN) - Defines behaviour in case
 !                                       a reproducible result cannot
 !                                       be guaranteed.
 !
 !   LD_OPENMP            - INTENT(IN) - Use OpenMP parallelization.
 !
 !
 ! Algorithm:
 ! ----------
 !
 !  The algorithm is based on Ogita et al. (2005) SIAM J. Sci. Computing,
 !  Vol.26, No.6, pp1955-1988. This is based in turn on an algorithm
 !  by Knuth (1969, seminumerical algorithms).
 !
 !  This version adds a second layer of parallelism on top of that
 !  provided by COMPENSATED_SUM_OMP. It iterates the compensated
 !  summation until the result is guaranteed to be within 4*eps
 !  of the true sum. It then rounds the result to the nearest
 !  floating-point number whose last three bits are zero, thereby
 !  guaranteeing an order-independent result.
 !
 !  Author: Mike Fisher ECMWF 2006/02/08
 !
 !-----------------------------------------------------------------

 IMPLICIT NONE

 REAL(KIND=JPRB) :: ORDER_INDEP_DOT_PRODUCT

 REAL(KIND=JPRB),             INTENT(IN) :: P1(:), P2(:)
 REAL(KIND=JPRB),   OPTIONAL, INTENT(IN) :: PW(:)
 INTEGER(KIND=JPIM),OPTIONAL, INTENT(IN) :: KNG
 LOGICAL,           OPTIONAL, INTENT(IN) :: LD_ABORT_IFNOT_REPROD, LD_OPENMP

 INTEGER(KIND=JPIM) :: J,IN,ING,INPROC
 REAL(KIND=JPRB) :: ZCORR,ZERR,ZOLDERR,ZBUFFL(3),ZBETA,ZRES
 REAL(KIND=JPRB), ALLOCATABLE :: ZPSUMS(:),ZPERRS(:),ZPCORS(:), &
                               & ZBUFFG(:),ZP(:)
 INTEGER(KIND=JPIM), ALLOCATABLE :: IRECVCOUNTS(:)
 LOGICAL :: LLABORT, LL_OPENMP, LL_FIRST_ITER
 REAL(KIND=JPRB) :: ZHOOK_HANDLE

 INTEGER(KIND=JPIM), SAVE :: INMSG=0

 INTEGER(KIND=JPIM), EXTERNAL :: N_PRECISION

 IF (lhook) CALL dr_hook ( &
               &'ORDER_INDEPENDENT_SUMMATION_MOD:ORDER_INDEP_DOT_PRODUCT', &
               &  0,zhook_handle)
 #ifdef SFX_MPI
 inproc = mpl_nproc()
 #else
 inproc = 1
 #endif
 IF (PRESENT(ld_abort_ifnot_reprod)) THEN
   llabort = ld_abort_ifnot_reprod
 ELSE
   llabort = .true.
 ENDIF

 IF (PRESENT(ld_openmp)) THEN
   ll_openmp = ld_openmp
 ELSE
   ll_openmp = .true.
 ENDIF

 in = SIZE(p1)

 IF (SIZE(p2)/=in) THEN
 #ifdef SFX_MPI
   CALL mpl_message (cdmessage='SIZE(P2)/=SIZE(P1)', &
                   & cdstring='ORDER_INDEP_DOT_PRODUCT',ldabort=.true.)
 #endif
 ENDIF

 IF (PRESENT(pw)) THEN
   IF (SIZE(pw)/=in) THEN
 #ifdef SFX_MPI
     CALL mpl_message (cdmessage='SIZE(PW)/=SIZE(P1)', &
                     & cdstring='ORDER_INDEP_DOT_PRODUCT',ldabort=.true.)
 #endif
   ENDIF
 ENDIF

 !--- global length of vector (needed for error bound calculation)

 IF (.NOT.PRESENT(kng)) THEN
   ing = in
   IF (inproc>1) THEN
 #ifdef SFX_MPI
     CALL mpl_allreduce (ing,'SUM',cdstring='ORDER_INDEP_DOT_PRODUCT')
 #endif
   ENDIF
 ELSE
   ing = kng
   IF (kng<in) THEN
 #ifdef SFX_MPI
     CALL mpl_message (cdmessage='Specified KNG < SIZE(PIN)', &
                     & cdstring='ORDER_INDEP_DOT_PRODUCT',ldabort=.true.)
 #endif
   ENDIF
 ENDIF

 IF (REAL(2*ing,jprb)*EPSILON(zres) >= 1.0) then
 #ifdef SFX_MPI
   CALL mpl_message (cdmessage='n is too large to guarantee error bounds', &
                   & cdstring='ORDER_INDEP_DOT_PRODUCT',ldabort=.true.)
 #endif
 ENDIF

 ALLOCATE (zp(max(in,1_jpim)))
 ALLOCATE (zbuffg(inproc*SIZE(zbuffl)))
 ALLOCATE (zpsums(inproc))
 ALLOCATE (zperrs(inproc))
 ALLOCATE (zpcors(inproc))
 ALLOCATE (irecvcounts(inproc))

 zolderr = huge(zerr)

 !--- Copy the input array. This avoids some tricky indexing, at the
 !--- expense of some inefficency.

 IF (in==0) THEN
   zp(1) = 0.0_jprb
 ENDIF

 ll_first_iter = .true.

 k_loop: DO

 !--- transform local arrays

   IF (in>0) THEN
     IF (ll_first_iter) THEN
       IF (PRESENT(pw)) THEN
         IF (ll_openmp) THEN
           CALL compensated_dot_product_omp (p1,p2,pw,zp,in,zcorr,zerr)
         ELSE
           CALL compensated_dot_product     (p1,p2,pw,zp,in,zcorr,zerr)
         ENDIF
       ELSE
         IF (ll_openmp) THEN
           CALL compensated_dot_product_omp (p1=p1,p2=p2,pout=zp, &
                                           & kn=in,pcorr=zcorr,perr=zerr)
         ELSE
           CALL compensated_dot_product     (p1=p1,p2=p2,pout=zp, &
                                           & kn=in,pcorr=zcorr,perr=zerr)
         ENDIF
       ENDIF
     ELSE
       IF (ll_openmp) THEN
         CALL compensated_sum_omp (zp,in,zcorr,zerr)
       ELSE
         CALL compensated_sum     (zp,in,zcorr,zerr)
       ENDIF
     ENDIF
   ENDIF

 !--- gather partial sums and error bounds to all processors

   zbuffl(1) = zp(max(in,1_jpim))

   IF (in>0) THEN
     zbuffl(2) = zerr
     zbuffl(3) = zcorr
   ELSE
     zbuffl(2) = 0.0_jprb
     zbuffl(3) = 0.0_jprb
   ENDIF

   IF (inproc>1) THEN
 !-- could use MPL_ALLGATHER here, if it existed!

     irecvcounts(:) = SIZE(zbuffl)
 #ifdef SFX_MPI
     CALL mpl_allgatherv (zbuffl,zbuffg,irecvcounts, &
                        & cdstring='ORDER_INDEP_DOT_PRODUCT')
 #endif
     DO j=1,inproc
       zpsums(j) = zbuffg(1+(j-1)*SIZE(zbuffl))
       zperrs(j) = zbuffg(2+(j-1)*SIZE(zbuffl))
       zpcors(j) = zbuffg(3+(j-1)*SIZE(zbuffl))
     ENDDO
   ELSE
     zpsums(1) = zbuffl(1)
     zperrs(1) = zbuffl(2)
     zpcors(1) = zbuffl(3)
   ENDIF

 !--- transform partial sums

   CALL compensated_sum (zpsums,inproc,zcorr,zerr)
   zerr  = zerr  + sum(zperrs)
   zcorr = zcorr + sum(zpcors)

 !--- Calculate final result

   zres = zpsums(inproc) + zcorr

 !--- Calculate error bound. This is corollary 4.7 from Ogita et al. (2005)

   zbeta = zerr*(REAL(2*ing,jprb)*EPSILON(zres)) &
        & /(1.0_JPRB - REAL(2*ING,JPRB)*EPSILON(ZRES))

   zerr = epsilon(zres)*abs(zres) &
      & +(zbeta + ( 2.0_jprb*epsilon(zres)*epsilon(zres)*abs(zres) &
      &            +3.0_jprb*tiny(zres)))

 !--- update the last element of the local array
 #ifdef SFX_MPI
   zp(max(in,1_jpim)) = zpsums(mpl_myrank())
 #endif

 !--- exit if the global error is small enough

   IF (zerr<4.0_jprb*spacing(zres)) EXIT k_loop

 !--- Take appropriate action if ZRES cannot be sufficiently refined.

   IF (zerr >= zolderr) THEN
     inmsg=inmsg+1

     IF (inmsg<=100) THEN
 #ifdef SFX_MPI
       CALL mpl_message ( &
           & cdmessage= 'ORDER_INDEP_DOT_PRODUCT: FALIED TO REFINE SUM', &
           & cdstring='ORDER_INDEP_DOT_PRODUCT')
       CALL mpl_message ( &
           & cdmessage='WARNING: POSSIBLITY OF NON-REPRODUCIBLE RESULTS',&
           & cdstring='ORDER_INDEP_DOT_PRODUCT')
 #endif
     ENDIF

     IF (inmsg==100) THEN
 #ifdef SFX_MPI
       CALL mpl_message ( &
        & cdmessage='ORDER_INDEP_DOT_PRODUCT: INMSG>100. OUTPUT SUPPRESSED',&
        & cdstring='ORDER_INDEP_DOT_PRODUCT')
 #endif
     ENDIF

     IF (llabort) THEN
 #ifdef SFX_MPI
       CALL mpl_message (cdmessage= &
                       & 'ABORT BECAUSE LD_ABORT_IFNOT_REPROD WAS SET', &
                       & cdstring='ORDER_INDEP_DOT_PRODUCT',ldabort=.true.)
 #endif
     ENDIF
   ENDIF

   zolderr = zerr

   ll_first_iter = .false.
 ENDDO k_loop

 !--- At this stage, we have guaranteed that ZRES is less than 4*EPS
 !--- away from the exact sum. There are only four floating point
 !--- numbers in this range. So, if we find the nearest number that
 !--- has its last three bits zero, then we have a reproducible result.

 order_indep_dot_product = round(zres)

 DEALLOCATE (irecvcounts)
 DEALLOCATE (zpcors)
 DEALLOCATE (zperrs)
 DEALLOCATE (zpsums)
 DEALLOCATE (zbuffg)
 DEALLOCATE (zp)

 IF (lhook) CALL dr_hook ( &
          &'ORDER_INDEPENDENT_SUMMATION_MOD:ORDER_INDEP_DOT_PRODUCT', &
          &  1,zhook_handle)

 END FUNCTION order_indep_dot_product

 FUNCTION round (PRES)

 !-----------------------------------------------------------------
 !
 !  Snaps PRES to a nearby floating-point number whose bit pattern
 !  has its three lowest-order bits equal to zero.
 !
 !  Input  : PRES  - value to be rounded (KIND=JPRB real).
 !  Result : ROUND - PRES itself when its three low bits are already
 !                   zero; otherwise the first neighbour, reached by
 !                   stepping one representable number at a time
 !                   (upwards is tried before downwards at each
 !                   distance, up to four steps), whose three low
 !                   bits are zero.
 !
 !  Fortran does not provide direct access to the binary
 !  representation of REALs, so the bits are examined by TRANSFERring
 !  the value into an integer work array. The word that carries the
 !  low-order bits is located at run time from the representation of
 !  -1.0, so big-endian and little-endian layouts are both handled.
 !
 !  Error reports go through MPL_MESSAGE and are only compiled in
 !  when SFX_MPI is defined.
 !
 !  Author: Mike Fisher ECMWF 2006/02/08
 !
 !-----------------------------------------------------------------

 IMPLICIT NONE

 REAL(KIND=JPRB), INTENT(IN) :: PRES
 REAL(KIND=JPRB) :: ROUND

 !--- Mask selecting the three lowest-order bits of an integer word
 INTEGER(KIND=JPIM), PARAMETER :: IMASK3 = 7_JPIM

 INTEGER(KIND=JPIM) :: IPROBE(2),IEQUIV(8),INTS_PER_REAL,JSTEP,ILOW
 REAL(KIND=JPRB)    :: ZPROBE(2),ZABOVE,ZBELOW

 INTEGER(KIND=JPIM), EXTERNAL :: N_PRECISION

 !--- Number of integer words needed to hold one real.
 !--- NOTE(review): N_PRECISION is external; presumably it returns a
 !--- storage size for its argument - confirm against its definition.

 iprobe(:) = 1
 zprobe(:) = 1.0_jprb
 ints_per_real = n_precision(zprobe)/n_precision(iprobe)

 IF (ints_per_real>SIZE(iequiv)) THEN
 #ifdef SFX_MPI
   CALL mpl_message (cdmessage='INTS_PER_REAL>SIZE(IEQUIV)', &
                     & cdstring='ORDER_INDEP_GLOBAL_SUM',ldabort=.true.)
 #endif
 ENDIF

 !--- Locate the low-order word: if the first word of -1.0 is zero the
 !--- low-order bits come first (little-endian), otherwise they are in
 !--- the last word (big-endian)

 iequiv(1:ints_per_real) = TRANSFER(-1.0_jprb,iequiv(1:ints_per_real))
 ilow = MERGE(1_jpim,ints_per_real,iequiv(1)==0)

 !--- Bit pattern of the input value

 iequiv(1:ints_per_real) = TRANSFER(pres,iequiv(1:ints_per_real))

 !--- Search outwards from PRES only when its low bits are not yet zero

 IF (IAND(iequiv(ilow),imask3)/=0) THEN
   zabove = pres
   zbelow = pres

   search: DO jstep=1,4

 !--- one more representable number above PRES
     zabove = NEAREST(zabove,1.0_jprb)
     iequiv(1:ints_per_real) = TRANSFER(zabove,iequiv(1:ints_per_real))
     IF (IAND(iequiv(ilow),imask3)==0) EXIT search

 !--- one more representable number below PRES
     zbelow = NEAREST(zbelow,-1.0_jprb)
     iequiv(1:ints_per_real) = TRANSFER(zbelow,iequiv(1:ints_per_real))
     IF (IAND(iequiv(ilow),imask3)==0) EXIT search

   ENDDO search

 !--- Four steps either way should always reach a multiple of eight in
 !--- the low bits; report the failure if that did not happen

   IF (IAND(iequiv(ilow),imask3)/=0) THEN
 #ifdef SFX_MPI
     CALL mpl_message (cdmessage='THIS IS NOT POSSIBLE', &
                     & cdstring='ORDER_INDEP_GLOBAL_SUM',ldabort=.true.)
 #endif
   ENDIF
 ENDIF

 !--- Convert the selected bit pattern back to a real

 round = TRANSFER(iequiv(1:ints_per_real),pres)

 END FUNCTION round

 END MODULE order_independent_summation_mod
compensated_summation_mod::compensated_sum
Definition: compensated_summation_mod.F90:111

order_independent_summation_mod::round
real(kind=jprb) function round(PRES)
Definition: order_independent_summation_mod.F90:1289

parkind1::jpim
integer, parameter jpim
Definition: parkind1.F90:13

yomhook::dr_hook
Definition: yomhook.F90:20

order_independent_summation_mod::order_indep_local_sum
Definition: order_independent_summation_mod.F90:73

order_independent_summation_mod::order_indep_allreduce
Definition: order_independent_summation_mod.F90:85

compensated_summation_mod
Definition: compensated_summation_mod.F90:1

compensated_summation_mod::compensated_dot_product
Definition: compensated_summation_mod.F90:119

order_independent_summation_mod::order_indep_global_sum2
Definition: order_independent_summation_mod.F90:81

parkind1::jprb
integer, parameter jprb
Definition: parkind1.F90:32

compensated_summation_mod::compensated_sum_omp
Definition: compensated_summation_mod.F90:115

sum
intent(out) overrides sub arrays one Sort by the least significant key first sum(iindex(1:n))

yomhook::lhook
logical lhook
Definition: yomhook.F90:15

order_independent_summation_mod::order_indep_dot_product
Definition: order_independent_summation_mod.F90:89

parkind1
Definition: parkind1.F90:1

compensated_summation_mod::compensated_dot_product_omp
Definition: compensated_summation_mod.F90:123

order_independent_summation_mod
Definition: order_independent_summation_mod.F90:1

mpl_module
Definition: mpl_module.F90:1

yomhook
Definition: yomhook.F90:1

order_independent_summation_mod::order_indep_global_sum
Definition: order_independent_summation_mod.F90:77