SURFEX v8.1
General documentation of Surfex
order_independent_summation_mod.F90
Go to the documentation of this file.
2 
3 !**** ORDER_INDEPENDENT_SUMMATION_MOD
4 
5 ! Purpose.
6 ! --------
7 ! Functions to perform global (over all processors) and
8 ! local (per-processor) order-independent accurate summation
9 ! and order-independent inner products.
10 
11 !** Interface.
12 ! ----------
13 
14 ! result = ORDER_INDEP_GLOBAL_SUM (P1)
15 ! result = ORDER_INDEP_LOCAL_SUM (P1)
16 ! result = ORDER_INDEP_DOT_PRODUCT (P1,P2,PW)
17 
18 ! Input required arguments :
19 ! -------------------------
20 ! P1 - A 1d array of KIND=JPRB reals
21 ! P2 - A 1d array of KIND=JPRB reals. Same length as P1.
22 
23 ! Input optional arguments :
24 ! -------------------------
25 ! PW - A 1d array of KIND=JPRB reals. Same length as P1.
26 ! If specified. ORDER_INDEP_DOT_PRODUCT returns
27 ! a weighted inner product. PW defines the weights.
28 ! If not specified, ORDER_INDEP_DOT_PRODUCT returns
29 ! an unweighted dot product of P and P2.
30 !
31 ! KNG (global sum only) - Global length of input array.
32 !
33 ! LD_ABORT_IFNOT_REPROD - Abort if results are not guaranteed
34 ! to be bit-reproducible. (Default=T)
35 !
36 ! LD_OPENMP - .TRUE. => use openMP (Default is
37 ! .TRUE. for global sum, .FALSE. for
38 ! local sum.)
39 
40 ! Output required arguments :
41 ! -------------------------
42 ! none
43 
44 ! Author.
45 ! -------
46 ! Mike Fisher ECMWF
47 
48 ! Modifications.
49 ! --------------
50 ! Original: 2006-20-22
51 
52 ! ------------------------------------------------------------------
53 
54 USE parkind1 ,ONLY : jpim ,jprb
58 #ifdef SFX_MPI
59 USE mpl_module, ONLY : mpl_allreduce, mpl_allgatherv, mpl_myrank, mpl_nproc, &
60  & mpl_message, mpl_send, mpl_recv, mpl_wait, &
61  & jp_non_blocking_standard
62 #endif
63 USE yomhook , ONLY : lhook, dr_hook
64 
65 SAVE
66 PRIVATE
67 PUBLIC order_indep_local_sum, &
72 
74  MODULE PROCEDURE order_indep_local_sum
75 END INTERFACE
76 
78  MODULE PROCEDURE order_indep_global_sum
79 END INTERFACE
80 
82  MODULE PROCEDURE order_indep_global_sum2
83 END INTERFACE
84 
86  MODULE PROCEDURE order_indep_allreduce
87 END INTERFACE
88 
90  MODULE PROCEDURE order_indep_dot_product
91 END INTERFACE
92 
93 CONTAINS
94 
95 FUNCTION order_indep_local_sum (PIN,LD_ABORT_IFNOT_REPROD,LD_OPENMP)
96 
97 !-----------------------------------------------------------------
98 ! Returns an accurate local (i.e. on a single processor) sum of
99 ! the elements of PIN. The sum is bit-reproducible for any
100 ! ordering of the elements of PIN.
101 !
102 ! NB: PIN is unmodified on return
103 !
104 ! Algorithm:
105 ! ----------
106 !
107 ! The algorithm is based on Ogita et al. (2005) SIAM J. Sci. Computing,
108 ! Vol.26, No.6, pp1955-1988. This is based in turn on an algorithm
109 ! by Knuth (1969, seminumerical algorithms).
110 !
111 ! This version iterates the compensated sum algorithm until the
112 ! result is guaranteed to be within 4*eps of the true sum. It
113 ! then rounds the result to the nearest floating-point number
114 ! whose last three bits are zero, thereby guaranteeing an
115 ! order-independent result.
116 !
117 ! Author: Mike Fisher ECMWF 2006/02/08
118 !
119 !-----------------------------------------------------------------
120 
121 IMPLICIT NONE
122 
123 REAL(KIND=JPRB) :: ORDER_INDEP_LOCAL_SUM
124 REAL(KIND=JPRB), INTENT(IN) :: PIN(:)
125 LOGICAL,OPTIONAL,INTENT(IN) :: LD_ABORT_IFNOT_REPROD, LD_OPENMP
126 
127 INTEGER(KIND=JPIM) :: IN
128 REAL(KIND=JPRB) :: ZCORR,ZERR,ZOLDERR,ZBETA,ZRES
129 REAL(KIND=JPRB), ALLOCATABLE :: ZP(:)
130 LOGICAL :: LLABORT, LL_OPENMP
131 REAL(KIND=JPRB) :: ZHOOK_HANDLE
132 
133 INTEGER(KIND=JPIM), SAVE :: INMSG=0
134 
135 INTEGER(KIND=JPIM), EXTERNAL :: N_PRECISION
136 
137 IF (lhook) CALL dr_hook ('ORDER_INDEPENDENT_SUMMATION_MOD:ORDER_INDEP_LOCAL_SUM', &
138  & 0,zhook_handle)
139 
140 IF (PRESENT(ld_abort_ifnot_reprod)) THEN
141  llabort = ld_abort_ifnot_reprod
142 ELSE
143  llabort = .true.
144 ENDIF
145 
146 IF (PRESENT(ld_openmp)) THEN
147  ll_openmp = ld_openmp
148 ELSE
149  ll_openmp = .false.
150 ENDIF
151 
152 in = SIZE(pin)
153 
154 IF (REAL(2*in,jprb)*EPSILON(zres) >= 1.0) then
155 #ifdef SFX_MPI
156  CALL mpl_message (cdmessage='n is too large to guarantee error bounds', &
157  & cdstring='ORDER_INDEP_LOCAL_SUM',ldabort=.true.)
158 #endif
159 ENDIF
160 
161 test_array_length: IF (in>0) THEN
162  zolderr = huge(zerr)
163 
164 !--- Copy the input array. This avoids some tricky indexing, at the
165 !--- expense of some inefficency.
166 
167  ALLOCATE (zp(in))
168  zp(:) = pin(:)
169 
170  k_loop: DO
171 
172 !--- transform local arrays
173 
174  IF (ll_openmp) THEN
175  CALL compensated_sum_omp (zp,in,zcorr,zerr)
176  ELSE
177  CALL compensated_sum (zp,in,zcorr,zerr)
178  ENDIF
179 
180 !--- Calculate final result
181 
182  zres = zp(in) + zcorr
183 
184 !--- Calculate error bound. This is corollary 4.7 from Ogita et al. (2005)
185 
186  zbeta = zerr*(REAL(2*in,jprb)*EPSILON(zres)) &
187  & /(1.0_JPRB - REAL(2*IN,JPRB)*EPSILON(ZRES))
188 
189  zerr = epsilon(zres)*abs(zres) &
190  & +(zbeta + ( 2.0_jprb*epsilon(zres)*epsilon(zres)*abs(zres) &
191  & +3.0_jprb*tiny(zres)))
192 
193 !--- exit if the error is small enough
194 
195  IF (zerr<4.0_jprb*spacing(zres)) EXIT k_loop
196 
197 !--- Take appropriate action if ZRES cannot be sufficiently refined.
198 
199  IF (zerr >= zolderr) THEN
200  inmsg=inmsg+1
201 
202  IF (inmsg<=100) THEN
203 #ifdef SFX_MPI
204  CALL mpl_message ( &
205  & cdmessage= 'ORDER_INDEP_LOCAL_SUM: FALIED TO REFINE SUM', &
206  & cdstring='ORDER_INDEP_LOCAL_SUM')
207  CALL mpl_message ( &
208  & cdmessage='WARNING: POSSIBLITY OF NON-REPRODUCIBLE RESULTS',&
209  & cdstring='ORDER_INDEP_LOCAL_SUM')
210 #endif
211  ENDIF
212 
213  IF (inmsg==100) THEN
214 #ifdef SFX_MPI
215  CALL mpl_message ( &
216  & cdmessage='ORDER_INDEP_LOCAL_SUM: INMSG>100. OUTPUT SUPPRESSED',&
217  & cdstring='ORDER_INDEP_LOCAL_SUM')
218 #endif
219  ENDIF
220 
221  IF (llabort) THEN
222 #ifdef SFX_MPI
223  CALL mpl_message (cdmessage= &
224  & 'ABORT BECAUSE LD_ABORT_IFNOT_REPROD WAS SET', &
225  & cdstring='ORDER_INDEP_LOCAL_SUM',ldabort=.true.)
226 #endif
227  ENDIF
228  ENDIF
229 
230  zolderr = zerr
231 
232  ENDDO k_loop
233 
234 !--- At this stage, we have guaranteed that ZRES is less than 4*EPS
235 !--- away from the exact sum. There are only eight floating point
236 !--- numbers in this range. So, if we find the nearest number that
237 !--- has its last three bits zero, then we have a reproducible result.
238 
240 
241  DEALLOCATE (zp)
242 ELSE test_array_length
243 
244  order_indep_local_sum = 0.0_jprb
245 
246 ENDIF test_array_length
247 
248 IF (lhook) CALL dr_hook ('ORDER_INDEPENDENT_SUMMATION_MOD:ORDER_INDEP_LOCAL_SUM', &
249  & 1,zhook_handle)
250 
251 END FUNCTION order_indep_local_sum
252 
253 FUNCTION order_indep_global_sum (PIN,KNG,LD_ABORT_IFNOT_REPROD, LD_OPENMP)
255 !-----------------------------------------------------------------
256 !
257 ! Returns an accurate global sum of the elements of PIN. The
258 ! sum is bit-reproducible for any distribution of PIN over
259 ! threads and tasks, and is independent of the ordering of the
260 ! elements of PIN.
261 !
262 ! NB: PIN is unmodified on return
263 !
264 ! Arguments:
265 ! ----------
266 !
267 ! Required:
268 !
269 ! PIN - INTENT(IN) - The array to be summed.
270 !
271 !
272 ! Optional:
273 !
274 ! KNG - INTENT(IN) - Global length of array.
275 !
276 ! LD_ABORT_IFNOT_REPROD - INTENT(IN) - Defines behaviour in case
277 ! a reproducible result cannot
278 ! be guaranteed.
279 !
280 ! LD_OPENMP - INTENT(IN) - Use OpenMP parallelization.
281 !
282 !
283 ! Algorithm:
284 ! ----------
285 !
286 ! The algorithm is based on Ogita et al. (2005) SIAM J. Sci. Computing,
287 ! Vol.26, No.6, pp1955-1988. This is based in turn on an algorithm
288 ! by Knuth (1969, seminumerical algorithms).
289 !
290 ! This version adds a second layer of parallelism on top of that
291 ! provided by COMPENSATED_SUM_OMP. It iterates the compensated
292 ! summation until the result is guaranteed to be within 4*eps
293 ! of the true sum. It then rounds the result to the nearest
294 ! floating-point number whose last three bits are zero, thereby
295 ! guaranteeing an order-independent result.
296 !
297 ! Author: Mike Fisher ECMWF 2006/02/08
298 !
299 !-----------------------------------------------------------------
300 
301 IMPLICIT NONE
302 
303 REAL(KIND=JPRB) :: ORDER_INDEP_GLOBAL_SUM
304 
305 REAL(KIND=JPRB), INTENT(IN) :: PIN(:)
306 INTEGER(KIND=JPIM),OPTIONAL, INTENT(IN) :: KNG
307 LOGICAL, OPTIONAL, INTENT(IN) :: LD_ABORT_IFNOT_REPROD, LD_OPENMP
308 
309 INTEGER(KIND=JPIM) :: J,IN,ING,INPROC
310 REAL(KIND=JPRB) :: ZCORR,ZERR,ZOLDERR,ZBUFFL(3),ZBETA,ZRES
311 REAL(KIND=JPRB), ALLOCATABLE :: ZPSUMS(:),ZPERRS(:),ZPCORS(:), &
312  & ZBUFFG(:),ZP(:)
313 INTEGER(KIND=JPIM), ALLOCATABLE :: IRECVCOUNTS(:)
314 LOGICAL :: LLABORT, LL_OPENMP
315 REAL(KIND=JPRB) :: ZHOOK_HANDLE
316 
317 INTEGER(KIND=JPIM), SAVE :: INMSG=0
318 
319 INTEGER(KIND=JPIM), EXTERNAL :: N_PRECISION
320 
321 IF (lhook) CALL dr_hook ('ORDER_INDEPENDENT_SUMMATION_MOD:ORDER_INDEP_GLOBAL_SUM', &
322  & 0,zhook_handle)
323 
324 #ifdef SFX_MPI
325 inproc = mpl_nproc()
326 #else
327 inproc = 1
328 #endif
329 IF (PRESENT(ld_abort_ifnot_reprod)) THEN
330  llabort = ld_abort_ifnot_reprod
331 ELSE
332  llabort = .true.
333 ENDIF
334 
335 IF (PRESENT(ld_openmp)) THEN
336  ll_openmp = ld_openmp
337 ELSE
338  ll_openmp = .true.
339 ENDIF
340 
341 in = SIZE(pin)
342 
343 !--- global length of vector (needed for error bound calculation)
344 
345 IF (.NOT.PRESENT(kng)) THEN
346  ing = in
347  IF (inproc>1) THEN
348 #ifdef SFX_MPI
349  CALL mpl_allreduce (ing,'SUM',cdstring='ORDER_INDEP_GLOBAL_SUM')
350 #endif
351  ENDIF
352 ELSE
353  ing = kng
354  IF (kng<in) THEN
355 #ifdef SFX_MPI
356  CALL mpl_message (cdmessage='Specified KNG < SIZE(PIN)', &
357  & cdstring='ORDER_INDEP_GLOBAL_SUM',ldabort=.true.)
358 #endif
359  ENDIF
360 ENDIF
361 
362 IF (REAL(2*ing,jprb)*EPSILON(zres) >= 1.0) then
363 #ifdef SFX_MPI
364  CALL mpl_message (cdmessage='n is too large to guarantee error bounds', &
365  & cdstring='ORDER_INDEP_GLOBAL_SUM',ldabort=.true.)
366 #endif
367 ENDIF
368 
369 ALLOCATE (zp(max(in,1_jpim)))
370 ALLOCATE (zbuffg(inproc*SIZE(zbuffl)))
371 ALLOCATE (zpsums(inproc))
372 ALLOCATE (zperrs(inproc))
373 ALLOCATE (zpcors(inproc))
374 ALLOCATE (irecvcounts(inproc))
375 
376 zolderr = huge(zerr)
377 
378 !--- Copy the input array. This avoids some tricky indexing, at the
379 !--- expense of some inefficency.
380 
381 IF (in>0) THEN
382  zp(:) = pin(:)
383 ELSE
384  zp(1) = 0.0_jprb
385 ENDIF
386 
387 k_loop: DO
388 
389 !--- transform local arrays
390 
391  IF (in>0) THEN
392  IF (ll_openmp) THEN
393  CALL compensated_sum_omp (zp,in,zcorr,zerr)
394  ELSE
395  CALL compensated_sum (zp,in,zcorr,zerr)
396  ENDIF
397  ENDIF
398 
399 !--- gather partial sums and error bounds to all processors
400 
401  zbuffl(1) = zp(max(in,1_jpim))
402 
403  IF (in>0) THEN
404  zbuffl(2) = zerr
405  zbuffl(3) = zcorr
406  ELSE
407  zbuffl(2) = 0.0_jprb
408  zbuffl(3) = 0.0_jprb
409  ENDIF
410 
411  IF (inproc>1) THEN
412 !-- could use MPL_ALLGATHER here, if it existed!
413 
414  irecvcounts(:) = SIZE(zbuffl)
415 #ifdef SFX_MPI
416  CALL mpl_allgatherv (zbuffl,zbuffg,irecvcounts, &
417  & cdstring='ORDER_INDEP_GLOBAL_SUM')
418 #endif
419  DO j=1,inproc
420  zpsums(j) = zbuffg(1+(j-1)*SIZE(zbuffl))
421  zperrs(j) = zbuffg(2+(j-1)*SIZE(zbuffl))
422  zpcors(j) = zbuffg(3+(j-1)*SIZE(zbuffl))
423  ENDDO
424  ELSE
425  zpsums(1) = zbuffl(1)
426  zperrs(1) = zbuffl(2)
427  zpcors(1) = zbuffl(3)
428  ENDIF
429 
430 !--- transform partial sums
431 
432  CALL compensated_sum (zpsums,inproc,zcorr,zerr)
433  zerr = zerr + sum(zperrs)
434  zcorr = zcorr + sum(zpcors)
435 
436 !--- Calculate final result
437 
438  zres = zpsums(inproc) + zcorr
439 
440 !--- Calculate error bound. This is corollary 4.7 from Ogita et al. (2005)
441 
442  zbeta = zerr*(REAL(2*ing,jprb)*EPSILON(zres)) &
443  & /(1.0_JPRB - REAL(2*ING,JPRB)*EPSILON(ZRES))
444 
445  zerr = epsilon(zres)*abs(zres) &
446  & +(zbeta + ( 2.0_jprb*epsilon(zres)*epsilon(zres)*abs(zres) &
447  & +3.0_jprb*tiny(zres)))
448 
449 !--- update the last element of the local array
450 
451 #ifdef SFX_MPI
452  zp(max(in,1_jpim)) = zpsums(mpl_myrank())
453 #endif
454 
455 !--- exit if the global error is small enough
456 
457  IF (zerr<4.0_jprb*spacing(zres)) EXIT k_loop
458 
459 !--- Take appropriate action if ZRES cannot be sufficiently refined.
460 
461  IF (zerr >= zolderr) THEN
462  inmsg=inmsg+1
463 
464  IF (inmsg<=100) THEN
465 #ifdef SFX_MPI
466  CALL mpl_message ( &
467  & cdmessage= 'ORDER_INDEP_GLOBAL_SUM: FALIED TO REFINE SUM', &
468  & cdstring='ORDER_INDEP_GLOBAL_SUM')
469  CALL mpl_message ( &
470  & cdmessage='WARNING: POSSIBLITY OF NON-REPRODUCIBLE RESULTS',&
471  & cdstring='ORDER_INDEP_GLOBAL_SUM')
472 #endif
473  ENDIF
474 
475  IF (inmsg==100) THEN
476 #ifdef SFX_MPI
477  CALL mpl_message ( &
478  & cdmessage='ORDER_INDEP_GLOBAL_SUM: INMSG>100. OUTPUT SUPPRESSED',&
479  & cdstring='ORDER_INDEP_GLOBAL_SUM')
480 #endif
481  ENDIF
482 
483  IF (llabort) THEN
484 #ifdef SFX_MPI
485  CALL mpl_message (cdmessage= &
486  & 'ABORT BECAUSE LD_ABORT_IFNOT_REPROD WAS SET', &
487  & cdstring='ORDER_INDEP_GLOBAL_SUM',ldabort=.true.)
488 #endif
489  ENDIF
490  ENDIF
491 
492  zolderr = zerr
493 
494 ENDDO k_loop
495 
496 !--- At this stage, we have guaranteed that ZRES less than 4*EPS
497 !--- away from the exact sum. There are only four floating point
498 !--- numbers in this range. So, if we find the nearest number that
499 !--- has its last three bits zero, then we have a reproducible result.
500 
502 
503 DEALLOCATE (irecvcounts)
504 DEALLOCATE (zpcors)
505 DEALLOCATE (zperrs)
506 DEALLOCATE (zpsums)
507 DEALLOCATE (zbuffg)
508 DEALLOCATE (zp)
509 
510 IF (lhook) CALL dr_hook ('ORDER_INDEPENDENT_SUMMATION_MOD:ORDER_INDEP_GLOBAL_SUM', &
511  & 1,zhook_handle)
512 
513 END FUNCTION order_indep_global_sum
514 
515 SUBROUTINE order_indep_global_sum2 (PIN,POUT,KNVEC,KDIM,KNL,LD_ABORT_IFNOT_REPROD,LD_OPENMP)
517 !-----------------------------------------------------------------
518 !
519 ! This is a vector version of ORDER_INDEP_GLOBAL_SUM, which
520 ! returns a vector of accurate global sums of the elements of matrix
521 ! PIN along dimension KDIM. The sum is bit-reproducible for any
522 ! distribution of PIN over threads and tasks, and is independent of
523 ! the ordering of the elements of PIN.
524 !
525 ! NB: PIN is unmodified on return
526 !
527 ! Arguments:
528 ! ----------
529 !
530 ! Required:
531 !
532 ! PIN - INTENT(IN) - The array to be summed.
533 !
534 ! POUT - INTENT(OUT) - The vector with sums
535 !
536 ! KDIM - INTENT(IN) - The dimension to sum along.
537 !
538 ! KNL - INTENT(IN) - Local lengths of vectors.
539 !
540 !
541 ! Optional:
542 !
543 ! LD_ABORT_IFNOT_REPROD - INTENT(IN) - Defines behaviour in case
544 ! a reproducible result cannot
545 ! be guaranteed.
546 !
547 ! LD_OPENMP - INTENT(IN) - Use OpenMP parallelization.
548 !
549 !
550 ! Algorithm:
551 ! ----------
552 !
553 ! The algorithm is based on Ogita et al. (2005) SIAM J. Sci. Computing,
554 ! Vol.26, No.6, pp1955-1988. This is based in turn on an algorithm
555 ! by Knuth (1969, seminumerical algorithms).
556 !
557 ! This version adds a second layer of parallelism on top of that
558 ! provided by COMPENSATED_SUM_OMP. It iterates the compensated
559 ! summation until the result is guaranteed to be within 4*eps
560 ! of the true sum. It then rounds the result to the nearest
561 ! floating-point number whose last three bits are zero, thereby
562 ! guaranteeing an order-independent result.
563 !
564 ! Author: Tomas Wilhelmsson ECMWF 2010/03/30
565 !
566 !-----------------------------------------------------------------
567 
568 IMPLICIT NONE
569 
570 INTEGER(KIND=JPIM), INTENT(IN) :: KNVEC
571 REAL(KIND=JPRB), INTENT(IN) :: PIN(:,:)
572 REAL(KIND=JPRB), INTENT(OUT) :: POUT(knvec)
573 INTEGER(KIND=JPIM), INTENT(IN) :: KDIM
574 INTEGER(KIND=JPIM), INTENT(IN) :: KNL(knvec)
575 LOGICAL, OPTIONAL, INTENT(IN) :: LD_ABORT_IFNOT_REPROD, LD_OPENMP
576 
577 INTEGER(KIND=JPIM) :: J,JL,JP,IBUFLEN,INVEC,INPROC,ING(knvec)
578 REAL(KIND=JPRB), DIMENSION(KNVEC) :: ZCORR,ZERR,ZOLDERR,ZBETA,ZRES
579 REAL(KIND=JPRB), ALLOCATABLE :: ZPSUMS(:,:),ZPERRS(:,:),ZPCORS(:,:), &
580  & ZBUFFL(:),ZBUFFG(:),ZP(:,:)
581 INTEGER(KIND=JPIM), ALLOCATABLE :: IRECVCOUNTS(:)
582 LOGICAL :: LLABORT, LL_OPENMP, LLDONE(knvec)
583 REAL(KIND=JPRB) :: ZHOOK_HANDLE
584 
585 INTEGER(KIND=JPIM), SAVE :: INMSG=0
586 
587 INTEGER(KIND=JPIM), EXTERNAL :: N_PRECISION
588 
589 IF (lhook) CALL dr_hook ('ORDER_INDEPENDENT_SUMMATION_MOD:ORDER_INDEP_GLOBAL_SUM2', &
590  & 0,zhook_handle)
591 
592 IF (kdim<1 .OR. kdim>2) THEN
593 #ifdef SFX_MPI
594  CALL mpl_message (cdmessage='Invalid KDIM value', &
595  & cdstring='ORDER_INDEP_GLOBAL_SUM2',ldabort=.true.)
596 #endif
597 ENDIF
598 #ifdef SFX_MPI
599 inproc = mpl_nproc()
600 #else
601 inproc = 1
602 #endif
603 IF (PRESENT(ld_abort_ifnot_reprod)) THEN
604  llabort = ld_abort_ifnot_reprod
605 ELSE
606  llabort = .true.
607 ENDIF
608 
609 IF (PRESENT(ld_openmp)) THEN
610  ll_openmp = ld_openmp
611 ELSE
612  ll_openmp = .true.
613 ENDIF
614 
615 !--- global lengths of vectors (needed for error bound calculation)
616 
617 ing(:) = knl(:)
618 IF (inproc>1) THEN
619 #ifdef SFX_MPI
620  CALL mpl_allreduce (ing,'SUM',cdstring='ORDER_INDEP_GLOBAL_SUM2')
621 #endif
622 ENDIF
623 
624 IF (any(REAL(2*ING(:),JPRB)*epsilon(ZRES) >= 1.0)) then
625 #ifdef SFX_MPI
626  CALL mpl_message (cdmessage='n is too large to guarantee error bounds', &
627  & cdstring='ORDER_INDEP_GLOBAL_SUM2',ldabort=.true.)
628 #endif
629 ENDIF
630 
631 ibuflen=3
632 
633 ALLOCATE (zp(max(maxval(knl),1_jpim),knvec))
634 ALLOCATE (zbuffl(ibuflen*knvec))
635 ALLOCATE (zbuffg(inproc*ibuflen*knvec))
636 ALLOCATE (zpsums(inproc,knvec))
637 ALLOCATE (zperrs(inproc,knvec))
638 ALLOCATE (zpcors(inproc,knvec))
639 ALLOCATE (irecvcounts(inproc))
640 
641 zolderr(:) = huge(zerr)
642 lldone(:) = .false.
643 
644 !--- Copy the input array. This avoids some tricky indexing, at the
645 !--- expense of some inefficency.
646 
647 
648 DO j=1,knvec
649  IF (knl(j)>0) THEN
650  IF (kdim==1) zp(1:knl(j),j) = pin(1:knl(j),j)
651  IF (kdim==2) zp(1:knl(j),j) = pin(j,1:knl(j))
652  ELSE
653  zp(1,j) = 0.0_jprb
654  ENDIF
655 ENDDO
656 
657 k_loop: DO
658 
659 !--- transform local arrays
660 
661  jl=0
662  DO j=1,knvec
663  IF (knl(j)>0 .AND. .NOT. lldone(j)) THEN
664  IF (ll_openmp) THEN
665  CALL compensated_sum_omp (zp(:,j),knl(j),zcorr(j),zerr(j))
666  ELSE
667  CALL compensated_sum (zp(:,j),knl(j),zcorr(j),zerr(j))
668  ENDIF
669  ENDIF
670 
671 !--- gather partial sums and error bounds to all processors
672 
673  zbuffl(jl+1) = zp(max(knl(j),1_jpim),j)
674 
675  IF (knl(j)>0) THEN
676  zbuffl(jl+2) = zerr(j)
677  zbuffl(jl+3) = zcorr(j)
678  ELSE
679  zbuffl(jl+2) = 0.0_jprb
680  zbuffl(jl+3) = 0.0_jprb
681  ENDIF
682  jl = jl + ibuflen
683  ENDDO
684 
685  IF (inproc>1) THEN
686 !-- could use MPL_ALLGATHER here, if it existed!
687 
688  irecvcounts(:) = SIZE(zbuffl)
689 #ifdef SFX_MPI
690  CALL mpl_allgatherv (zbuffl,zbuffg,irecvcounts, &
691  & cdstring='ORDER_INDEP_GLOBAL_SUM2')
692 #endif
693  DO jp=1,inproc
694  jl = 0
695  DO j = 1,knvec
696  zpsums(jp,j) = zbuffg(jl+1+(jp-1)*SIZE(zbuffl))
697  zperrs(jp,j) = zbuffg(jl+2+(jp-1)*SIZE(zbuffl))
698  zpcors(jp,j) = zbuffg(jl+3+(jp-1)*SIZE(zbuffl))
699  jl = jl + ibuflen
700  ENDDO
701  ENDDO
702  ELSE
703  jl = 0
704  DO j = 1,knvec
705  zpsums(1,j) = zbuffl(jl+1)
706  zperrs(1,j) = zbuffl(jl+2)
707  zpcors(1,j) = zbuffl(jl+3)
708  ENDDO
709  ENDIF
710 
711 !--- transform partial sums
712 
713  DO j = 1,knvec
714  IF (lldone(j)) cycle
715 
716  CALL compensated_sum (zpsums(:,j),inproc,zcorr(j),zerr(j))
717  zerr(j) = zerr(j) + sum(zperrs(:,j))
718  zcorr(j) = zcorr(j) + sum(zpcors(:,j))
719 
720 !--- Calculate final result
721 
722  zres(j) = zpsums(inproc,j) + zcorr(j)
723 
724 !--- Calculate error bound. This is corollary 4.7 from Ogita et al. (2005)
725 
726  zbeta(j) = zerr(j)*(REAL(2*ING(J),jprb)*epsilon(zres(J))) &
727  & /(1.0_JPRB - REAL(2*ING(J),JPRB)*EPSILON(ZRES(J)))
728 
729  zerr(j) = epsilon(zres(j))*abs(zres(j)) &
730  & +(zbeta(j) + ( 2.0_jprb*epsilon(zres(j))*epsilon(zres(j))*abs(zres(j)) &
731  & +3.0_jprb*tiny(zres(j))))
732 
733 !--- update the last element of the local array
734 #ifdef SFX_MPI
735  zp(max(knl(j),1_jpim),j) = zpsums(mpl_myrank(),j)
736 #endif
737  ENDDO
738 
739 !--- exit if the global error is small enough
740 
741  lldone(:) = (zerr(:)<4.0_jprb*spacing(zres(:))) .OR. lldone(:)
742 
743  IF (all(lldone(:))) EXIT k_loop
744 
745 !--- Take appropriate action if ZRES cannot be sufficiently refined.
746 
747  DO j = 1,knvec
748  IF (zerr(j) >= zolderr(j) .AND. .NOT. lldone(j)) THEN
749  inmsg=inmsg+1
750 
751  IF (inmsg<=100) THEN
752 #ifdef SFX_MPI
753  CALL mpl_message ( &
754  & cdmessage= 'ORDER_INDEP_GLOBAL_SUM2: FALIED TO REFINE SUM', &
755  & cdstring='ORDER_INDEP_GLOBAL_SUM2')
756  CALL mpl_message ( &
757  & cdmessage='WARNING: POSSIBLITY OF NON-REPRODUCIBLE RESULTS',&
758  & cdstring='ORDER_INDEP_GLOBAL_SUM2')
759 #endif
760  ENDIF
761 
762  IF (inmsg==100) THEN
763 #ifdef SFX_MPI
764  CALL mpl_message ( &
765  & cdmessage='ORDER_INDEP_GLOBAL_SUM2: INMSG>100. OUTPUT SUPPRESSED',&
766  & cdstring='ORDER_INDEP_GLOBAL_SUM2')
767 #endif
768  ENDIF
769 
770  IF (llabort) THEN
771 #ifdef SFX_MPI
772  CALL mpl_message (cdmessage= &
773  & 'ABORT BECAUSE LD_ABORT_IFNOT_REPROD WAS SET', &
774  & cdstring='ORDER_INDEP_GLOBAL_SUM2',ldabort=.true.)
775 #endif
776  ENDIF
777  ENDIF
778 
779  zolderr(j) = zerr(j)
780  ENDDO
781 
782 ENDDO k_loop
783 
784 !--- At this stage, we have guaranteed that ZRES less than 4*EPS
785 !--- away from the exact sum. There are only four floating point
786 !--- numbers in this range. So, if we find the nearest number that
787 !--- has its last three bits zero, then we have a reproducible result.
788 
789 DO j=1,knvec
790  pout(j) = round(zres(j))
791 ENDDO
792 
793 DEALLOCATE (irecvcounts)
794 DEALLOCATE (zpcors)
795 DEALLOCATE (zperrs)
796 DEALLOCATE (zpsums)
797 DEALLOCATE (zbuffg)
798 DEALLOCATE (zbuffl)
799 DEALLOCATE (zp)
800 
801 IF (lhook) CALL dr_hook ('ORDER_INDEPENDENT_SUMMATION_MOD:ORDER_INDEP_GLOBAL_SUM2', &
802  & 1,zhook_handle)
803 
804 END SUBROUTINE order_indep_global_sum2
805 
806 SUBROUTINE order_indep_allreduce (PIN,POUT,LD_ABORT_IFNOT_REPROD,LD_OPENMP)
808 !-----------------------------------------------------------------
809 !
810 ! Returns in POUT an accurate global sum of the elements of PIN across tasks.
811 ! This has a similar functionality to MPL_allreduce where an array is supplied
812 ! and we want the individual elements of the array to be summed across tasks.
813 ! This is different to ORDER_INDEP_GLOBAL_SUM where PIN is considered part of
814 ! a global array.
815 !
816 ! Arguments:
817 ! ----------
818 !
819 ! Required:
820 !
821 ! PIN - INTENT(IN) - input array to be summed.
822 ! POUT - INTENT(OUT) - output array of same size
823 !
824 !
825 ! Optional:
826 !
827 ! LD_ABORT_IFNOT_REPROD - INTENT(IN) - Defines behaviour in case
828 ! a reproducible result cannot
829 ! be guaranteed.
830 !
831 ! LD_OPENMP - INTENT(IN) - Use OpenMP parallelization.
832 !
833 !
834 ! Author: George Mozdzynski ECMWF June 2009
835 !
836 !-----------------------------------------------------------------
837 
838 IMPLICIT NONE
839 
840 REAL(KIND=JPRB) :: ORDER_INDEP_GLOBAL_SUM
841 
842 REAL(KIND=JPRB), INTENT(IN) :: PIN(:)
843 REAL(KIND=JPRB), INTENT(OUT):: POUT(:)
844 LOGICAL, OPTIONAL, INTENT(IN) :: LD_ABORT_IFNOT_REPROD, LD_OPENMP
845 
846 INTEGER(KIND=JPIM) :: INPROC,MYPROC,IN,ITAG,I,J,IR
847 INTEGER(KIND=JPIM), ALLOCATABLE :: ICOUNT(:),IND(:),IREQ(:)
848 REAL(KIND=JPRB), ALLOCATABLE :: ZBUFF(:),ZIN(:),ZOUT(:)
849 REAL(KIND=JPRB) :: ZDUM(2)
850 
851 LOGICAL :: LLABORT, LL_OPENMP
852 REAL(KIND=JPRB) :: ZHOOK_HANDLE
853 
854 IF (lhook) CALL dr_hook ('ORDER_INDEPENDENT_SUMMATION_MOD:ORDER_INDEP_ALLREDUCE', &
855  & 0,zhook_handle)
856 #ifdef SFX_MPI
857 inproc = mpl_nproc()
858 #else
859 inproc = 1
860 #endif
861 IF (PRESENT(ld_abort_ifnot_reprod)) THEN
862  llabort = ld_abort_ifnot_reprod
863 ELSE
864  llabort = .true.
865 ENDIF
866 
867 IF (PRESENT(ld_openmp)) THEN
868  ll_openmp = ld_openmp
869 ELSE
870  ll_openmp = .true.
871 ENDIF
872 
873 IF( SIZE(pin) /= SIZE(pout) )THEN
874 #ifdef SFX_MPI
875  CALL mpl_message (cdmessage='SIZE(PIN) /= SIZE(POUT)', &
876  & cdstring='ORDER_INDEP_ALLREDUCE',ldabort=.true.)
877 #endif
878 ENDIF
879 
880 in = SIZE(pin)
881 
882 IF (inproc==1) THEN
883  pout(:)=pin(:)
884 ELSE
885  itag=1234
886  ALLOCATE(icount(inproc))
887  icount(:) = 0
888 
889 ! Determine distribution of input array over tasks
890 
891  DO j=1,in
892  i=mod(j-1,inproc)+1
893  icount(i)=icount(i)+1
894  ENDDO
895  ALLOCATE(ind(inproc))
896  ind(:)=0
897  ind(1)=1
898  DO j=2,inproc
899  ind(j)=ind(j-1)+icount(j-1)
900  ENDDO
901 #ifdef SFX_MPI
902  myproc = mpl_myrank()
903 #else
904  myproc = 0
905 #endif
906  ALLOCATE(zbuff(icount(myproc)*inproc))
907  ALLOCATE(ireq(2*inproc))
908  ALLOCATE(zin(inproc))
909 
910 ! Distribute input array over tasks
911 
912  ir=0
913  IF(icount(myproc) /= 0)THEN
914  DO j=1,inproc
915  ir=ir+1
916 #ifdef SFX_MPI
917  CALL mpl_recv (zbuff((j-1)*icount(myproc)+1:j*icount(myproc)),&
918  &ksource=j,&
919  &ktag=itag,&
920  &kmp_type=jp_non_blocking_standard,&
921  &krequest=ireq(ir),&
922  &cdstring='ORDER_INDEP_ALLREDUCE')
923 #endif
924  ENDDO
925  ENDIF
926  DO j=1,inproc
927  IF(icount(j) /= 0)THEN
928  ir=ir+1
929 #ifdef SFX_MPI
930  CALL mpl_send(pin(ind(j):ind(j)+icount(j)-1),&
931  &kdest=j,&
932  &ktag=itag,&
933  &kmp_type=jp_non_blocking_standard,&
934  &krequest=ireq(ir),&
935  &cdstring='ORDER_INDEP_ALLREDUCE')
936 #endif
937  ENDIF
938  ENDDO
939  IF(ir > 0)THEN
940 #ifdef SFX_MPI
941  CALL mpl_wait(zdum,krequest=ireq(1:ir),&
942  &cdstring='ORDER_INDEP_ALLREDUCE')
943 #endif
944  ENDIF
945 
946 ! Perform local order independent sums for myproc's part of input array
947 
948  ALLOCATE(zout(in))
949  DO j=1,icount(myproc)
950  DO i=1,inproc
951  zin(i)=zbuff((i-1)*icount(myproc)+j)
952  ENDDO
953  zout(j)=order_indep_local_sum(zin,llabort,ll_openmp)
954  ENDDO
955 
956 ! Gather results of order independent sums over tasks
957 #ifdef SFX_MPI
958  CALL mpl_allgatherv (zout(1:icount(myproc)),pout,icount, &
959  & cdstring='ORDER_INDEP_ALLREDUCE')
960 #endif
961  DEALLOCATE (icount)
962  DEALLOCATE (ind)
963  DEALLOCATE (ireq)
964  DEALLOCATE (zbuff)
965  DEALLOCATE (zin)
966  DEALLOCATE (zout)
967 
968 ENDIF
969 
970 IF (lhook) CALL dr_hook ('ORDER_INDEPENDENT_SUMMATION_MOD:ORDER_INDEP_ALLREDUCE', &
971  & 1,zhook_handle)
972 
973 END SUBROUTINE order_indep_allreduce
974 
975 FUNCTION order_indep_dot_product (P1,P2,PW,KNG,LD_ABORT_IFNOT_REPROD, &
976  & LD_OPENMP)
978 !-----------------------------------------------------------------
979 !
980 ! Returns an accurate global sum of the elements of P1*P2, or
981 ! P1*P2*PW. The result is identical to the result that would be
982 ! obtained by the following:
983 !
984 ! IF (PRESENT(PW)) THEN
985 ! PTEMP(:) = P1(:)*P2(:)*PW(:)
986 ! ELSE
987 ! PTEMP(:) = P1(:)*P2(:)
988 ! ENDIF
989 ! CALL ORDER_INDEP_GLOBAL_SUM (PTEMP,KNG,LD_ABORT_IFNOT_REPROD, &
990 ! & LD_OPENMP)
991 !
992 ! This routine is provided only because the above is not very
993 ! cache-friendly.
994 !
995 ! NB: P1, P2 and PW are unmodified on return
996 !
997 ! Arguments:
998 ! ----------
999 !
1000 ! Required:
1001 !
1002 ! P1,P2 - INTENT(IN) - Arrays whose inner product is
1003 ! to be calculated
1004 !
1005 !
1006 ! Optional:
1007 !
1008 ! PW - INTENT(IN) - Weight array defining the
1009 ! metric for the inner product.
1010 !
1011 ! KNG - INTENT(IN) - Global length of array.
1012 !
1013 ! LD_ABORT_IFNOT_REPRO - INTENT(IN) - Defines behaviour in case
1014 ! a reproducible result cannot
1015 ! be guaranteed.
1016 !
1017 ! LD_OPENMP - INTENT(IN) - Use OpenMP parallelization.
1018 !
1019 !
1020 ! Algorithm:
1021 ! ----------
1022 !
1023 ! The algorithm is based on Ogita et al. (2005) SIAM J. Sci. Computing,
1024 ! Vol.26, No.6, pp1955-1988. This is based in turn on an algorithm
1025 ! by Knuth (1969, seminumerical algorithms).
1026 !
1027 ! This version adds a second layer of parallelism on top of that
1028 ! provided by COMPENSATED_SUM_OMP. It iterates the compensated
1029 ! summation until the result is guaranteed to be within 4*eps
1030 ! of the true sum. It then rounds the result to the nearest
1031 ! floating-point number whose last three bits are zero, thereby
1032 ! guaranteeing an order-independent result.
1033 !
1034 ! Author: Mike Fisher ECMWF 2006/02/08
1035 !
1036 !-----------------------------------------------------------------
1037 
1038 IMPLICIT NONE
1039 
1040 REAL(KIND=JPRB) :: ORDER_INDEP_DOT_PRODUCT
1041 
1042 REAL(KIND=JPRB), INTENT(IN) :: P1(:), P2(:)
1043 REAL(KIND=JPRB), OPTIONAL, INTENT(IN) :: PW(:)
1044 INTEGER(KIND=JPIM),OPTIONAL, INTENT(IN) :: KNG
1045 LOGICAL, OPTIONAL, INTENT(IN) :: LD_ABORT_IFNOT_REPROD, LD_OPENMP
1046 
1047 INTEGER(KIND=JPIM) :: J,IN,ING,INPROC
1048 REAL(KIND=JPRB) :: ZCORR,ZERR,ZOLDERR,ZBUFFL(3),ZBETA,ZRES
1049 REAL(KIND=JPRB), ALLOCATABLE :: ZPSUMS(:),ZPERRS(:),ZPCORS(:), &
1050  & ZBUFFG(:),ZP(:)
1051 INTEGER(KIND=JPIM), ALLOCATABLE :: IRECVCOUNTS(:)
1052 LOGICAL :: LLABORT, LL_OPENMP, LL_FIRST_ITER
1053 REAL(KIND=JPRB) :: ZHOOK_HANDLE
1054 
1055 INTEGER(KIND=JPIM), SAVE :: INMSG=0
1056 
1057 INTEGER(KIND=JPIM), EXTERNAL :: N_PRECISION
1058 
1059 IF (lhook) CALL dr_hook ( &
1060  &'ORDER_INDEPENDENT_SUMMATION_MOD:ORDER_INDEP_DOT_PRODUCT', &
1061  & 0,zhook_handle)
1062 #ifdef SFX_MPI
1063 inproc = mpl_nproc()
1064 #else
1065 inproc = 1
1066 #endif
1067 IF (PRESENT(ld_abort_ifnot_reprod)) THEN
1068  llabort = ld_abort_ifnot_reprod
1069 ELSE
1070  llabort = .true.
1071 ENDIF
1072 
1073 IF (PRESENT(ld_openmp)) THEN
1074  ll_openmp = ld_openmp
1075 ELSE
1076  ll_openmp = .true.
1077 ENDIF
1078 
1079 in = SIZE(p1)
1080 
1081 IF (SIZE(p2)/=in) THEN
1082 #ifdef SFX_MPI
1083  CALL mpl_message (cdmessage='SIZE(P2)/=SIZE(P1)', &
1084  & cdstring='ORDER_INDEP_DOT_PRODUCT',ldabort=.true.)
1085 #endif
1086 ENDIF
1087 
1088 IF (PRESENT(pw)) THEN
1089  IF (SIZE(pw)/=in) THEN
1090 #ifdef SFX_MPI
1091  CALL mpl_message (cdmessage='SIZE(PW)/=SIZE(P1)', &
1092  & cdstring='ORDER_INDEP_DOT_PRODUCT',ldabort=.true.)
1093 #endif
1094  ENDIF
1095 ENDIF
1096 
1097 !--- global length of vector (needed for error bound calculation)
1098 
1099 IF (.NOT.PRESENT(kng)) THEN
1100  ing = in
1101  IF (inproc>1) THEN
1102 #ifdef SFX_MPI
1103  CALL mpl_allreduce (ing,'SUM',cdstring='ORDER_INDEP_DOT_PRODUCT')
1104 #endif
1105  ENDIF
1106 ELSE
1107  ing = kng
1108  IF (kng<in) THEN
1109 #ifdef SFX_MPI
1110  CALL mpl_message (cdmessage='Specified KNG < SIZE(PIN)', &
1111  & cdstring='ORDER_INDEP_DOT_PRODUCT',ldabort=.true.)
1112 #endif
1113  ENDIF
1114 ENDIF
1115 
1116 IF (REAL(2*ing,jprb)*EPSILON(zres) >= 1.0) then
1117 #ifdef SFX_MPI
1118  CALL mpl_message (cdmessage='n is too large to guarantee error bounds', &
1119  & cdstring='ORDER_INDEP_DOT_PRODUCT',ldabort=.true.)
1120 #endif
1121 ENDIF
1122 
1123 ALLOCATE (zp(max(in,1_jpim)))
1124 ALLOCATE (zbuffg(inproc*SIZE(zbuffl)))
1125 ALLOCATE (zpsums(inproc))
1126 ALLOCATE (zperrs(inproc))
1127 ALLOCATE (zpcors(inproc))
1128 ALLOCATE (irecvcounts(inproc))
1129 
1130 zolderr = huge(zerr)
1131 
1132 !--- Copy the input array. This avoids some tricky indexing, at the
1133 !--- expense of some inefficency.
1134 
1135 IF (in==0) THEN
1136  zp(1) = 0.0_jprb
1137 ENDIF
1138 
1139 ll_first_iter = .true.
1140 
1141 k_loop: DO
1142 
1143 !--- transform local arrays
1144 
1145  IF (in>0) THEN
1146  IF (ll_first_iter) THEN
1147  IF (PRESENT(pw)) THEN
1148  IF (ll_openmp) THEN
1149  CALL compensated_dot_product_omp (p1,p2,pw,zp,in,zcorr,zerr)
1150  ELSE
1151  CALL compensated_dot_product (p1,p2,pw,zp,in,zcorr,zerr)
1152  ENDIF
1153  ELSE
1154  IF (ll_openmp) THEN
1155  CALL compensated_dot_product_omp (p1=p1,p2=p2,pout=zp, &
1156  & kn=in,pcorr=zcorr,perr=zerr)
1157  ELSE
1158  CALL compensated_dot_product (p1=p1,p2=p2,pout=zp, &
1159  & kn=in,pcorr=zcorr,perr=zerr)
1160  ENDIF
1161  ENDIF
1162  ELSE
1163  IF (ll_openmp) THEN
1164  CALL compensated_sum_omp (zp,in,zcorr,zerr)
1165  ELSE
1166  CALL compensated_sum (zp,in,zcorr,zerr)
1167  ENDIF
1168  ENDIF
1169  ENDIF
1170 
1171 !--- gather partial sums and error bounds to all processors
1172 
1173  zbuffl(1) = zp(max(in,1_jpim))
1174 
1175  IF (in>0) THEN
1176  zbuffl(2) = zerr
1177  zbuffl(3) = zcorr
1178  ELSE
1179  zbuffl(2) = 0.0_jprb
1180  zbuffl(3) = 0.0_jprb
1181  ENDIF
1182 
1183  IF (inproc>1) THEN
1184 !-- could use MPL_ALLGATHER here, if it existed!
1185 
1186  irecvcounts(:) = SIZE(zbuffl)
1187 #ifdef SFX_MPI
1188  CALL mpl_allgatherv (zbuffl,zbuffg,irecvcounts, &
1189  & cdstring='ORDER_INDEP_DOT_PRODUCT')
1190 #endif
1191  DO j=1,inproc
1192  zpsums(j) = zbuffg(1+(j-1)*SIZE(zbuffl))
1193  zperrs(j) = zbuffg(2+(j-1)*SIZE(zbuffl))
1194  zpcors(j) = zbuffg(3+(j-1)*SIZE(zbuffl))
1195  ENDDO
1196  ELSE
1197  zpsums(1) = zbuffl(1)
1198  zperrs(1) = zbuffl(2)
1199  zpcors(1) = zbuffl(3)
1200  ENDIF
1201 
1202 !--- transform partial sums
1203 
1204  CALL compensated_sum (zpsums,inproc,zcorr,zerr)
1205  zerr = zerr + sum(zperrs)
1206  zcorr = zcorr + sum(zpcors)
1207 
1208 !--- Calculate final result
1209 
1210  zres = zpsums(inproc) + zcorr
1211 
1212 !--- Calculate error bound. This is corollary 4.7 from Ogita et al. (2005)
1213 
1214  zbeta = zerr*(REAL(2*ing,jprb)*EPSILON(zres)) &
1215  & /(1.0_JPRB - REAL(2*ING,JPRB)*EPSILON(ZRES))
1216 
1217  zerr = epsilon(zres)*abs(zres) &
1218  & +(zbeta + ( 2.0_jprb*epsilon(zres)*epsilon(zres)*abs(zres) &
1219  & +3.0_jprb*tiny(zres)))
1220 
1221 !--- update the last element of the local array
1222 #ifdef SFX_MPI
1223  zp(max(in,1_jpim)) = zpsums(mpl_myrank())
1224 #endif
1225 
1226 !--- exit if the global error is small enough
1227 
1228  IF (zerr<4.0_jprb*spacing(zres)) EXIT k_loop
1229 
1230 !--- Take appropriate action if ZRES cannot be sufficiently refined.
1231 
1232  IF (zerr >= zolderr) THEN
1233  inmsg=inmsg+1
1234 
1235  IF (inmsg<=100) THEN
1236 #ifdef SFX_MPI
1237  CALL mpl_message ( &
1238  & cdmessage= 'ORDER_INDEP_DOT_PRODUCT: FALIED TO REFINE SUM', &
1239  & cdstring='ORDER_INDEP_DOT_PRODUCT')
1240  CALL mpl_message ( &
1241  & cdmessage='WARNING: POSSIBLITY OF NON-REPRODUCIBLE RESULTS',&
1242  & cdstring='ORDER_INDEP_DOT_PRODUCT')
1243 #endif
1244  ENDIF
1245 
1246  IF (inmsg==100) THEN
1247 #ifdef SFX_MPI
1248  CALL mpl_message ( &
1249  & cdmessage='ORDER_INDEP_DOT_PRODUCT: INMSG>100. OUTPUT SUPPRESSED',&
1250  & cdstring='ORDER_INDEP_DOT_PRODUCT')
1251 #endif
1252  ENDIF
1253 
1254  IF (llabort) THEN
1255 #ifdef SFX_MPI
1256  CALL mpl_message (cdmessage= &
1257  & 'ABORT BECAUSE LD_ABORT_IFNOT_REPROD WAS SET', &
1258  & cdstring='ORDER_INDEP_DOT_PRODUCT',ldabort=.true.)
1259 #endif
1260  ENDIF
1261  ENDIF
1262 
1263  zolderr = zerr
1264 
1265  ll_first_iter = .false.
1266 ENDDO k_loop
1267 
1268 !--- At this stage, we have guaranteed that ZRES less than 4*EPS
1269 !--- away from the exact sum. There are only four floating point
1270 !--- numbers in this range. So, if we find the nearest number that
1271 !--- has its last three bits zero, then we have a reproducible result.
1272 
1274 
1275 DEALLOCATE (irecvcounts)
1276 DEALLOCATE (zpcors)
1277 DEALLOCATE (zperrs)
1278 DEALLOCATE (zpsums)
1279 DEALLOCATE (zbuffg)
1280 DEALLOCATE (zp)
1281 
1282 IF (lhook) CALL dr_hook ( &
1283  &'ORDER_INDEPENDENT_SUMMATION_MOD:ORDER_INDEP_DOT_PRODUCT', &
1284  & 1,zhook_handle)
1285 
1286 END FUNCTION order_indep_dot_product
1287 
1288 FUNCTION round (PRES)
1290 !-----------------------------------------------------------------
1291 !
1292 ! Returns the value of PRES rounded to the nearest floating-point
1293 ! number that has its last three bits zero
1294 
1295 ! The code to do this in Fortran is not nice, because Fortran
1296 ! does not proved access to the binary representation for REALs.
1297 ! Perhaps we should code it in c?
1298 
1299 ! This works on big-endian and little-endian machines.
1300 
1301 ! Author: Mike Fisher ECMWF 2006/02/08
1302 !
1303 !-----------------------------------------------------------------
1304 
1305 IMPLICIT NONE
1306 
1307 REAL(KIND=JPRB), INTENT(IN) :: PRES
1308 REAL(KIND=JPRB) :: ROUND
1309 
1310 INTEGER(KIND=JPIM) :: II(2),IEQUIV(8),INTS_PER_REAL,J,I_LOW_WORD
1311 REAL(KIND=JPRB) :: ZZ(2),ZUP,ZDOWN
1312 
1313 INTEGER(KIND=JPIM), EXTERNAL :: N_PRECISION
1314 
1315 
1316 ii(:)=1
1317 zz(:)=1.0_jprb
1318 ints_per_real=n_precision(zz)/n_precision(ii)
1319 
1320 IF (ints_per_real>SIZE(iequiv)) THEN
1321 #ifdef SFX_MPI
1322  CALL mpl_message (cdmessage='INTS_PER_REAL>SIZE(IEQUIV)', &
1323  & cdstring='ORDER_INDEP_GLOBAL_SUM',ldabort=.true.)
1324 #endif
1325 ENDIF
1326 
1327 !--- Test whether big-endian or little-endian
1328 
1329 zup = -1.0_jprb
1330 iequiv(1:ints_per_real) = transfer(zup,iequiv(1:ints_per_real))
1331 
1332 IF (iequiv(1)==0) THEN
1333  i_low_word = 1 ! Little-endian
1334 ELSE
1335  i_low_word = ints_per_real ! Big-endian
1336 ENDIF
1337 
1338 !--- Find the nearest number with all 3 lowest-order bits zeroed
1339 
1340 iequiv(1:ints_per_real) = transfer(pres,iequiv(1:ints_per_real))
1341 zup = pres
1342 zdown = pres
1343 
1344 IF (ibits(iequiv(i_low_word),0,3)/=0) THEN
1345  DO j=1,4
1346  zup=nearest(zup,1.0_jprb)
1347  iequiv(1:ints_per_real) = transfer(zup,iequiv(1:ints_per_real))
1348  IF (ibits(iequiv(i_low_word),0,3)==0) EXIT
1349 
1350  zdown=nearest(zdown,-1.0_jprb)
1351  iequiv(1:ints_per_real) = transfer(zdown,iequiv(1:ints_per_real))
1352  IF (ibits(iequiv(i_low_word),0,3)==0) EXIT
1353  ENDDO
1354 
1355  IF (ibits(iequiv(i_low_word),0,3)/=0) THEN
1356 #ifdef SFX_MPI
1357  CALL mpl_message (cdmessage='THIS IS NOT POSSIBLE', &
1358  & cdstring='ORDER_INDEP_GLOBAL_SUM',ldabort=.true.)
1359 #endif
1360  ENDIF
1361 ENDIF
1362 
1363 round = transfer(iequiv(1:ints_per_real),pres)
1364 
1365 END FUNCTION round
1366 
integer, parameter jpim
Definition: parkind1.F90:13
integer, parameter jprb
Definition: parkind1.F90:32
intent(out) overrides sub arrays one Sort by the least significant key first sum(iindex(1:n))
logical lhook
Definition: yomhook.F90:15