SURFEX v8.1
General documentation of Surfex
compensated_summation_mod.F90
Go to the documentation of this file.
2 USE parkind1, ONLY : jprb
3 USE yomhook , ONLY : lhook, dr_hook
4 
5 !**** COMPENSATED_SUMMATION_MOD
6 ! Purpose.
7 ! --------
8 ! Functions to perform compensated (i.e. accurate) summation.
9 ! These functions are used by ORDER_INDEPENDENT_SUMMATION_MOD
10 
11 !** Interface.
12 ! ----------
13 
14 ! CALL COMPENSATED_SUM (P,KN,PCORR,PERR)
15 ! CALL COMPENSATED_SUM_OMP (P,KN,PCORR,PERR)
16 !
17 ! These routines transform the elements of the array P, such that:
18 !
19 ! 1) p(kn) contains sum(p)
20 !
21 ! 2) p(1)...p(kn-1) contain the rounding errors that were made
22 ! in calculating sum(p).
23 ! 3) The exact sum of the elements of p is unmodified.
24 !
25 ! On return, pcorr contains the sum of the rounding errors, perr
26 ! contains the sum of their absolute values.
27 !
28 ! After calling this routine, an accurate sum of the elements of p
29 ! can be calculated as res=p(n)+pcorr.
30 !
31 ! CALL COMPENSATED_SUM_OMP is an OpenMP-parallelized version of
32 ! CALL COMPENSATED_SUM.
33 
34 
35 ! CALL COMPENSATED_DOT_PRODUCT (P1,P2, POUT,PCORR,PERR)
36 ! CALL COMPENSATED_DOT_PRODUCT (P1,P2,PW,POUT,PCORR,PERR)
37 ! CALL COMPENSATED_DOT_PRODUCT_OMP (P1,P2, POUT,PCORR,PERR)
38 ! CALL COMPENSATED_DOT_PRODUCT_OMP (P1,P2,PW,POUT,PCORR,PERR)
39 !
40 ! These routines are variations on COMPENSATED_SUM that are
41 ! provided simply to reduce memory-access time.
42 
43 ! CALL COMPENSATED_DOT_PRODUCT (P1,P2,POUT,PCORR,PERR)
44 !
45 ! is functionally equivalent to the following:
46 !
47 ! POUT(:) = P1(:)*P2(:)
48 ! CALL CALL COMPENSATED_SUM (POUT,KN,PCORR,PERR)
49 
50 ! CALL COMPENSATED_DOT_PRODUCT_OMP (P1,P2,POUT,PCORR,PERR)
51 !
52 ! is functionally equivalent to the following:
53 !
54 ! POUT(:) = P1(:)*P2(:)
55 ! CALL CALL COMPENSATED_SUM_OMP (POUT,KN,PCORR,PERR)
56 
57 ! CALL COMPENSATED_DOT_PRODUCT (P1,P2,PW,POUT,PCORR,PERR)
58 !
59 ! is functionlly equivalent to the following:
60 !
61 ! POUT(:) = P1(:)*P2(:)*PW(:)
62 ! CALL CALL COMPENSATED_SUM (POUT,KN,PCORR,PERR)
63 
64 ! CALL COMPENSATED_DOT_PRODUCT_OMP (P1,P2,PW,POUT,PCORR,PERR)
65 !
66 ! is functionlly equivalent to the following:
67 !
68 ! POUT(:) = P1(:)*P2(:)*PW(:)
69 ! CALL CALL COMPENSATED_SUM_OMP (POUT,KN,PCORR,PERR)
70 
71 !** Algorithm
72 ! ---------
73 
74 ! The algorithm is based on Ogita et al. (2005) SIAM J. Sci. Computing,
75 ! Vol.26, No.6, pp1955-1988. This is based in turn on an algorithm
76 ! by Knuth (1969, seminumerical algorithms).
77 !
78 ! The basic idea is that we can transform a pair of floating-point
79 ! numbers "a" and "b" into a new pair "x" and "y", such that:
80 !
81 ! x = add(a,b) and x+y = a+b
82 !
83 ! where "add" denotes floating-point addition, and "+" denotes
84 ! exact, mathematical (i.e. infinite-precision) addition.
85 !
86 ! Applying this to an array, p, we can transform the array such that:
87 !
88 ! p(kn) := add(p(1),p(2),...,p(kn))
89 ! and p(1) + p(2) + ... + p(kn) is unchanged.
90 
91 
92 ! Author.
93 ! -------
94 ! Mike Fisher ECMWF
95 
96 ! Modifications.
97 ! --------------
98 ! Original: 2006-20-22
99 
100 ! ------------------------------------------------------------------
101 
102 USE parkind1 ,ONLY : jpim ,jprb
103 
104 SAVE
105 PRIVATE
106 PUBLIC compensated_sum, &
110 
112  MODULE PROCEDURE compensated_sum
113 END INTERFACE compensated_sum
114 
116  MODULE PROCEDURE compensated_sum_omp
117 END INTERFACE compensated_sum_omp
118 
120  MODULE PROCEDURE compensated_dot_product
121 END INTERFACE compensated_dot_product
122 
124  MODULE PROCEDURE compensated_dot_product_omp
125 END INTERFACE compensated_dot_product_omp
126 
127 CONTAINS
128 
129 SUBROUTINE compensated_sum (P,KN,PCORR,PERR)
130  IMPLICIT NONE
131 
132  INTEGER(KIND=JPIM), INTENT(IN) :: KN
133  REAL(KIND=JPRB), INTENT(INOUT) :: P(kn)
134  REAL(KIND=JPRB), INTENT(OUT) :: PCORR, PERR
135 
136  REAL(KIND=JPRB) :: ZX,ZZ,ZPSUM
137  INTEGER(KIND=JPIM) :: J
138 
139  REAL(KIND=JPRB) :: ZHOOK_HANDLE
140  IF (lhook) CALL dr_hook('COMPENSATED_SUMMATION_MOD:COMPENSATED_SUM',0,zhook_handle)
141  pcorr = 0.0
142  perr = 0.0
143 
144  zpsum = p(1)
145  DO j=2,kn
146 !--- It is vital that these 4 lines are not optimized in any way that
147 !--- changes the results.
148  zx = p(j) + zpsum
149  zz = zx - p(j)
150  p(j-1) = (p(j)-(zx-zz)) + (zpsum-zz)
151  zpsum = zx
152 !--- accumulate the correction and the error
153  pcorr = pcorr + p(j-1)
154  perr = perr + abs(p(j-1))
155  ENDDO
156  p(kn) = zpsum
157 
158 !-----------------------------------------------------------------
159 ! Vectorization
160 ! -------------
161 !
162 ! NB: As coded, the above loop may not run very well on a vector
163 ! computer. However, any loop ordering that preserves the exact
164 ! sum, and ends up with the floating-point sum in the last
165 ! element and rounding errors in the rest of the array,
166 ! should be OK. For example:
167 !
168 ! ILEN=KN
169 ! DO
170 ! IHALF=ILEN/2
171 ! !--- no vector dependency
172 ! DO J=1,IHALF
173 !--- It is vital that these 4 lines are not optimized in any way that
174 !--- changes the results.
175 ! ZX = P(KN-IHALF+J) + P(KN-ILEN+J)
176 ! ZZ = ZX - P(KN-IHALF+J)
177 ! P(KN-ILEN+J) = (P(KN-IHALF+J)-(ZX-ZZ)) + (P(KN-ILEN+J)-ZZ)
178 ! P(KN-IHALF+J) = ZX
179 !--- accumulate the correction and the error
180 ! PCORR = PCORR + P(KN-ILEN+J)
181 ! PERR = PERR + ABS(P(KN-ILEN+J))
182 ! ENDDO
183 ! ILEN=ILEN-IHALF
184 ! IF (ILEN<=1) EXIT
185 ! ENDDO
186 !-----------------------------------------------------------------
187 
188 IF (lhook) CALL dr_hook('COMPENSATED_SUMMATION_MOD:COMPENSATED_SUM',1,zhook_handle)
189 END SUBROUTINE compensated_sum
190 
191 SUBROUTINE compensated_sum_omp (P,KN,PCORR,PERR)
193 
194  IMPLICIT NONE
195 
196  INTEGER(KIND=JPIM), INTENT(IN) :: KN
197  REAL(KIND=JPRB), INTENT(INOUT) :: P(kn)
198  REAL(KIND=JPRB), INTENT(OUT) :: PCORR, PERR
199 
200  REAL(KIND=JPRB), ALLOCATABLE :: ZERRS(:),ZCORS(:)
201  REAL(KIND=JPRB) :: ZX,ZZ
202  INTEGER(KIND=JPIM) :: J,JCHUNK,ILEN,INCHUNKS,IMINLEN,ILENCHUNK, &
203  & INTHREADS,I,ISTART,IEND
204 
205 !--- IMINLEN is a tunable parameter. It represents the vector length
206 !--- below which there is too little work to make it worth spawning threads
207 
208  REAL(KIND=JPRB) :: ZHOOK_HANDLE
209  IF (lhook) CALL dr_hook('COMPENSATED_SUMMATION_MOD:COMPENSATED_SUM_OMP',0,zhook_handle)
210  iminlen=1000 !-- this value is pure guesswork (Mike Fisher)
211 
212  inthreads = oml_max_threads()
213 
214  ilenchunk = max(iminlen,(kn+inthreads-1)/inthreads)
215  inchunks=1+(kn-1)/ilenchunk
216 
217  ALLOCATE(zerrs(inchunks))
218  ALLOCATE(zcors(inchunks))
219 
220 !--- First, we split the array into chunks, and apply compensated_sum
221 !--- to each chunk independently.
222 
223 !$OMP PARALLEL DO PRIVATE(ISTART,IEND), SCHEDULE(STATIC), IF(INCHUNKS>1)
224  DO jchunk=1,inchunks
225  istart = 1+(jchunk-1)*ilenchunk
226  iend = min(jchunk*ilenchunk,kn)
227  CALL compensated_sum (p(istart:iend),1+iend-istart, &
228  & zcors(jchunk), zerrs(jchunk))
229  ENDDO
230 !$OMP END PARALLEL DO
231 
232  pcorr = sum(zcors)
233  perr = sum(zerrs)
234 
235 !--- The final element of each chunk contains a partial sum. We apply
236 !--- compensated summation to the vector of the final elements.
237 
238  DO jchunk=2,inchunks
239  i = min(jchunk*ilenchunk,kn)
240  ilen = i - (jchunk-1)*ilenchunk
241 !--- It is vital that these 4 lines are not optimized
242  zx = p(i) + p(i-ilen)
243  zz = zx - p(i)
244  p(i-ilen) = (p(i)-(zx-zz)) + (p(i-ilen)-zz)
245  p(i) = zx
246 !--- accumulate the result and its error bound
247  pcorr = pcorr + p(i-ilen)
248  perr = perr + abs(p(i-ilen))
249  ENDDO
250 
251  DEALLOCATE(zerrs)
252  DEALLOCATE(zcors)
253 IF (lhook) CALL dr_hook('COMPENSATED_SUMMATION_MOD:COMPENSATED_SUM_OMP',1,zhook_handle)
254 END SUBROUTINE compensated_sum_omp
255 
256 SUBROUTINE compensated_dot_product (P1,P2,PW,POUT,KN,PCORR,PERR)
257  IMPLICIT NONE
258 
259  INTEGER(KIND=JPIM), INTENT(IN) :: KN
260  REAL(KIND=JPRB), INTENT(IN) :: P1(kn), P2(kn)
261  REAL(KIND=JPRB), INTENT(IN), OPTIONAL :: PW(kn)
262  REAL(KIND=JPRB), INTENT(OUT) :: POUT(kn)
263  REAL(KIND=JPRB), INTENT(OUT) :: PCORR, PERR
264 
265  REAL(KIND=JPRB) :: ZX,ZZ,ZPJ,ZPSUM
266  INTEGER(KIND=JPIM) :: J
267 !==================================================================
268 !INTEGER(KIND=JPIM) :: ILEN, IHALF
269 !==================================================================
270 
271  REAL(KIND=JPRB) :: ZHOOK_HANDLE
272  IF (lhook) CALL dr_hook('COMPENSATED_SUMMATION_MOD:COMPENSATED_DOT_PRODUCT',0,zhook_handle)
273  pcorr = 0.0
274  perr = 0.0
275 
276  IF (PRESENT(pw)) THEN
277  zpsum = p1(1)*p2(1)*pw(1)
278  ELSE
279  zpsum = p1(1)*p2(1)
280  ENDIF
281 
282  DO j=2,kn
283  IF (PRESENT(pw)) THEN
284  zpj = p1(j)*p2(j)*pw(j)
285  ELSE
286  zpj = p1(j)*p2(j)
287  ENDIF
288 !--- It is vital that these 4 lines are not optimized in any way that
289 !--- changes the results.
290  zx = zpj + zpsum
291  zz = zx - zpj
292  pout(j-1) = (zpj-(zx-zz)) + (zpsum-zz)
293  zpsum = zx
294 !--- accumulate the correction and the error
295  pcorr = pcorr + pout(j-1)
296  perr = perr + abs(pout(j-1))
297  ENDDO
298  pout(kn) = zpsum
299 
300 !==================================================================
301 !== vectorized version
302 ! DO J=1,KN
303 ! IF (PRESENT(PW)) THEN
304 ! POUT(J) = P1(J)*P2(J)*PW(J)
305 ! ELSE
306 ! POUT(J) = P1(J)*P2(J)
307 ! ENDIF
308 ! ENDDO
309 !
310 ! ILEN=KN
311 ! DO
312 ! IHALF=ILEN/2
313 ! !--- no vector dependency
314 ! DO J=1,IHALF
315 !!--- It is vital that these 4 lines are not optimized in any way that
316 !!--- changes the results.
317 ! ZX = POUT(KN-IHALF+J) + POUT(KN-ILEN+J)
318 ! ZZ = ZX - POUT(KN-IHALF+J)
319 ! POUT(KN-ILEN+J) = (POUT(KN-IHALF+J)-(ZX-ZZ)) + (POUT(KN-ILEN+J)-ZZ)
320 ! POUT(KN-IHALF+J) = ZX
321 !!--- accumulate the correction and the error
322 ! PCORR = PCORR + POUT(KN-ILEN+J)
323 ! PERR = PERR + ABS(POUT(KN-ILEN+J))
324 ! ENDDO
325 ! ILEN=ILEN-IHALF
326 ! IF (ILEN<=1) EXIT
327 ! ENDDO
328 !==================================================================
329 IF (lhook) CALL dr_hook('COMPENSATED_SUMMATION_MOD:COMPENSATED_DOT_PRODUCT',1,zhook_handle)
330 END SUBROUTINE compensated_dot_product
331 
332 SUBROUTINE compensated_dot_product_omp (P1,P2,PW,POUT,KN,PCORR,PERR)
334 
335  IMPLICIT NONE
336 
337  INTEGER(KIND=JPIM), INTENT(IN) :: KN
338  REAL(KIND=JPRB), INTENT(IN) :: P1(kn), P2(kn)
339  REAL(KIND=JPRB), INTENT(IN), OPTIONAL :: PW(kn)
340  REAL(KIND=JPRB), INTENT(OUT) :: POUT(kn)
341  REAL(KIND=JPRB), INTENT(OUT) :: PCORR, PERR
342 
343  REAL(KIND=JPRB), ALLOCATABLE :: ZERRS(:),ZCORS(:)
344  REAL(KIND=JPRB) :: ZX,ZZ
345  INTEGER(KIND=JPIM) :: J,JCHUNK,ILEN,INCHUNKS,IMINLEN,ILENCHUNK, &
346  & INTHREADS,I,ISTART,IEND
347 
348 !--- IMINLEN is a tunable parameter. It represents the vector length
349 !--- below which there is too little work to make it worth spawning threads
350 
351  REAL(KIND=JPRB) :: ZHOOK_HANDLE
352  IF (lhook) CALL dr_hook('COMPENSATED_SUMMATION_MOD:COMPENSATED_DOT_PRODUCT_OMP',0,zhook_handle)
353  iminlen=1000 !-- this value is pure guesswork (Mike Fisher)
354 
355  inthreads = oml_max_threads()
356 
357  ilenchunk = max(iminlen,(kn+inthreads-1)/inthreads)
358  inchunks=1+(kn-1)/ilenchunk
359 
360  ALLOCATE(zerrs(inchunks))
361  ALLOCATE(zcors(inchunks))
362 
363 !--- First, we split the array into chunks, and apply compensated_sum
364 !--- to each chunk independently.
365 
366  IF (PRESENT(pw)) THEN
367 !$OMP PARALLEL DO PRIVATE(ISTART,IEND), SCHEDULE(STATIC), IF(INCHUNKS>1)
368  DO jchunk=1,inchunks
369  istart = 1+(jchunk-1)*ilenchunk
370  iend = min(jchunk*ilenchunk,kn)
371  CALL compensated_dot_product (p1(istart:iend),p2(istart:iend), &
372  & pw(istart:iend),pout(istart:iend), &
373  & 1+iend-istart, &
374  & zcors(jchunk), zerrs(jchunk))
375  ENDDO
376 !$OMP END PARALLEL DO
377  ELSE
378 !$OMP PARALLEL DO PRIVATE(ISTART,IEND), SCHEDULE(STATIC), IF(INCHUNKS>1)
379  DO jchunk=1,inchunks
380  istart = 1+(jchunk-1)*ilenchunk
381  iend = min(jchunk*ilenchunk,kn)
382  CALL compensated_dot_product (p1=p1(istart:iend),p2=p2(istart:iend), &
383  & pout=pout(istart:iend), &
384  & kn=1+iend-istart, &
385  & pcorr=zcors(jchunk), perr=zerrs(jchunk))
386  ENDDO
387 !$OMP END PARALLEL DO
388  ENDIF
389 
390  pcorr = sum(zcors)
391  perr = sum(zerrs)
392 
393 !--- The final element of each chunk contains a partial sum. We apply
394 !--- compensated summation to the vector of the final elements.
395 
396  DO jchunk=2,inchunks
397  i = min(jchunk*ilenchunk,kn)
398  ilen = i - (jchunk-1)*ilenchunk
399 !--- It is vital that these 4 lines are not optimized
400  zx = pout(i) + pout(i-ilen)
401  zz = zx - pout(i)
402  pout(i-ilen) = (pout(i)-(zx-zz)) + (pout(i-ilen)-zz)
403  pout(i) = zx
404 !--- accumulate the result and its error bound
405  pcorr = pcorr + pout(i-ilen)
406  perr = perr + abs(pout(i-ilen))
407  ENDDO
408 
409  DEALLOCATE(zerrs)
410  DEALLOCATE(zcors)
411 IF (lhook) CALL dr_hook('COMPENSATED_SUMMATION_MOD:COMPENSATED_DOT_PRODUCT_OMP',1,zhook_handle)
412 END SUBROUTINE compensated_dot_product_omp
413 
414 END MODULE compensated_summation_mod
integer, parameter jpim
Definition: parkind1.F90:13
integer, parameter jprb
Definition: parkind1.F90:32
intent(out) overrides sub arrays one Sort by the least significant key first sum(iindex(1:n))
logical lhook
Definition: yomhook.F90:15
integer(kind=jpim) function, public oml_max_threads()
Definition: oml_mod.F90:256