GADGET-4
pm_mpi_fft.cc
Go to the documentation of this file.
1/*******************************************************************************
2 * \copyright This file is part of the GADGET4 N-body/SPH code developed
3 * \copyright by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
4 * \copyright (vspringel@mpa-garching.mpg.de) and all contributing authors.
5 *******************************************************************************/
6
12#include "gadgetconfig.h"
13
14#if defined(PMGRID) || defined(NGENIC)
15
16#include <fftw3.h>
17#include <math.h>
18#include <mpi.h>
19#include <stdio.h>
20#include <stdlib.h>
21#include <string.h>
22#include <algorithm>
23
24#include "../data/allvars.h"
25#include "../data/dtypes.h"
26#include "../data/mymalloc.h"
27#include "../main/simulation.h"
28#include "../mpi_utils/mpi_utils.h"
29#include "../pm/pm.h"
30#include "../system/system.h"
31
32/* We only use the one-dimensional FFTW3 routines, because the MPI versions of FFTW3
33 * allocated memory for themselves during the transforms (which we want to strictly avoid),
34 * and because we want to allow transforms that are so big that more than 2GB may be
35 * transferred betweeen processors.
36 */
37
38#ifndef FFT_COLUMN_BASED
39
40void pm_mpi_fft::my_slab_based_fft_init(fft_plan *plan, int NgridX, int NgridY, int NgridZ)
41{
42 subdivide_evenly(NgridX, NTask, ThisTask, &plan->slabstart_x, &plan->nslab_x);
43 subdivide_evenly(NgridY, NTask, ThisTask, &plan->slabstart_y, &plan->nslab_y);
44
45 plan->slab_to_task = (int *)Mem.mymalloc_movable(&plan->slab_to_task, "slab_to_task", NgridX * sizeof(int));
46
47 for(int task = 0; task < NTask; task++)
48 {
49 int start, n;
50
51 subdivide_evenly(NgridX, NTask, task, &start, &n);
52
53 for(int i = start; i < start + n; i++)
54 plan->slab_to_task[i] = task;
55 }
56
57 MPI_Allreduce(&plan->nslab_x, &plan->largest_x_slab, 1, MPI_INT, MPI_MAX, Communicator);
58 MPI_Allreduce(&plan->nslab_y, &plan->largest_y_slab, 1, MPI_INT, MPI_MAX, Communicator);
59
60 plan->slabs_x_per_task = (int *)Mem.mymalloc_movable(&plan->slabs_x_per_task, "slabs_x_per_task", NTask * sizeof(int));
61 MPI_Allgather(&plan->nslab_x, 1, MPI_INT, plan->slabs_x_per_task, 1, MPI_INT, Communicator);
62
63 plan->first_slab_x_of_task = (int *)Mem.mymalloc_movable(&plan->first_slab_x_of_task, "first_slab_x_of_task", NTask * sizeof(int));
64 MPI_Allgather(&plan->slabstart_x, 1, MPI_INT, plan->first_slab_x_of_task, 1, MPI_INT, Communicator);
65
66 plan->slabs_y_per_task = (int *)Mem.mymalloc_movable(&plan->slabs_y_per_task, "slabs_y_per_task", NTask * sizeof(int));
67 MPI_Allgather(&plan->nslab_y, 1, MPI_INT, plan->slabs_y_per_task, 1, MPI_INT, Communicator);
68
69 plan->first_slab_y_of_task = (int *)Mem.mymalloc_movable(&plan->first_slab_y_of_task, "first_slab_y_of_task", NTask * sizeof(int));
70 MPI_Allgather(&plan->slabstart_y, 1, MPI_INT, plan->first_slab_y_of_task, 1, MPI_INT, Communicator);
71
72 plan->NgridX = NgridX;
73 plan->NgridY = NgridY;
74 plan->NgridZ = NgridZ;
75
76 int Ngridz = NgridZ / 2 + 1; /* dimension needed in complex space */
77
78 plan->Ngridz = Ngridz;
79 plan->Ngrid2 = 2 * Ngridz;
80}
81
82void pm_mpi_fft::my_slab_based_fft_free(fft_plan *plan)
83{
84 Mem.myfree(plan->first_slab_y_of_task);
85 Mem.myfree(plan->slabs_y_per_task);
86 Mem.myfree(plan->first_slab_x_of_task);
87 Mem.myfree(plan->slabs_x_per_task);
88 Mem.myfree(plan->slab_to_task);
89}
90
101void pm_mpi_fft::my_slab_transposeA(fft_plan *plan, fft_real *field, fft_real *scratch)
102{
103 int n, prod, task, flag_big = 0, flag_big_all = 0;
104
105 prod = NTask * plan->nslab_x;
106
107 for(n = 0; n < prod; n++)
108 {
109 int x = n / NTask;
110 int task = n % NTask;
111
112 int y;
113
114 for(y = plan->first_slab_y_of_task[task]; y < plan->first_slab_y_of_task[task] + plan->slabs_y_per_task[task]; y++)
115 memcpy(scratch + ((size_t)plan->NgridZ) * (plan->first_slab_y_of_task[task] * plan->nslab_x +
116 x * plan->slabs_y_per_task[task] + (y - plan->first_slab_y_of_task[task])),
117 field + ((size_t)plan->Ngrid2) * (plan->NgridY * x + y), plan->NgridZ * sizeof(fft_real));
118 }
119
120 size_t *scount = (size_t *)Mem.mymalloc("scount", NTask * sizeof(size_t));
121 size_t *rcount = (size_t *)Mem.mymalloc("rcount", NTask * sizeof(size_t));
122 size_t *soff = (size_t *)Mem.mymalloc("soff", NTask * sizeof(size_t));
123 size_t *roff = (size_t *)Mem.mymalloc("roff", NTask * sizeof(size_t));
124
125 for(task = 0; task < NTask; task++)
126 {
127 scount[task] = plan->nslab_x * plan->slabs_y_per_task[task] * (plan->NgridZ * sizeof(fft_real));
128 rcount[task] = plan->nslab_y * plan->slabs_x_per_task[task] * (plan->NgridZ * sizeof(fft_real));
129
130 soff[task] = plan->first_slab_y_of_task[task] * plan->nslab_x * (plan->NgridZ * sizeof(fft_real));
131 roff[task] = plan->first_slab_x_of_task[task] * plan->nslab_y * (plan->NgridZ * sizeof(fft_real));
132
133 if(scount[task] > MPI_MESSAGE_SIZELIMIT_IN_BYTES)
134 flag_big = 1;
135 }
136
137 MPI_Allreduce(&flag_big, &flag_big_all, 1, MPI_INT, MPI_MAX, Communicator);
138
139 myMPI_Alltoallv(scratch, scount, soff, field, rcount, roff, 1, flag_big_all, Communicator);
140
141 Mem.myfree(roff);
142 Mem.myfree(soff);
143 Mem.myfree(rcount);
144 Mem.myfree(scount);
145}
146
156void pm_mpi_fft::my_slab_transposeB(fft_plan *plan, fft_real *field, fft_real *scratch)
157{
158 int n, prod, task, flag_big = 0, flag_big_all = 0;
159
160 size_t *scount = (size_t *)Mem.mymalloc("scount", NTask * sizeof(size_t));
161 size_t *rcount = (size_t *)Mem.mymalloc("rcount", NTask * sizeof(size_t));
162 size_t *soff = (size_t *)Mem.mymalloc("soff", NTask * sizeof(size_t));
163 size_t *roff = (size_t *)Mem.mymalloc("roff", NTask * sizeof(size_t));
164
165 for(task = 0; task < NTask; task++)
166 {
167 rcount[task] = plan->nslab_x * plan->slabs_y_per_task[task] * (plan->NgridZ * sizeof(fft_real));
168 scount[task] = plan->nslab_y * plan->slabs_x_per_task[task] * (plan->NgridZ * sizeof(fft_real));
169
170 roff[task] = plan->first_slab_y_of_task[task] * plan->nslab_x * (plan->NgridZ * sizeof(fft_real));
171 soff[task] = plan->first_slab_x_of_task[task] * plan->nslab_y * (plan->NgridZ * sizeof(fft_real));
172
173 if(scount[task] > MPI_MESSAGE_SIZELIMIT_IN_BYTES)
174 flag_big = 1;
175 }
176
177 MPI_Allreduce(&flag_big, &flag_big_all, 1, MPI_INT, MPI_MAX, Communicator);
178
179 myMPI_Alltoallv(field, scount, soff, scratch, rcount, roff, 1, flag_big_all, Communicator);
180
181 Mem.myfree(roff);
182 Mem.myfree(soff);
183 Mem.myfree(rcount);
184 Mem.myfree(scount);
185
186 prod = NTask * plan->nslab_x;
187
188 for(n = 0; n < prod; n++)
189 {
190 int x = n / NTask;
191 int task = n % NTask;
192
193 int y;
194 for(y = plan->first_slab_y_of_task[task]; y < plan->first_slab_y_of_task[task] + plan->slabs_y_per_task[task]; y++)
195 memcpy(field + ((size_t)plan->Ngrid2) * (plan->NgridY * x + y),
196 scratch + ((size_t)plan->NgridZ) * (plan->first_slab_y_of_task[task] * plan->nslab_x +
197 x * plan->slabs_y_per_task[task] + (y - plan->first_slab_y_of_task[task])),
198 plan->NgridZ * sizeof(fft_real));
199 }
200}
201
202/* Given a slab-decomposed 3D field a[...] with total dimension [nx x ny x nz], whose first dimension is
203 * split across the processors, this routine outputs in b[] the transpose where then the second dimension is split
204 * across the processors. sx[] gives for each MPI task how many slabs it has, and firstx[] is the first
205 * slab for a given task. Likewise, sy[]/firsty[] gives the same thing for the transposed order. Note, the
206 * contents of the array a[] will be destroyed by the routine.
207 *
208 * An element (x,y,z) is accessed in a[] with index [([x - firstx] * ny + y) * nz + z]
209 * and in b[] as [((y - firsty) * nx + x) * nz + z]
210 *
211 * if mode = 1, the reverse operation is carried out.
212 */
213void pm_mpi_fft::my_slab_transpose(void *av, void *bv, int *sx, int *firstx, int *sy, int *firsty, int nx, int ny, int nz, int mode)
214{
215 char *a = (char *)av;
216 char *b = (char *)bv;
217
218 size_t *scount = (size_t *)Mem.mymalloc("scount", NTask * sizeof(size_t));
219 size_t *rcount = (size_t *)Mem.mymalloc("rcount", NTask * sizeof(size_t));
220 size_t *soff = (size_t *)Mem.mymalloc("soff", NTask * sizeof(size_t));
221 size_t *roff = (size_t *)Mem.mymalloc("roff", NTask * sizeof(size_t));
222 int i, n, prod, flag_big = 0, flag_big_all = 0;
223
224 for(i = 0; i < NTask; i++)
225 {
226 scount[i] = sy[i] * sx[ThisTask] * ((size_t)nz);
227 rcount[i] = sy[ThisTask] * sx[i] * ((size_t)nz);
228 soff[i] = firsty[i] * sx[ThisTask] * ((size_t)nz);
229 roff[i] = sy[ThisTask] * firstx[i] * ((size_t)nz);
230
231 if(scount[i] * sizeof(fft_complex) > MPI_MESSAGE_SIZELIMIT_IN_BYTES)
232 flag_big = 1;
233 }
234
235 /* produce a flag if any of the send sizes is above our transfer limit, in this case we will
236 * transfer the data in chunks.
237 */
238 MPI_Allreduce(&flag_big, &flag_big_all, 1, MPI_INT, MPI_MAX, Communicator);
239
240 if(mode == 0)
241 {
242 /* first pack the data into contiguous blocks */
243 prod = NTask * sx[ThisTask];
244 for(n = 0; n < prod; n++)
245 {
246 int k = n / NTask;
247 int i = n % NTask;
248 int j;
249
250 for(j = 0; j < sy[i]; j++)
251 memcpy(b + (k * sy[i] + j + firsty[i] * sx[ThisTask]) * (nz * sizeof(fft_complex)),
252 a + (k * ny + (firsty[i] + j)) * (nz * sizeof(fft_complex)), nz * sizeof(fft_complex));
253 }
254
255 /* tranfer the data */
256 myMPI_Alltoallv(b, scount, soff, a, rcount, roff, sizeof(fft_complex), flag_big_all, Communicator);
257
258 /* unpack the data into the right order */
259 prod = NTask * sy[ThisTask];
260 for(n = 0; n < prod; n++)
261 {
262 int j = n / NTask;
263 int i = n % NTask;
264 int k;
265
266 for(k = 0; k < sx[i]; k++)
267 memcpy(b + (j * nx + k + firstx[i]) * (nz * sizeof(fft_complex)),
268 a + ((k + firstx[i]) * sy[ThisTask] + j) * (nz * sizeof(fft_complex)), nz * sizeof(fft_complex));
269 }
270 }
271 else
272 {
273 /* first pack the data into contiguous blocks */
274 prod = NTask * sy[ThisTask];
275 for(n = 0; n < prod; n++)
276 {
277 int j = n / NTask;
278 int i = n % NTask;
279 int k;
280
281 for(k = 0; k < sx[i]; k++)
282 memcpy(b + ((k + firstx[i]) * sy[ThisTask] + j) * (nz * sizeof(fft_complex)),
283 a + (j * nx + k + firstx[i]) * (nz * sizeof(fft_complex)), nz * sizeof(fft_complex));
284 }
285
286 /* tranfer the data */
287 myMPI_Alltoallv(b, rcount, roff, a, scount, soff, sizeof(fft_complex), flag_big_all, Communicator);
288
289 /* unpack the data into the right order */
290 prod = NTask * sx[ThisTask];
291 for(n = 0; n < prod; n++)
292 {
293 int k = n / NTask;
294 int i = n % NTask;
295 int j;
296
297 for(j = 0; j < sy[i]; j++)
298 memcpy(b + (k * ny + (firsty[i] + j)) * (nz * sizeof(fft_complex)),
299 a + (k * sy[i] + j + firsty[i] * sx[ThisTask]) * (nz * sizeof(fft_complex)), nz * sizeof(fft_complex));
300 }
301 }
302
303 /* now the result is in b[] */
304
305 Mem.myfree(roff);
306 Mem.myfree(soff);
307 Mem.myfree(rcount);
308 Mem.myfree(scount);
309}
310
311void pm_mpi_fft::my_slab_based_fft(fft_plan *plan, void *data, void *workspace, int forward)
312{
313 int n, prod;
314 int slabsx = plan->slabs_x_per_task[ThisTask];
315 int slabsy = plan->slabs_y_per_task[ThisTask];
316
317 int ngridx = plan->NgridX;
318 int ngridy = plan->NgridY;
319 int ngridz = plan->Ngridz;
320 int ngridz2 = 2 * ngridz;
321
322 size_t ngridx_long = ngridx;
323 size_t ngridy_long = ngridy;
324 size_t ngridz_long = ngridz;
325 size_t ngridz2_long = ngridz2;
326
327 fft_real *data_real = (fft_real *)data;
328 fft_complex *data_complex = (fft_complex *)data, *workspace_complex = (fft_complex *)workspace;
329
330 if(forward == 1)
331 {
332 /* do the z-direction FFT, real to complex */
333 prod = slabsx * ngridy;
334 for(n = 0; n < prod; n++)
335 {
336 FFTW(execute_dft_r2c)(plan->forward_plan_zdir, data_real + n * ngridz2_long, workspace_complex + n * ngridz_long);
337 }
338
339 /* do the y-direction FFT, complex to complex */
340 prod = slabsx * ngridz;
341 for(n = 0; n < prod; n++)
342 {
343 int i = n / ngridz;
344 int j = n % ngridz;
345
346 FFTW(execute_dft)
347 (plan->forward_plan_ydir, workspace_complex + i * ngridz * ngridy_long + j, data_complex + i * ngridz * ngridy_long + j);
348 }
349
350 /* now our data resides in data_complex[] */
351
352 /* do the transpose */
353 my_slab_transpose(data_complex, workspace_complex, plan->slabs_x_per_task, plan->first_slab_x_of_task, plan->slabs_y_per_task,
354 plan->first_slab_y_of_task, ngridx, ngridy, ngridz, 0);
355
356 /* now the data is in workspace_complex[] */
357
358 /* finally, do the transform along the x-direction (we are in transposed order, x and y have interchanged */
359 prod = slabsy * ngridz;
360 for(n = 0; n < prod; n++)
361 {
362 int i = n / ngridz;
363 int j = n % ngridz;
364
365 FFTW(execute_dft)
366 (plan->forward_plan_xdir, workspace_complex + i * ngridz * ngridx_long + j, data_complex + i * ngridz * ngridx_long + j);
367 }
368
369 /* now the result is in data_complex[] */
370 }
371 else
372 {
373 prod = slabsy * ngridz;
374
375 for(n = 0; n < prod; n++)
376 {
377 int i = n / ngridz;
378 int j = n % ngridz;
379
380 FFTW(execute_dft)
381 (plan->backward_plan_xdir, data_complex + i * ngridz * ngridx_long + j, workspace_complex + i * ngridz * ngridx_long + j);
382 }
383
384 my_slab_transpose(workspace_complex, data_complex, plan->slabs_x_per_task, plan->first_slab_x_of_task, plan->slabs_y_per_task,
385 plan->first_slab_y_of_task, ngridx, ngridy, ngridz, 1);
386
387 prod = slabsx * ngridz;
388
389 for(n = 0; n < prod; n++)
390 {
391 int i = n / ngridz;
392 int j = n % ngridz;
393
394 FFTW(execute_dft)
395 (plan->backward_plan_ydir, data_complex + i * ngridz * ngridy_long + j, workspace_complex + i * ngridz * ngridy_long + j);
396 }
397
398 prod = slabsx * ngridy;
399
400 for(n = 0; n < prod; n++)
401 {
402 FFTW(execute_dft_c2r)(plan->backward_plan_zdir, workspace_complex + n * ngridz_long, data_real + n * ngridz2_long);
403 }
404
405 /* now the result is in data[] */
406 }
407}
408
409#else
410
411void pm_mpi_fft::my_column_based_fft_init(fft_plan *plan, int NgridX, int NgridY, int NgridZ)
412{
413 plan->NgridX = NgridX;
414 plan->NgridY = NgridY;
415 plan->NgridZ = NgridZ;
416
417 int Ngridz = NgridZ / 2 + 1;
418
419 plan->Ngridz = Ngridz;
420 plan->Ngrid2 = 2 * Ngridz;
421
422 subdivide_evenly(plan->NgridX * plan->NgridY, NTask, ThisTask, &plan->firstcol_XY, &plan->ncol_XY);
423 subdivide_evenly(plan->NgridX * plan->Ngrid2, NTask, ThisTask, &plan->firstcol_XZ, &plan->ncol_XZ);
424 subdivide_evenly(plan->Ngrid2 * plan->NgridY, NTask, ThisTask, &plan->firstcol_ZY, &plan->ncol_ZY);
425
426 plan->lastcol_XY = plan->firstcol_XY + plan->ncol_XY - 1;
427 plan->lastcol_XZ = plan->firstcol_XZ + plan->ncol_XZ - 1;
428 plan->lastcol_ZY = plan->firstcol_ZY + plan->ncol_ZY - 1;
429
430 subdivide_evenly(NgridX * Ngridz, NTask, ThisTask, &plan->transposed_firstcol, &plan->transposed_ncol);
431 subdivide_evenly(NgridY * Ngridz, NTask, ThisTask, &plan->second_transposed_firstcol, &plan->second_transposed_ncol);
432
433 plan->second_transposed_ncells = ((size_t)plan->NgridX) * plan->second_transposed_ncol;
434
435 plan->max_datasize = ((size_t)plan->Ngrid2) * plan->ncol_XY;
436 plan->max_datasize = std::max<size_t>(plan->max_datasize, 2 * ((size_t)plan->NgridY) * plan->transposed_ncol);
437 plan->max_datasize = std::max<size_t>(plan->max_datasize, 2 * ((size_t)plan->NgridX) * plan->second_transposed_ncol);
438 plan->max_datasize = std::max<size_t>(plan->max_datasize, ((size_t)plan->ncol_XZ) * plan->NgridY);
439 plan->max_datasize = std::max<size_t>(plan->max_datasize, ((size_t)plan->ncol_ZY) * plan->NgridX);
440
441 plan->fftsize = plan->max_datasize;
442
443 plan->offsets_send_A = (size_t *)Mem.mymalloc_movable_clear(&plan->offsets_send_A, "offsets_send_A", NTask * sizeof(size_t));
444 plan->offsets_recv_A = (size_t *)Mem.mymalloc_movable_clear(&plan->offsets_recv_A, "offsets_recv_A", NTask * sizeof(size_t));
445 plan->offsets_send_B = (size_t *)Mem.mymalloc_movable_clear(&plan->offsets_send_B, "offsets_send_B", NTask * sizeof(size_t));
446 plan->offsets_recv_B = (size_t *)Mem.mymalloc_movable_clear(&plan->offsets_recv_B, "offsets_recv_B", NTask * sizeof(size_t));
447 plan->offsets_send_C = (size_t *)Mem.mymalloc_movable_clear(&plan->offsets_send_C, "offsets_send_C", NTask * sizeof(size_t));
448 plan->offsets_recv_C = (size_t *)Mem.mymalloc_movable_clear(&plan->offsets_recv_C, "offsets_recv_C", NTask * sizeof(size_t));
449 plan->offsets_send_D = (size_t *)Mem.mymalloc_movable_clear(&plan->offsets_send_D, "offsets_send_D", NTask * sizeof(size_t));
450 plan->offsets_recv_D = (size_t *)Mem.mymalloc_movable_clear(&plan->offsets_recv_D, "offsets_recv_D", NTask * sizeof(size_t));
451
452 plan->count_send_A = (size_t *)Mem.mymalloc_movable_clear(&plan->count_send_A, "count_send_A", NTask * sizeof(size_t));
453 plan->count_recv_A = (size_t *)Mem.mymalloc_movable_clear(&plan->count_recv_A, "count_recv_A", NTask * sizeof(size_t));
454 plan->count_send_B = (size_t *)Mem.mymalloc_movable_clear(&plan->count_send_B, "count_send_B", NTask * sizeof(size_t));
455 plan->count_recv_B = (size_t *)Mem.mymalloc_movable_clear(&plan->count_recv_B, "count_recv_B", NTask * sizeof(size_t));
456 plan->count_send_C = (size_t *)Mem.mymalloc_movable_clear(&plan->count_send_C, "count_send_C", NTask * sizeof(size_t));
457 plan->count_recv_C = (size_t *)Mem.mymalloc_movable_clear(&plan->count_recv_C, "count_recv_C", NTask * sizeof(size_t));
458 plan->count_send_D = (size_t *)Mem.mymalloc_movable_clear(&plan->count_send_D, "count_send_D", NTask * sizeof(size_t));
459 plan->count_recv_D = (size_t *)Mem.mymalloc_movable_clear(&plan->count_recv_D, "count_recv_D", NTask * sizeof(size_t));
460 plan->count_send_13 = (size_t *)Mem.mymalloc_movable_clear(&plan->count_send_13, "count_send_13", NTask * sizeof(size_t));
461 plan->count_recv_13 = (size_t *)Mem.mymalloc_movable_clear(&plan->count_recv_13, "count_recv_13", NTask * sizeof(size_t));
462 plan->count_send_23 = (size_t *)Mem.mymalloc_movable_clear(&plan->count_send_23, "count_send_23", NTask * sizeof(size_t));
463 plan->count_recv_23 = (size_t *)Mem.mymalloc_movable_clear(&plan->count_recv_23, "count_recv_23", NTask * sizeof(size_t));
464 plan->count_send_13back =
465 (size_t *)Mem.mymalloc_movable_clear(&plan->count_send_13back, "count_send_13back", NTask * sizeof(size_t));
466 plan->count_recv_13back =
467 (size_t *)Mem.mymalloc_movable_clear(&plan->count_recv_13back, "count_recv_13back", NTask * sizeof(size_t));
468 plan->count_send_23back =
469 (size_t *)Mem.mymalloc_movable_clear(&plan->count_send_23back, "count_send_23back", NTask * sizeof(size_t));
470 plan->count_recv_23back =
471 (size_t *)Mem.mymalloc_movable_clear(&plan->count_recv_23back, "count_recv_23back", NTask * sizeof(size_t));
472
473 int dimA[3] = {plan->NgridX, plan->NgridY, plan->Ngridz};
474 int permA[3] = {0, 2, 1};
475
476 my_fft_column_remap(NULL, dimA, plan->firstcol_XY, plan->ncol_XY, NULL, permA, plan->transposed_firstcol, plan->transposed_ncol,
477 plan->offsets_send_A, plan->offsets_recv_A, plan->count_send_A, plan->count_recv_A, 1);
478
479 int dimB[3] = {plan->NgridX, plan->Ngridz, plan->NgridY};
480 int permB[3] = {2, 1, 0};
481
482 my_fft_column_remap(NULL, dimB, plan->transposed_firstcol, plan->transposed_ncol, NULL, permB, plan->second_transposed_firstcol,
483 plan->second_transposed_ncol, plan->offsets_send_B, plan->offsets_recv_B, plan->count_send_B, plan->count_recv_B,
484 1);
485
486 int dimC[3] = {plan->NgridY, plan->Ngridz, plan->NgridX};
487 int permC[3] = {2, 1, 0};
488
489 my_fft_column_remap(NULL, dimC, plan->second_transposed_firstcol, plan->second_transposed_ncol, NULL, permC,
490 plan->transposed_firstcol, plan->transposed_ncol, plan->offsets_send_C, plan->offsets_recv_C, plan->count_send_C,
491 plan->count_recv_C, 1);
492
493 int dimD[3] = {plan->NgridX, plan->Ngridz, plan->NgridY};
494 int permD[3] = {0, 2, 1};
495
496 my_fft_column_remap(NULL, dimD, plan->transposed_firstcol, plan->transposed_ncol, NULL, permD, plan->firstcol_XY, plan->ncol_XY,
497 plan->offsets_send_D, plan->offsets_recv_D, plan->count_send_D, plan->count_recv_D, 1);
498
499 int dim23[3] = {plan->NgridX, plan->NgridY, plan->Ngrid2};
500 int perm23[3] = {0, 2, 1};
501
502 my_fft_column_transpose(NULL, dim23, plan->firstcol_XY, plan->ncol_XY, NULL, perm23, plan->firstcol_XZ, plan->ncol_XZ,
503 plan->count_send_23, plan->count_recv_23, 1);
504
505 int dim23back[3] = {plan->NgridX, plan->Ngrid2, plan->NgridY};
506 int perm23back[3] = {0, 2, 1};
507
508 my_fft_column_transpose(NULL, dim23back, plan->firstcol_XZ, plan->ncol_XZ, NULL, perm23back, plan->firstcol_XY, plan->ncol_XY,
509 plan->count_send_23back, plan->count_recv_23back, 1);
510
511 int dim13[3] = {plan->NgridX, plan->NgridY, plan->Ngrid2};
512 int perm13[3] = {2, 1, 0};
513
514 my_fft_column_transpose(NULL, dim13, plan->firstcol_XY, plan->ncol_XY, NULL, perm13, plan->firstcol_ZY, plan->ncol_ZY,
515 plan->count_send_13, plan->count_recv_13, 1);
516
517 int dim13back[3] = {plan->Ngrid2, plan->NgridY, plan->NgridX};
518 int perm13back[3] = {2, 1, 0};
519
520 my_fft_column_transpose(NULL, dim13back, plan->firstcol_ZY, plan->ncol_ZY, NULL, perm13back, plan->firstcol_XY, plan->ncol_XY,
521 plan->count_send_13back, plan->count_recv_13back, 1);
522}
523
524void pm_mpi_fft::my_column_based_fft_free(fft_plan *plan)
525{
526 Mem.myfree(plan->count_recv_23back);
527 Mem.myfree(plan->count_send_23back);
528 Mem.myfree(plan->count_recv_13back);
529 Mem.myfree(plan->count_send_13back);
530 Mem.myfree(plan->count_recv_23);
531 Mem.myfree(plan->count_send_23);
532 Mem.myfree(plan->count_recv_13);
533 Mem.myfree(plan->count_send_13);
534 Mem.myfree(plan->count_recv_D);
535 Mem.myfree(plan->count_send_D);
536 Mem.myfree(plan->count_recv_C);
537 Mem.myfree(plan->count_send_C);
538 Mem.myfree(plan->count_recv_B);
539 Mem.myfree(plan->count_send_B);
540 Mem.myfree(plan->count_recv_A);
541 Mem.myfree(plan->count_send_A);
542
543 Mem.myfree(plan->offsets_recv_D);
544 Mem.myfree(plan->offsets_send_D);
545 Mem.myfree(plan->offsets_recv_C);
546 Mem.myfree(plan->offsets_send_C);
547 Mem.myfree(plan->offsets_recv_B);
548 Mem.myfree(plan->offsets_send_B);
549 Mem.myfree(plan->offsets_recv_A);
550 Mem.myfree(plan->offsets_send_A);
551}
552
553void pm_mpi_fft::my_fft_swap23(fft_plan *plan, fft_real *data, fft_real *out)
554{
555 int dim23[3] = {plan->NgridX, plan->NgridY, plan->Ngrid2};
556 int perm23[3] = {0, 2, 1};
557
558 my_fft_column_transpose(data, dim23, plan->firstcol_XY, plan->ncol_XY, out, perm23, plan->firstcol_XZ, plan->ncol_XZ,
559 plan->count_send_23, plan->count_recv_23, 0);
560}
561
562void pm_mpi_fft::my_fft_swap23back(fft_plan *plan, fft_real *data, fft_real *out)
563{
564 int dim23back[3] = {plan->NgridX, plan->Ngrid2, plan->NgridY};
565 int perm23back[3] = {0, 2, 1};
566
567 my_fft_column_transpose(data, dim23back, plan->firstcol_XZ, plan->ncol_XZ, out, perm23back, plan->firstcol_XY, plan->ncol_XY,
568 plan->count_send_23back, plan->count_recv_23back, 0);
569}
570
571void pm_mpi_fft::my_fft_swap13(fft_plan *plan, fft_real *data, fft_real *out)
572{
573 int dim13[3] = {plan->NgridX, plan->NgridY, plan->Ngrid2};
574 int perm13[3] = {2, 1, 0};
575
576 my_fft_column_transpose(data, dim13, plan->firstcol_XY, plan->ncol_XY, out, perm13, plan->firstcol_ZY, plan->ncol_ZY,
577 plan->count_send_13, plan->count_recv_13, 0);
578}
579
580void pm_mpi_fft::my_fft_swap13back(fft_plan *plan, fft_real *data, fft_real *out)
581{
582 int dim13back[3] = {plan->Ngrid2, plan->NgridY, plan->NgridX};
583 int perm13back[3] = {2, 1, 0};
584
585 my_fft_column_transpose(data, dim13back, plan->firstcol_ZY, plan->ncol_ZY, out, perm13back, plan->firstcol_XY, plan->ncol_XY,
586 plan->count_send_13back, plan->count_recv_13back, 0);
587}
588
589void pm_mpi_fft::my_column_based_fft(fft_plan *plan, void *data, void *workspace, int forward)
590{
591 size_t n;
592 fft_real *data_real = (fft_real *)data, *workspace_real = (fft_real *)workspace;
593 fft_complex *data_complex = (fft_complex *)data, *workspace_complex = (fft_complex *)workspace;
594
595 if(forward == 1)
596 {
597 /* do the z-direction FFT, real to complex */
598 for(n = 0; n < plan->ncol_XY; n++)
599 FFTW(execute_dft_r2c)(plan->forward_plan_zdir, data_real + n * plan->Ngrid2, workspace_complex + n * plan->Ngridz);
600
601 int dimA[3] = {plan->NgridX, plan->NgridY, plan->Ngridz};
602 int permA[3] = {0, 2, 1};
603
604 my_fft_column_remap(workspace_complex, dimA, plan->firstcol_XY, plan->ncol_XY, data_complex, permA, plan->transposed_firstcol,
605 plan->transposed_ncol, plan->offsets_send_A, plan->offsets_recv_A, plan->count_send_A, plan->count_recv_A,
606 0);
607
608 /* do the y-direction FFT in 'data', complex to complex */
609 for(n = 0; n < plan->transposed_ncol; n++)
610 FFTW(execute_dft)(plan->forward_plan_ydir, data_complex + n * plan->NgridY, workspace_complex + n * plan->NgridY);
611
612 int dimB[3] = {plan->NgridX, plan->Ngridz, plan->NgridY};
613 int permB[3] = {2, 1, 0};
614
615 my_fft_column_remap(workspace_complex, dimB, plan->transposed_firstcol, plan->transposed_ncol, data_complex, permB,
616 plan->second_transposed_firstcol, plan->second_transposed_ncol, plan->offsets_send_B, plan->offsets_recv_B,
617 plan->count_send_B, plan->count_recv_B, 0);
618
619 /* do the x-direction FFT in 'data', complex to complex */
620 for(n = 0; n < plan->second_transposed_ncol; n++)
621 FFTW(execute_dft)(plan->forward_plan_xdir, data_complex + n * plan->NgridX, workspace_complex + n * plan->NgridX);
622
623 /* result is now in workspace */
624 }
625 else
626 {
627 /* do inverse FFT in 'data' */
628 for(n = 0; n < plan->second_transposed_ncol; n++)
629 FFTW(execute_dft)(plan->backward_plan_xdir, data_complex + n * plan->NgridX, workspace_complex + n * plan->NgridX);
630
631 int dimC[3] = {plan->NgridY, plan->Ngridz, plan->NgridX};
632 int permC[3] = {2, 1, 0};
633
634 my_fft_column_remap(workspace_complex, dimC, plan->second_transposed_firstcol, plan->second_transposed_ncol, data_complex, permC,
635 plan->transposed_firstcol, plan->transposed_ncol, plan->offsets_send_C, plan->offsets_recv_C,
636 plan->count_send_C, plan->count_recv_C, 0);
637
638 /* do inverse FFT in 'data' */
639 for(n = 0; n < plan->transposed_ncol; n++)
640 FFTW(execute_dft)(plan->backward_plan_ydir, data_complex + n * plan->NgridY, workspace_complex + n * plan->NgridY);
641
642 int dimD[3] = {plan->NgridX, plan->Ngridz, plan->NgridY};
643 int permD[3] = {0, 2, 1};
644
645 my_fft_column_remap(workspace_complex, dimD, plan->transposed_firstcol, plan->transposed_ncol, data_complex, permD,
646 plan->firstcol_XY, plan->ncol_XY, plan->offsets_send_D, plan->offsets_recv_D, plan->count_send_D,
647 plan->count_recv_D, 0);
648
649 /* do complex-to-real inverse transform on z-coordinates */
650 for(n = 0; n < plan->ncol_XY; n++)
651 FFTW(execute_dft_c2r)(plan->backward_plan_zdir, data_complex + n * plan->Ngridz, workspace_real + n * plan->Ngrid2);
652 }
653}
654
655void pm_mpi_fft::my_fft_column_remap(fft_complex *data, int Ndims[3], /* global dimensions of data cube */
656 int in_firstcol, int in_ncol, /* first column and number of columns */
657 fft_complex *out, int perm[3], int out_firstcol, int out_ncol, size_t *offset_send,
658 size_t *offset_recv, size_t *count_send, size_t *count_recv, size_t just_count_flag)
659{
660 int j, target, origin, ngrp, recvTask, perm_rev[3], xyz[3], uvw[3];
661 size_t nimport, nexport;
662
663 /* determine the inverse permutation */
664 for(j = 0; j < 3; j++)
665 perm_rev[j] = perm[j];
666
667 if(!(perm_rev[perm[0]] == 0 && perm_rev[perm[1]] == 1 && perm_rev[perm[2]] == 2)) /* not yet the inverse */
668 {
669 for(j = 0; j < 3; j++)
670 perm_rev[j] = perm[perm[j]];
671
672 if(!(perm_rev[perm[0]] == 0 && perm_rev[perm[1]] == 1 && perm_rev[perm[2]] == 2))
673 Terminate("bummer");
674 }
675
676 int in_colums = Ndims[0] * Ndims[1];
677 int in_avg = (in_colums - 1) / NTask + 1;
678 int in_exc = NTask * in_avg - in_colums;
679 int in_tasklastsection = NTask - in_exc;
680 int in_pivotcol = in_tasklastsection * in_avg;
681
682 int out_colums = Ndims[perm[0]] * Ndims[perm[1]];
683 int out_avg = (out_colums - 1) / NTask + 1;
684 int out_exc = NTask * out_avg - out_colums;
685 int out_tasklastsection = NTask - out_exc;
686 int out_pivotcol = out_tasklastsection * out_avg;
687
688 size_t i, ncells = ((size_t)in_ncol) * Ndims[2];
689
690 xyz[0] = in_firstcol / Ndims[1];
691 xyz[1] = in_firstcol % Ndims[1];
692 xyz[2] = 0;
693
694 memset(count_send, 0, NTask * sizeof(size_t));
695
696 /* loop over all cells in input array and determine target processor */
697 for(i = 0; i < ncells; i++)
698 {
699 /* determine target task */
700 uvw[0] = xyz[perm[0]];
701 uvw[1] = xyz[perm[1]];
702 uvw[2] = xyz[perm[2]];
703
704 int newcol = Ndims[perm[1]] * uvw[0] + uvw[1];
705 if(newcol < out_pivotcol)
706 target = newcol / out_avg;
707 else
708 target = (newcol - out_pivotcol) / (out_avg - 1) + out_tasklastsection;
709
710 /* move data element to targettask */
711
712 if(just_count_flag)
713 count_send[target]++;
714 else
715 {
716 size_t off = offset_send[target] + count_send[target]++;
717 out[off][0] = data[i][0];
718 out[off][1] = data[i][1];
719 }
720 xyz[2]++;
721 if(xyz[2] == Ndims[2])
722 {
723 xyz[2] = 0;
724 xyz[1]++;
725 if(xyz[1] == Ndims[1])
726 {
727 xyz[1] = 0;
728 xyz[0]++;
729 }
730 }
731 }
732
733 if(just_count_flag)
734 {
735 myMPI_Alltoall(count_send, sizeof(size_t), MPI_BYTE, count_recv, sizeof(size_t), MPI_BYTE, Communicator);
736
737 for(j = 0, nimport = 0, nexport = 0, offset_send[0] = 0, offset_recv[0] = 0; j < NTask; j++)
738 {
739 nexport += count_send[j];
740 nimport += count_recv[j];
741
742 if(j > 0)
743 {
744 offset_send[j] = offset_send[j - 1] + count_send[j - 1];
745 offset_recv[j] = offset_recv[j - 1] + count_recv[j - 1];
746 }
747 }
748
749 if(nexport != ncells)
750 Terminate("nexport=%lld != ncells=%lld", (long long)nexport, (long long)ncells);
751 }
752 else
753 {
754 nimport = 0;
755
756 /* exchange all the data */
757 for(ngrp = 0; ngrp < (1 << PTask); ngrp++)
758 {
759 recvTask = ThisTask ^ ngrp;
760
761 if(recvTask < NTask)
762 {
763 if(count_send[recvTask] > 0 || count_recv[recvTask] > 0)
764 myMPI_Sendrecv(&out[offset_send[recvTask]], count_send[recvTask] * sizeof(fft_complex), MPI_BYTE, recvTask, TAG_DENS_A,
765 &data[offset_recv[recvTask]], count_recv[recvTask] * sizeof(fft_complex), MPI_BYTE, recvTask,
766 TAG_DENS_A, Communicator, MPI_STATUS_IGNORE);
767
768 nimport += count_recv[recvTask];
769 }
770 }
771
772 /* now loop over the new cell layout */
773 /* find enclosing rectangle around columns in new plane */
774
775 int first[3], last[3];
776
777 first[0] = out_firstcol / Ndims[perm[1]];
778 first[1] = out_firstcol % Ndims[perm[1]];
779 first[2] = 0;
780
781 last[0] = (out_firstcol + out_ncol - 1) / Ndims[perm[1]];
782 last[1] = (out_firstcol + out_ncol - 1) % Ndims[perm[1]];
783 last[2] = Ndims[perm[2]] - 1;
784
785 if(first[1] + out_ncol >= Ndims[perm[1]])
786 {
787 first[1] = 0;
788 last[1] = Ndims[perm[1]] - 1;
789 }
790
791 /* now need to map this back to the old coordinates */
792
793 int xyz_first[3], xyz_last[3];
794
795 for(j = 0; j < 3; j++)
796 {
797 xyz_first[j] = first[perm_rev[j]];
798 xyz_last[j] = last[perm_rev[j]];
799 }
800
801 memset(count_recv, 0, NTask * sizeof(size_t));
802
803 size_t count = 0;
804
805 /* traverse an enclosing box around the new cell layout in the old order */
806 for(xyz[0] = xyz_first[0]; xyz[0] <= xyz_last[0]; xyz[0]++)
807 for(xyz[1] = xyz_first[1]; xyz[1] <= xyz_last[1]; xyz[1]++)
808 for(xyz[2] = xyz_first[2]; xyz[2] <= xyz_last[2]; xyz[2]++)
809 {
810 /* check that the point is actually part of a column */
811 uvw[0] = xyz[perm[0]];
812 uvw[1] = xyz[perm[1]];
813 uvw[2] = xyz[perm[2]];
814
815 int col = uvw[0] * Ndims[perm[1]] + uvw[1];
816
817 if(col >= out_firstcol && col < out_firstcol + out_ncol)
818 {
819 /* determine origin task */
820 int newcol = Ndims[1] * xyz[0] + xyz[1];
821 if(newcol < in_pivotcol)
822 origin = newcol / in_avg;
823 else
824 origin = (newcol - in_pivotcol) / (in_avg - 1) + in_tasklastsection;
825
826 size_t index = ((size_t)Ndims[perm[2]]) * (col - out_firstcol) + uvw[2];
827
828 /* move data element from origin task */
829 size_t off = offset_recv[origin] + count_recv[origin]++;
830 out[index][0] = data[off][0];
831 out[index][1] = data[off][1];
832
833 count++;
834 }
835 }
836
837 if(count != nimport)
838 {
839 int fi = out_firstcol % Ndims[perm[1]];
840 int la = (out_firstcol + out_ncol - 1) % Ndims[perm[1]];
841
842 Terminate("count=%lld nimport=%lld ncol=%d fi=%d la=%d first=%d last=%d\n", (long long)count, (long long)nimport, out_ncol,
843 fi, la, first[1], last[1]);
844 }
845 }
846}
847
848void pm_mpi_fft::my_fft_column_transpose(fft_real *data, int Ndims[3], /* global dimensions of data cube */
849 int in_firstcol, int in_ncol, /* first column and number of columns */
850 fft_real *out, int perm[3], int out_firstcol, int out_ncol, size_t *count_send,
851 size_t *count_recv, size_t just_count_flag)
852{
853 /* determine the inverse permutation */
854 int perm_rev[3];
855 for(int j = 0; j < 3; j++)
856 perm_rev[j] = perm[j];
857
858 if(!(perm_rev[perm[0]] == 0 && perm_rev[perm[1]] == 1 && perm_rev[perm[2]] == 2)) /* not yet the inverse */
859 {
860 for(int j = 0; j < 3; j++)
861 perm_rev[j] = perm[perm[j]];
862
863 if(!(perm_rev[perm[0]] == 0 && perm_rev[perm[1]] == 1 && perm_rev[perm[2]] == 2))
864 Terminate("bummer");
865 }
866
867 int in_colums = Ndims[0] * Ndims[1];
868 int in_avg = (in_colums - 1) / NTask + 1;
869 int in_exc = NTask * in_avg - in_colums;
870 int in_tasklastsection = NTask - in_exc;
871 int in_pivotcol = in_tasklastsection * in_avg;
872
873 int out_colums = Ndims[perm[0]] * Ndims[perm[1]];
874 int out_avg = (out_colums - 1) / NTask + 1;
875 int out_exc = NTask * out_avg - out_colums;
876 int out_tasklastsection = NTask - out_exc;
877 int out_pivotcol = out_tasklastsection * out_avg;
878
879 if(just_count_flag)
880 memset(count_send, 0, NTask * sizeof(size_t));
881
882 /* exchange all the data */
883 for(int ngrp = 0; ngrp < (1 << PTask); ngrp++)
884 {
885 int target = ThisTask ^ ngrp;
886
887 if(target < NTask)
888 {
889 // check whether we have anything to do
890 if(count_send[target] == 0 && count_recv[target] == 0 && just_count_flag == 0)
891 continue;
892
893 /* determine enclosing rectangle of current region */
894 int source_first[3];
895 source_first[0] = in_firstcol / Ndims[1];
896 source_first[1] = in_firstcol % Ndims[1];
897 source_first[2] = 0;
898
899 int source_last[3];
900 source_last[0] = (in_firstcol + in_ncol - 1) / Ndims[1];
901 source_last[1] = (in_firstcol + in_ncol - 1) % Ndims[1];
902 source_last[2] = Ndims[2] - 1;
903
904 if(source_first[1] + in_ncol >= Ndims[1])
905 {
906 source_first[1] = 0;
907 source_last[1] = Ndims[1] - 1;
908 }
909
910 /* determine target columns */
911
912 int target_first_col = 0;
913 int long target_num_col = 0;
914
915 if(target < out_tasklastsection)
916 {
917 target_first_col = target * out_avg;
918 target_num_col = out_avg;
919 }
920 else
921 {
922 target_first_col = (target - out_tasklastsection) * (out_avg - 1) + out_pivotcol;
923 target_num_col = (out_avg - 1);
924 }
925
926 /* find enclosing rectangle around columns in new plane */
927 int first[3], last[3];
928
929 first[0] = target_first_col / Ndims[perm[1]];
930 first[1] = target_first_col % Ndims[perm[1]];
931 first[2] = 0;
932
933 last[0] = (target_first_col + target_num_col - 1) / Ndims[perm[1]];
934 last[1] = (target_first_col + target_num_col - 1) % Ndims[perm[1]];
935 last[2] = Ndims[perm[2]] - 1;
936
937 if(first[1] + target_num_col >= Ndims[perm[1]])
938 {
939 first[1] = 0;
940 last[1] = Ndims[perm[1]] - 1;
941 }
942
943 /* now we map this back to the old coordinates */
944 int xyz_first[3], xyz_last[3];
945
946 for(int j = 0; j < 3; j++)
947 {
948 xyz_first[j] = first[perm_rev[j]];
949 xyz_last[j] = last[perm_rev[j]];
950 }
951
952 /* determine common box */
953 int xyz_start[3], xyz_end[3];
954 for(int j = 0; j < 3; j++)
955 {
956 xyz_start[j] = std::max<int>(xyz_first[j], source_first[j]);
957 xyz_end[j] = std::min<int>(xyz_last[j], source_last[j]);
958 }
959
960 int xyz[3];
961 for(int j = 0; j < 3; j++)
962 xyz[j] = xyz_start[j];
963
964 /* now do the same determination for the flipped situation on the target side */
965
966 int flip_in_firstcol = 0;
967 int flip_in_ncol = 0;
968
969 if(target < in_tasklastsection)
970 {
971 flip_in_firstcol = target * in_avg;
972 flip_in_ncol = in_avg;
973 }
974 else
975 {
976 flip_in_firstcol = (target - in_tasklastsection) * (in_avg - 1) + in_pivotcol;
977 flip_in_ncol = (in_avg - 1);
978 }
979
980 /* determine enclosing rectangle of current region */
981 int flip_source_first[3];
982 flip_source_first[0] = flip_in_firstcol / Ndims[1];
983 flip_source_first[1] = flip_in_firstcol % Ndims[1];
984 flip_source_first[2] = 0;
985
986 int flip_source_last[3];
987 flip_source_last[0] = (flip_in_firstcol + flip_in_ncol - 1) / Ndims[1];
988 flip_source_last[1] = (flip_in_firstcol + flip_in_ncol - 1) % Ndims[1];
989 flip_source_last[2] = Ndims[2] - 1;
990
991 if(flip_source_first[1] + flip_in_ncol >= Ndims[1])
992 {
993 flip_source_first[1] = 0;
994 flip_source_last[1] = Ndims[1] - 1;
995 }
996
997 /* determine target columns */
998
999 int flip_first_col = 0;
1000 int flip_num_col = 0;
1001
1002 if(ThisTask < out_tasklastsection)
1003 {
1004 flip_first_col = ThisTask * out_avg;
1005 flip_num_col = out_avg;
1006 }
1007 else
1008 {
1009 flip_first_col = (ThisTask - out_tasklastsection) * (out_avg - 1) + out_pivotcol;
1010 flip_num_col = (out_avg - 1);
1011 }
1012
1013 /* find enclosing rectangle around columns in new plane */
1014 int flip_first[3], flip_last[3];
1015
1016 flip_first[0] = flip_first_col / Ndims[perm[1]];
1017 flip_first[1] = flip_first_col % Ndims[perm[1]];
1018 flip_first[2] = 0;
1019
1020 flip_last[0] = (flip_first_col + flip_num_col - 1) / Ndims[perm[1]];
1021 flip_last[1] = (flip_first_col + flip_num_col - 1) % Ndims[perm[1]];
1022 flip_last[2] = Ndims[perm[2]] - 1;
1023
1024 if(flip_first[1] + flip_num_col >= Ndims[perm[1]])
1025 {
1026 flip_first[1] = 0;
1027 flip_last[1] = Ndims[perm[1]] - 1;
1028 }
1029
1030 /* now we map this back to the old coordinates */
1031 int abc_first[3], abc_last[3];
1032
1033 for(int j = 0; j < 3; j++)
1034 {
1035 abc_first[j] = flip_first[perm_rev[j]];
1036 abc_last[j] = flip_last[perm_rev[j]];
1037 }
1038
1039 /* determine common box */
1040 int abc_start[3], abc_end[3];
1041 for(int j = 0; j < 3; j++)
1042 {
1043 abc_start[j] = std::max<int>(abc_first[j], flip_source_first[j]);
1044 abc_end[j] = std::min<int>(abc_last[j], flip_source_last[j]);
1045 }
1046
1047 int abc[3];
1048
1049 for(int j = 0; j < 3; j++)
1050 abc[j] = abc_start[j];
1051
1052 size_t tot_count_send = 0;
1053 size_t tot_count_recv = 0;
1054
1055 /* now check how much free memory there is on the two partners, use at most half of it */
1056 size_t parnter_freebytes;
1057 myMPI_Sendrecv(&Mem.FreeBytes, sizeof(size_t), MPI_BYTE, target, TAG_DENS_B, &parnter_freebytes, sizeof(size_t), MPI_BYTE,
1058 target, TAG_DENS_B, Communicator, MPI_STATUS_IGNORE);
1059
1060 size_t freeb = std::min<size_t>(parnter_freebytes, Mem.FreeBytes);
1061
1062 size_t limit = 0.5 * freeb / (sizeof(fft_real) + sizeof(fft_real));
1063
1064 if(just_count_flag)
1065 limit = SIZE_MAX;
1066
1067 int iter = 0;
1068 do
1069 {
1070 size_t limit_send = count_send[target] - tot_count_send;
1071 size_t limit_recv = count_recv[target] - tot_count_recv;
1072
1073 if(just_count_flag)
1074 {
1075 limit_send = SIZE_MAX;
1076 limit_recv = SIZE_MAX;
1077 }
1078 else
1079 {
1080 if(limit_send > limit)
1081 limit_send = limit;
1082
1083 if(limit_recv > limit)
1084 limit_recv = limit;
1085 }
1086
1087 fft_real *buffer_send = NULL;
1088 fft_real *buffer_recv = NULL;
1089
1090 if(just_count_flag == 0)
1091 {
1092 buffer_send = (fft_real *)Mem.mymalloc("buffer_send", limit_send * sizeof(fft_real));
1093 buffer_recv = (fft_real *)Mem.mymalloc("buffer_recv", limit_recv * sizeof(fft_real));
1094 }
1095
1096 /* traverse the common box between the new and old layout */
1097 size_t count = 0;
1098
1099 while(count < limit_send && xyz[0] <= xyz_end[0] && xyz[1] <= xyz_end[1] && xyz[2] <= xyz_end[2])
1100 {
1101 /* check that the point is actually part of a column in the old layout */
1102 int col_old = xyz[0] * Ndims[1] + xyz[1];
1103
1104 if(col_old >= in_firstcol && col_old < in_firstcol + in_ncol)
1105 {
1106 /* check that the point is actually part of a column in the new layout */
1107 int uvw[3];
1108 uvw[0] = xyz[perm[0]];
1109 uvw[1] = xyz[perm[1]];
1110 uvw[2] = xyz[perm[2]];
1111
1112 int col_new = uvw[0] * Ndims[perm[1]] + uvw[1];
1113
1114 if(col_new >= target_first_col && col_new < target_first_col + target_num_col)
1115 {
1116 // ok, we found a match
1117
1118 if(just_count_flag)
1119 count_send[target]++;
1120 else
1121 {
1122 long long source_cell = (Ndims[1] * xyz[0] + xyz[1] - in_firstcol) * Ndims[2] + xyz[2];
1123
1124 buffer_send[count++] = data[source_cell];
1125 tot_count_send++;
1126 }
1127 }
1128 }
1129
1130 xyz[2]++;
1131 if(xyz[2] > xyz_end[2])
1132 {
1133 xyz[2] = xyz_start[2];
1134 xyz[1]++;
1135 if(xyz[1] > xyz_end[1])
1136 {
1137 xyz[1] = xyz_start[1];
1138 xyz[0]++;
1139 }
1140 }
1141 }
1142
1143 if(just_count_flag == 0)
1144 {
1145 myMPI_Sendrecv(buffer_send, limit_send * sizeof(fft_real), MPI_BYTE, target, TAG_DENS_A, buffer_recv,
1146 limit_recv * sizeof(fft_real), MPI_BYTE, target, TAG_DENS_A, Communicator, MPI_STATUS_IGNORE);
1147
1148 size_t count = 0;
1149 while(count < limit_recv && abc[0] <= abc_end[0] && abc[1] <= abc_end[1] && abc[2] <= abc_end[2])
1150 {
1151 /* check that the point is actually part of a column in the old layout */
1152 int col_old = abc[0] * Ndims[1] + abc[1];
1153
1154 if(col_old >= flip_in_firstcol && col_old < flip_in_firstcol + flip_in_ncol)
1155 {
1156 /* check that the point is actually part of a column in the new layout */
1157 int uvw[3];
1158 uvw[0] = abc[perm[0]];
1159 uvw[1] = abc[perm[1]];
1160 uvw[2] = abc[perm[2]];
1161
1162 int col_new = uvw[0] * Ndims[perm[1]] + uvw[1];
1163
1164 if(col_new >= flip_first_col && col_new < flip_first_col + flip_num_col)
1165 {
1166 // ok, we found a match
1167
1168 long long target_cell = (Ndims[perm[1]] * uvw[0] + uvw[1] - flip_first_col) * Ndims[perm[2]] + uvw[2];
1169
1170 out[target_cell] = buffer_recv[count++];
1171 tot_count_recv++;
1172 }
1173 }
1174
1175 abc[2]++;
1176 if(abc[2] > abc_end[2])
1177 {
1178 abc[2] = abc_start[2];
1179 abc[1]++;
1180 if(abc[1] > abc_end[1])
1181 {
1182 abc[1] = abc_start[1];
1183 abc[0]++;
1184 }
1185 }
1186 }
1187
1188 Mem.myfree(buffer_recv);
1189 Mem.myfree(buffer_send);
1190 }
1191 else
1192 break;
1193
1194 iter++;
1195
1196 if(iter > 20)
1197 Terminate("high number of iterations: limit=%lld", (long long)limit);
1198 }
1199 while(tot_count_send < count_send[target] || tot_count_recv < count_recv[target]);
1200 }
1201 }
1202 if(just_count_flag)
1203 myMPI_Alltoall(count_send, sizeof(size_t), MPI_BYTE, count_recv, sizeof(size_t), MPI_BYTE, Communicator);
1204}
1205
1206#endif
1207
1208#endif
size_t FreeBytes
Definition: mymalloc.h:48
void my_slab_based_fft_free(fft_plan *plan)
void my_column_based_fft_free(fft_plan *plan)
void my_column_based_fft_init(fft_plan *plan, int NgridX, int NgridY, int NgridZ)
void my_column_based_fft(fft_plan *plan, void *data, void *workspace, int forward)
void my_fft_swap13back(fft_plan *plan, fft_real *data, fft_real *out)
void my_fft_swap13(fft_plan *plan, fft_real *data, fft_real *out)
void my_slab_transposeB(fft_plan *plan, fft_real *field, fft_real *scratch)
void my_slab_based_fft(fft_plan *plan, void *data, void *workspace, int forward)
void my_fft_swap23(fft_plan *plan, fft_real *data, fft_real *out)
void my_fft_swap23back(fft_plan *plan, fft_real *data, fft_real *out)
void my_slab_transposeA(fft_plan *plan, fft_real *field, fft_real *scratch)
void my_slab_based_fft_init(fft_plan *plan, int NgridX, int NgridY, int NgridZ)
int ThisTask
Definition: setcomm.h:33
int NTask
Definition: setcomm.h:32
int PTask
Definition: setcomm.h:34
MPI_Comm Communicator
Definition: setcomm.h:31
#define MPI_MESSAGE_SIZELIMIT_IN_BYTES
Definition: constants.h:53
#define Terminate(...)
Definition: macros.h:15
#define TAG_DENS_A
Definition: mpi_utils.h:50
int myMPI_Alltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm)
Definition: myalltoall.cc:301
int myMPI_Sendrecv(void *sendbuf, size_t sendcount, MPI_Datatype sendtype, int dest, int sendtag, void *recvbuf, size_t recvcount, MPI_Datatype recvtype, int source, int recvtag, MPI_Comm comm, MPI_Status *status)
void myMPI_Alltoallv(void *sendbuf, size_t *sendcounts, size_t *sdispls, void *recvbuf, size_t *recvcounts, size_t *rdispls, int len, int big_flag, MPI_Comm comm)
Definition: myalltoall.cc:181
#define TAG_DENS_B
Definition: mpi_utils.h:51
memory Mem
Definition: main.cc:44
#define FFTW(x)
Definition: pm_mpi_fft.h:22
void subdivide_evenly(long long N, int pieces, int index_bin, long long *first, int *count)
Definition: system.cc:51