GADGET-4: pm_periodic.cc
/*******************************************************************************
 * \copyright This file is part of the GADGET4 N-body/SPH code developed
 * \copyright by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
 * \copyright (vspringel@mpa-garching.mpg.de) and all contributing authors.
 *******************************************************************************/

#include "gadgetconfig.h"

#if defined(PMGRID) && defined(PERIODIC)

#include <fftw3.h>
#include <math.h>
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <algorithm>

#include "../data/allvars.h"
#include "../data/dtypes.h"
#include "../data/intposconvert.h"
#include "../data/mymalloc.h"
#include "../domain/domain.h"
#include "../logs/timer.h"
#include "../main/simulation.h"
#include "../mpi_utils/mpi_utils.h"
#include "../pm/pm.h"
#include "../pm/pm_periodic.h"
#include "../sort/cxxsort.h"
#include "../system/system.h"
#include "../time_integration/timestep.h"

#define GRIDz (GRIDZ / 2 + 1)
#define GRID2 (2 * GRIDz)

/* short-cut macros for accessing different 3D arrays */
#define FI(x, y, z) (((large_array_offset)GRID2) * (GRIDY * (x) + (y)) + (z))
#define FCxy(c, z) (((large_array_offset)GRID2) * ((c) - myplan.firstcol_XY) + (z))
#define FCxz(c, y) (((large_array_offset)GRIDY) * ((c) - myplan.firstcol_XZ) + (y))
#define FCzy(c, x) (((large_array_offset)GRIDX) * ((c) - myplan.firstcol_ZY) + (x))

#ifndef FFT_COLUMN_BASED
#define NI(x, y, z) (((large_array_offset)GRIDZ) * ((y) + (x) * myplan.nslab_y) + (z))
#endif
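
/* A short orientation on these indexing macros (a summary inferred from the
 * definitions above, not from the original sources): FI() addresses the
 * slab-decomposed 3D field, where each task holds a range of x-slabs and the
 * z-dimension is padded to GRID2 real elements to accommodate the complex
 * output of the r2c FFT. FCxy()/FCxz()/FCzy() address column-decomposed
 * layouts, in which each task holds a contiguous range of 1D columns of the
 * given plane starting at myplan.firstcol_*. NI() addresses the y-transposed
 * slab layout used by the slab-based FFT.
 */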

void pm_periodic::pm_init_periodic(simparticles *Sp_ptr)
{
  Sp = Sp_ptr;

  Sp->Asmth[0] = ASMTH * All.BoxSize / PMGRID;
  Sp->Rcut[0]  = RCUT * Sp->Asmth[0];

  /* Set up the FFTW-3 plans. */
  int ndimx[1] = {GRIDX}; /* dimension of the 1D transforms */
  int ndimy[1] = {GRIDY}; /* dimension of the 1D transforms */
  int ndimz[1] = {GRIDZ}; /* dimension of the 1D transforms */

  int max_GRID2 = 2 * (std::max<int>(std::max<int>(GRIDX, GRIDY), GRIDZ) / 2 + 1);

  /* temporarily allocate some arrays to make sure that out-of-place plans are created */
  rhogrid   = (fft_real *)Mem.mymalloc("rhogrid", max_GRID2 * sizeof(fft_real));
  forcegrid = (fft_real *)Mem.mymalloc("forcegrid", max_GRID2 * sizeof(fft_real));

#ifdef DOUBLEPRECISION_FFTW
  int alignflag = 0;
#else
  /* for single precision, the start of our FFT columns is presently only guaranteed to be 8-byte aligned */
  int alignflag = FFTW_UNALIGNED;
#endif

  myplan.forward_plan_zdir = FFTW(plan_many_dft_r2c)(1, ndimz, 1, rhogrid, 0, 1, GRID2, (fft_complex *)forcegrid, 0, 1, GRIDz,
                                                     FFTW_ESTIMATE | FFTW_DESTROY_INPUT | alignflag);

#ifndef FFT_COLUMN_BASED
  int stride = GRIDz;
#else
  int stride = 1;
#endif

  myplan.forward_plan_ydir =
      FFTW(plan_many_dft)(1, ndimy, 1, (fft_complex *)rhogrid, 0, stride, GRIDz * GRIDY, (fft_complex *)forcegrid, 0, stride,
                          GRIDz * GRIDY, FFTW_FORWARD, FFTW_ESTIMATE | FFTW_DESTROY_INPUT | alignflag);

  myplan.forward_plan_xdir =
      FFTW(plan_many_dft)(1, ndimx, 1, (fft_complex *)rhogrid, 0, stride, GRIDz * GRIDX, (fft_complex *)forcegrid, 0, stride,
                          GRIDz * GRIDX, FFTW_FORWARD, FFTW_ESTIMATE | FFTW_DESTROY_INPUT | alignflag);

  myplan.backward_plan_xdir =
      FFTW(plan_many_dft)(1, ndimx, 1, (fft_complex *)rhogrid, 0, stride, GRIDz * GRIDX, (fft_complex *)forcegrid, 0, stride,
                          GRIDz * GRIDX, FFTW_BACKWARD, FFTW_ESTIMATE | FFTW_DESTROY_INPUT | alignflag);

  myplan.backward_plan_ydir =
      FFTW(plan_many_dft)(1, ndimy, 1, (fft_complex *)rhogrid, 0, stride, GRIDz * GRIDY, (fft_complex *)forcegrid, 0, stride,
                          GRIDz * GRIDY, FFTW_BACKWARD, FFTW_ESTIMATE | FFTW_DESTROY_INPUT | alignflag);

  myplan.backward_plan_zdir = FFTW(plan_many_dft_c2r)(1, ndimz, 1, (fft_complex *)rhogrid, 0, 1, GRIDz, forcegrid, 0, 1, GRID2,
                                                      FFTW_ESTIMATE | FFTW_DESTROY_INPUT | alignflag);

  Mem.myfree(forcegrid);
  Mem.myfree(rhogrid);

#ifndef FFT_COLUMN_BASED

  my_slab_based_fft_init(&myplan, GRIDX, GRIDY, GRIDZ);

  maxfftsize = std::max<int>(myplan.largest_x_slab * GRIDY, myplan.largest_y_slab * GRIDX) * ((size_t)GRID2);

#else

  my_column_based_fft_init(&myplan, GRIDX, GRIDY, GRIDZ);

  maxfftsize = myplan.max_datasize;

#endif

#if defined(GRAVITY_TALLBOX)
  kernel        = (fft_real *)Mem.mymalloc("kernel", maxfftsize * sizeof(fft_real));
  fft_of_kernel = (fft_complex *)kernel;

  pmforce_setup_tallbox_kernel();
#endif
}

/* Below, the two functions
 *
 *   pmforce_ ...... _prepare_density()
 * and
 *   pmforce_ ...... _readout_forces_or_potential(int dim)
 *
 * are defined in two different versions, one that works better for uniform
 * simulations, the other for zoom-in runs. Only one of the two sets is used,
 * depending on the setting of PM_ZOOM_OPTIMIZED.
 */
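
/* Both versions use clouds-in-cells (CIC) interpolation: each particle
 * contributes trilinear weights to the 2x2x2 mesh cells surrounding its
 * position, and forces/potentials are later read out with the same weights,
 * which keeps mass assignment and force interpolation mutually consistent.
 */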

#ifdef PM_ZOOM_OPTIMIZED

void pm_periodic::pmforce_zoom_optimized_prepare_density(int mode, int *typelist)
{
  int level, recvTask;
  MPI_Status status;

  particle_data *P = Sp->P;

  part = (part_slab_data *)Mem.mymalloc("part", 8 * (NSource * sizeof(part_slab_data)));
  large_numpart_type *part_sortindex =
      (large_numpart_type *)Mem.mymalloc("part_sortindex", 8 * (NSource * sizeof(large_numpart_type)));

#ifdef FFT_COLUMN_BASED
  int columns         = GRIDX * GRIDY;
  int avg             = (columns - 1) / NTask + 1;
  int exc             = NTask * avg - columns;
  int tasklastsection = NTask - exc;
  int pivotcol        = tasklastsection * avg;
#endif
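
  /* A worked example of this column-to-task mapping (my illustration, not from
   * the original sources): with columns = 10 and NTask = 4 one gets avg = 3,
   * exc = 2, tasklastsection = 2 and pivotcol = 6. The first two tasks then
   * hold avg = 3 columns each (0-2 and 3-5), the last two tasks avg-1 = 2
   * columns each (6-7 and 8-9), so all columns are covered with a maximum
   * imbalance of one column. The branch "column / avg" vs.
   * "(column - pivotcol) / (avg - 1) + tasklastsection" used repeatedly below
   * implements exactly this split.
   */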

  /* determine the cells each particle accesses */
  for(int idx = 0; idx < NSource; idx++)
    {
      int i = Sp->get_active_index(idx);

      if(P[i].Ti_Current != All.Ti_Current)
        Sp->drift_particle(&P[i], &Sp->SphP[i], All.Ti_Current);

      int slab_x, slab_y, slab_z;
      if(mode == 2)
        {
          slab_x = (P[i].IntPos[0] * POWERSPEC_FOLDFAC) / INTCELL;
          slab_y = (P[i].IntPos[1] * POWERSPEC_FOLDFAC) / INTCELL;
          slab_z = (P[i].IntPos[2] * POWERSPEC_FOLDFAC) / INTCELL;
        }
      else if(mode == 3)
        {
          slab_x = (P[i].IntPos[0] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) / INTCELL;
          slab_y = (P[i].IntPos[1] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) / INTCELL;
          slab_z = (P[i].IntPos[2] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) / INTCELL;
        }
      else
        {
          slab_x = P[i].IntPos[0] / INTCELL;
          slab_y = P[i].IntPos[1] / INTCELL;
          slab_z = P[i].IntPos[2] / INTCELL;
        }

      large_numpart_type index_on_grid = ((large_numpart_type)idx) << 3;

      for(int xx = 0; xx < 2; xx++)
        for(int yy = 0; yy < 2; yy++)
          for(int zz = 0; zz < 2; zz++)
            {
              int slab_xx = slab_x + xx;
              int slab_yy = slab_y + yy;
              int slab_zz = slab_z + zz;

              if(slab_xx >= GRIDX)
                slab_xx = 0;
              if(slab_yy >= GRIDY)
                slab_yy = 0;
              if(slab_zz >= GRIDZ)
                slab_zz = 0;

              large_array_offset offset = FI(slab_xx, slab_yy, slab_zz);

              part[index_on_grid].partindex   = (i << 3) + (xx << 2) + (yy << 1) + zz;
              part[index_on_grid].globalindex = offset;
              part_sortindex[index_on_grid]   = index_on_grid;
              index_on_grid++;
            }
    }
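  /* Each particle thus owns eight consecutive entries in part[]. The partindex
   * field packs the particle index in the upper bits and the (xx, yy, zz) cell
   * offset in the lowest three bits, so both can be recovered later via
   * "partindex >> 3" and bit masks, while globalindex records the absolute
   * mesh cell the entry refers to.
   */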

  /* note: num_on_grid will be 8 times larger than the particle number, but num_field_points will generally be much smaller */

  large_array_offset num_field_points;
  large_numpart_type num_on_grid = ((large_numpart_type)NSource) << 3;

  /* bring the part-field into the order of the accessed cells. This allows the removal of duplicates */

  mycxxsort(part_sortindex, part_sortindex + num_on_grid, pm_periodic_sortindex_comparator(part));

  if(num_on_grid > 0)
    num_field_points = 1;
  else
    num_field_points = 0;

  /* determine the number of unique field points */
  for(large_numpart_type i = 1; i < num_on_grid; i++)
    {
      if(part[part_sortindex[i]].globalindex != part[part_sortindex[i - 1]].globalindex)
        num_field_points++;
    }

  /* allocate the local field */
  localfield_globalindex = (large_array_offset *)Mem.mymalloc_movable(&localfield_globalindex, "localfield_globalindex",
                                                                      num_field_points * sizeof(large_array_offset));
  localfield_data      = (fft_real *)Mem.mymalloc_movable(&localfield_data, "localfield_data", num_field_points * sizeof(fft_real));
  localfield_first     = (size_t *)Mem.mymalloc_movable(&localfield_first, "localfield_first", NTask * sizeof(size_t));
  localfield_sendcount = (size_t *)Mem.mymalloc_movable(&localfield_sendcount, "localfield_sendcount", NTask * sizeof(size_t));
  localfield_offset    = (size_t *)Mem.mymalloc_movable(&localfield_offset, "localfield_offset", NTask * sizeof(size_t));
  localfield_recvcount = (size_t *)Mem.mymalloc_movable(&localfield_recvcount, "localfield_recvcount", NTask * sizeof(size_t));

  for(int i = 0; i < NTask; i++)
    {
      localfield_first[i]     = 0;
      localfield_sendcount[i] = 0;
    }

  /* establish the cross link between the part[] array and the local list of
   * mesh points. Also, count on which CPU the needed field points are stored.
   */
  num_field_points = 0;
  for(large_numpart_type i = 0; i < num_on_grid; i++)
    {
      if(i > 0)
        if(part[part_sortindex[i]].globalindex != part[part_sortindex[i - 1]].globalindex)
          num_field_points++;

      part[part_sortindex[i]].localindex = num_field_points;

      if(i > 0)
        if(part[part_sortindex[i]].globalindex == part[part_sortindex[i - 1]].globalindex)
          continue;

      localfield_globalindex[num_field_points] = part[part_sortindex[i]].globalindex;

#ifndef FFT_COLUMN_BASED
      int slab = part[part_sortindex[i]].globalindex / (GRIDY * GRID2);
      int task = myplan.slab_to_task[slab];
#else
      int task, column = part[part_sortindex[i]].globalindex / (GRID2);

      if(column < pivotcol)
        task = column / avg;
      else
        task = (column - pivotcol) / (avg - 1) + tasklastsection;
#endif

      if(localfield_sendcount[task] == 0)
        localfield_first[task] = num_field_points;

      localfield_sendcount[task]++;
    }
  num_field_points++;

  localfield_offset[0] = 0;
  for(int i = 1; i < NTask; i++)
    localfield_offset[i] = localfield_offset[i - 1] + localfield_sendcount[i - 1];

  Mem.myfree_movable(part_sortindex);
  part_sortindex = NULL;

  /* now bin the local particle data onto the mesh list */
  for(large_numpart_type i = 0; i < num_field_points; i++)
    localfield_data[i] = 0;

  for(large_numpart_type i = 0; i < num_on_grid; i += 8)
    {
      int pindex = (part[i].partindex >> 3);
      MyIntPosType rmd_x, rmd_y, rmd_z;

      if(mode == 2)
        {
          rmd_x = (P[pindex].IntPos[0] * POWERSPEC_FOLDFAC) % INTCELL;
          rmd_y = (P[pindex].IntPos[1] * POWERSPEC_FOLDFAC) % INTCELL;
          rmd_z = (P[pindex].IntPos[2] * POWERSPEC_FOLDFAC) % INTCELL;
        }
      else if(mode == 3)
        {
          rmd_x = (P[pindex].IntPos[0] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) % INTCELL;
          rmd_y = (P[pindex].IntPos[1] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) % INTCELL;
          rmd_z = (P[pindex].IntPos[2] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) % INTCELL;
        }
      else
        {
          rmd_x = P[pindex].IntPos[0] % INTCELL;
          rmd_y = P[pindex].IntPos[1] % INTCELL;
          rmd_z = P[pindex].IntPos[2] % INTCELL;
        }

      double dx = rmd_x * (1.0 / INTCELL);
      double dy = rmd_y * (1.0 / INTCELL);
      double dz = rmd_z * (1.0 / INTCELL);

      double weight = P[pindex].getMass();

      if(mode) /* only for power spectrum calculation */
        if(typelist[P[pindex].getType()] == 0)
          continue;

      localfield_data[part[i + 0].localindex] += weight * (1.0 - dx) * (1.0 - dy) * (1.0 - dz);
      localfield_data[part[i + 1].localindex] += weight * (1.0 - dx) * (1.0 - dy) * dz;
      localfield_data[part[i + 2].localindex] += weight * (1.0 - dx) * dy * (1.0 - dz);
      localfield_data[part[i + 3].localindex] += weight * (1.0 - dx) * dy * dz;
      localfield_data[part[i + 4].localindex] += weight * dx * (1.0 - dy) * (1.0 - dz);
      localfield_data[part[i + 5].localindex] += weight * dx * (1.0 - dy) * dz;
      localfield_data[part[i + 6].localindex] += weight * dx * dy * (1.0 - dz);
      localfield_data[part[i + 7].localindex] += weight * dx * dy * dz;
    }
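  /* Since (1-dx) + dx = 1 in each dimension, the eight CIC weights above sum
   * to exactly the particle mass, so the total mass deposited on the mesh is
   * conserved by construction.
   */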

  rhogrid = (fft_real *)Mem.mymalloc_clear("rhogrid", maxfftsize * sizeof(fft_real));

  /* exchange data and add contributions to the local mesh patch */
  myMPI_Alltoall(localfield_sendcount, sizeof(size_t), MPI_BYTE, localfield_recvcount, sizeof(size_t), MPI_BYTE, Communicator);

  for(level = 0; level < (1 << PTask); level++) /* note: for level=0, target is the same task */
    {
      recvTask = ThisTask ^ level;

      if(recvTask < NTask)
        {
          if(level > 0)
            {
              import_data        = (fft_real *)Mem.mymalloc("import_data", localfield_recvcount[recvTask] * sizeof(fft_real));
              import_globalindex = (large_array_offset *)Mem.mymalloc("import_globalindex",
                                                                      localfield_recvcount[recvTask] * sizeof(large_array_offset));

              if(localfield_sendcount[recvTask] > 0 || localfield_recvcount[recvTask] > 0)
                {
                  myMPI_Sendrecv(localfield_data + localfield_offset[recvTask], localfield_sendcount[recvTask] * sizeof(fft_real),
                                 MPI_BYTE, recvTask, TAG_NONPERIOD_A, import_data, localfield_recvcount[recvTask] * sizeof(fft_real),
                                 MPI_BYTE, recvTask, TAG_NONPERIOD_A, Communicator, &status);

                  myMPI_Sendrecv(localfield_globalindex + localfield_offset[recvTask],
                                 localfield_sendcount[recvTask] * sizeof(large_array_offset), MPI_BYTE, recvTask, TAG_NONPERIOD_B,
                                 import_globalindex, localfield_recvcount[recvTask] * sizeof(large_array_offset), MPI_BYTE, recvTask,
                                 TAG_NONPERIOD_B, Communicator, &status);
                }
            }
          else
            {
              import_data        = localfield_data + localfield_offset[ThisTask];
              import_globalindex = localfield_globalindex + localfield_offset[ThisTask];
            }

          /* note: here every element in rhogrid is only accessed once, so there should be no race condition */
          for(size_t i = 0; i < localfield_recvcount[recvTask]; i++)
            {
              /* determine offset in local FFT slab */
#ifndef FFT_COLUMN_BASED
              large_array_offset offset =
                  import_globalindex[i] - myplan.first_slab_x_of_task[ThisTask] * GRIDY * ((large_array_offset)GRID2);
#else
              large_array_offset offset = import_globalindex[i] - myplan.firstcol_XY * ((large_array_offset)GRID2);
#endif
              rhogrid[offset] += import_data[i];
            }

          if(level > 0)
            {
              Mem.myfree(import_globalindex);
              Mem.myfree(import_data);
            }
        }
    }
}

/* Function to read out the force component corresponding to spatial dimension 'dim'.
 * If dim is negative, potential values are read out and assigned to the particles.
 */
void pm_periodic::pmforce_zoom_optimized_readout_forces_or_potential(fft_real *grid, int dim)
{
  particle_data *P = Sp->P;

#ifdef EVALPOTENTIAL
#ifdef GRAVITY_TALLBOX
  double fac = 1.0 / (((double)GRIDX) * GRIDY * GRIDZ); /* to get potential */
#else
  double fac = 4.0 * M_PI * (LONG_X * LONG_Y * LONG_Z) / pow(All.BoxSize, 3); /* to get potential */
#endif
#endif

  for(int level = 0; level < (1 << PTask); level++) /* note: for level=0, target is the same task */
    {
      int recvTask = ThisTask ^ level;

      if(recvTask < NTask)
        {
          if(level > 0)
            {
              import_data        = (fft_real *)Mem.mymalloc("import_data", localfield_recvcount[recvTask] * sizeof(fft_real));
              import_globalindex = (large_array_offset *)Mem.mymalloc("import_globalindex",
                                                                      localfield_recvcount[recvTask] * sizeof(large_array_offset));

              if(localfield_sendcount[recvTask] > 0 || localfield_recvcount[recvTask] > 0)
                {
                  MPI_Status status;
                  myMPI_Sendrecv(localfield_globalindex + localfield_offset[recvTask],
                                 localfield_sendcount[recvTask] * sizeof(large_array_offset), MPI_BYTE, recvTask, TAG_NONPERIOD_C,
                                 import_globalindex, localfield_recvcount[recvTask] * sizeof(large_array_offset), MPI_BYTE, recvTask,
                                 TAG_NONPERIOD_C, Communicator, &status);
                }
            }
          else
            {
              import_data        = localfield_data + localfield_offset[ThisTask];
              import_globalindex = localfield_globalindex + localfield_offset[ThisTask];
            }

          for(size_t i = 0; i < localfield_recvcount[recvTask]; i++)
            {
#ifndef FFT_COLUMN_BASED
              large_array_offset offset =
                  import_globalindex[i] - myplan.first_slab_x_of_task[ThisTask] * GRIDY * ((large_array_offset)GRID2);
#else
              large_array_offset offset = import_globalindex[i] - myplan.firstcol_XY * ((large_array_offset)GRID2);
#endif
              import_data[i] = grid[offset];
            }

          if(level > 0)
            {
              MPI_Status status;
              myMPI_Sendrecv(import_data, localfield_recvcount[recvTask] * sizeof(fft_real), MPI_BYTE, recvTask, TAG_NONPERIOD_A,
                             localfield_data + localfield_offset[recvTask], localfield_sendcount[recvTask] * sizeof(fft_real),
                             MPI_BYTE, recvTask, TAG_NONPERIOD_A, Communicator, &status);

              Mem.myfree(import_globalindex);
              Mem.myfree(import_data);
            }
        }
    }

  /* read out the force/potential values, which have all been assembled in localfield_data */
  for(int idx = 0; idx < NSource; idx++)
    {
      int i = Sp->get_active_index(idx);

#if !defined(HIERARCHICAL_GRAVITY) && defined(TREEPM_NOTIMESPLIT)
      if(!Sp->TimeBinSynchronized[P[i].TimeBinGrav])
        continue;
#endif

      large_numpart_type j = (idx << 3);

      MyIntPosType rmd_x = P[i].IntPos[0] % INTCELL;
      MyIntPosType rmd_y = P[i].IntPos[1] % INTCELL;
      MyIntPosType rmd_z = P[i].IntPos[2] % INTCELL;

      double dx = rmd_x * (1.0 / INTCELL);
      double dy = rmd_y * (1.0 / INTCELL);
      double dz = rmd_z * (1.0 / INTCELL);

      double value = localfield_data[part[j + 0].localindex] * (1.0 - dx) * (1.0 - dy) * (1.0 - dz) +
                     localfield_data[part[j + 1].localindex] * (1.0 - dx) * (1.0 - dy) * dz +
                     localfield_data[part[j + 2].localindex] * (1.0 - dx) * dy * (1.0 - dz) +
                     localfield_data[part[j + 3].localindex] * (1.0 - dx) * dy * dz +
                     localfield_data[part[j + 4].localindex] * dx * (1.0 - dy) * (1.0 - dz) +
                     localfield_data[part[j + 5].localindex] * dx * (1.0 - dy) * dz +
                     localfield_data[part[j + 6].localindex] * dx * dy * (1.0 - dz) +
                     localfield_data[part[j + 7].localindex] * dx * dy * dz;

      if(dim < 0)
        {
#ifdef EVALPOTENTIAL
#if defined(PERIODIC) && !defined(TREEPM_NOTIMESPLIT)
          P[i].PM_Potential += value * fac;
#else
          P[i].Potential += value * fac;
#endif
#endif
        }
      else
        {
#if defined(PERIODIC) && !defined(TREEPM_NOTIMESPLIT)
          Sp->P[i].GravPM[dim] += value;
#else
          Sp->P[i].GravAccel[dim] += value;
#endif
        }
    }
}

#else

/*
 * Here come the routines for a different communication algorithm that is better suited for homogeneously loaded boxes.
 */
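
/* Note on the structure below (my summary of the code that follows): the
 * density assignment makes two passes over the active particles. In the first
 * pass (rep == 0) only the number of mesh entries destined for each task is
 * counted; after an all-to-all exchange of these counts the send/receive
 * buffers are allocated, and the second pass (rep == 1) fills partout in
 * exactly the same traversal order, so the offsets derived from the counts
 * remain valid.
 */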

void pm_periodic::pmforce_uniform_optimized_prepare_density(int mode, int *typelist)
{
  Sndpm_count  = (size_t *)Mem.mymalloc("Sndpm_count", NTask * sizeof(size_t));
  Sndpm_offset = (size_t *)Mem.mymalloc("Sndpm_offset", NTask * sizeof(size_t));
  Rcvpm_count  = (size_t *)Mem.mymalloc("Rcvpm_count", NTask * sizeof(size_t));
  Rcvpm_offset = (size_t *)Mem.mymalloc("Rcvpm_offset", NTask * sizeof(size_t));

  particle_data *P = Sp->P;

  /* determine the slabs/columns each particle accesses */

#ifdef FFT_COLUMN_BASED
  int columns         = GRIDX * GRIDY;
  int avg             = (columns - 1) / NTask + 1;
  int exc             = NTask * avg - columns;
  int tasklastsection = NTask - exc;
  int pivotcol        = tasklastsection * avg;
#endif

  for(int rep = 0; rep < 2; rep++)
    {
      /* each thread needs to do the loop to clear its send_count[] array */
      for(int j = 0; j < NTask; j++)
        Sndpm_count[j] = 0;

      for(int idx = 0; idx < NSource; idx++)
        {
          int i = Sp->get_active_index(idx);

          if(P[i].Ti_Current != All.Ti_Current)
            Sp->drift_particle(&Sp->P[i], &Sp->SphP[i], All.Ti_Current);

          if(mode) /* only for power spectrum calculation */
            if(typelist[P[i].getType()] == 0)
              continue;

          int slab_x;
          if(mode == 2)
            slab_x = (P[i].IntPos[0] * POWERSPEC_FOLDFAC) / INTCELL;
          else if(mode == 3)
            slab_x = (P[i].IntPos[0] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) / INTCELL;
          else
            slab_x = P[i].IntPos[0] / INTCELL;

          int slab_xx = slab_x + 1;

          if(slab_xx >= GRIDX)
            slab_xx = 0;

#ifndef FFT_COLUMN_BASED
          if(rep == 0)
            {
              int task0 = myplan.slab_to_task[slab_x];
              int task1 = myplan.slab_to_task[slab_xx];

              Sndpm_count[task0]++;

              if(task0 != task1)
                Sndpm_count[task1]++;
            }
          else
            {
              int task0 = myplan.slab_to_task[slab_x];
              int task1 = myplan.slab_to_task[slab_xx];

              size_t ind0 = Sndpm_offset[task0] + Sndpm_count[task0]++;
#ifndef LEAN
              partout[ind0].Mass = P[i].getMass();
#endif
              for(int j = 0; j < 3; j++)
                partout[ind0].IntPos[j] = P[i].IntPos[j];

              if(task0 != task1)
                {
                  size_t ind1 = Sndpm_offset[task1] + Sndpm_count[task1]++;
#ifndef LEAN
                  partout[ind1].Mass = P[i].getMass();
#endif
                  for(int j = 0; j < 3; j++)
                    partout[ind1].IntPos[j] = P[i].IntPos[j];
                }
            }

#else
          int slab_y;
          if(mode == 2)
            slab_y = (P[i].IntPos[1] * POWERSPEC_FOLDFAC) / INTCELL;
          else if(mode == 3)
            slab_y = (P[i].IntPos[1] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) / INTCELL;
          else
            slab_y = P[i].IntPos[1] / INTCELL;

          int slab_yy = slab_y + 1;

          if(slab_yy >= GRIDY)
            slab_yy = 0;

          int column0 = slab_x * GRIDY + slab_y;
          int column1 = slab_x * GRIDY + slab_yy;
          int column2 = slab_xx * GRIDY + slab_y;
          int column3 = slab_xx * GRIDY + slab_yy;

          int task0, task1, task2, task3;

          if(column0 < pivotcol)
            task0 = column0 / avg;
          else
            task0 = (column0 - pivotcol) / (avg - 1) + tasklastsection;

          if(column1 < pivotcol)
            task1 = column1 / avg;
          else
            task1 = (column1 - pivotcol) / (avg - 1) + tasklastsection;

          if(column2 < pivotcol)
            task2 = column2 / avg;
          else
            task2 = (column2 - pivotcol) / (avg - 1) + tasklastsection;

          if(column3 < pivotcol)
            task3 = column3 / avg;
          else
            task3 = (column3 - pivotcol) / (avg - 1) + tasklastsection;

          if(rep == 0)
            {
              Sndpm_count[task0]++;
              if(task1 != task0)
                Sndpm_count[task1]++;
              if(task2 != task1 && task2 != task0)
                Sndpm_count[task2]++;
              if(task3 != task0 && task3 != task1 && task3 != task2)
                Sndpm_count[task3]++;
            }
          else
            {
              size_t ind0 = Sndpm_offset[task0] + Sndpm_count[task0]++;
#ifndef LEAN
              partout[ind0].Mass = P[i].getMass();
#endif
              for(int j = 0; j < 3; j++)
                partout[ind0].IntPos[j] = P[i].IntPos[j];

              if(task1 != task0)
                {
                  size_t ind1 = Sndpm_offset[task1] + Sndpm_count[task1]++;
#ifndef LEAN
                  partout[ind1].Mass = P[i].getMass();
#endif
                  for(int j = 0; j < 3; j++)
                    partout[ind1].IntPos[j] = P[i].IntPos[j];
                }
              if(task2 != task1 && task2 != task0)
                {
                  size_t ind2 = Sndpm_offset[task2] + Sndpm_count[task2]++;
#ifndef LEAN
                  partout[ind2].Mass = P[i].getMass();
#endif
                  for(int j = 0; j < 3; j++)
                    partout[ind2].IntPos[j] = P[i].IntPos[j];
                }
              if(task3 != task0 && task3 != task1 && task3 != task2)
                {
                  size_t ind3 = Sndpm_offset[task3] + Sndpm_count[task3]++;
#ifndef LEAN
                  partout[ind3].Mass = P[i].getMass();
#endif
                  for(int j = 0; j < 3; j++)
                    partout[ind3].IntPos[j] = P[i].IntPos[j];
                }
            }
#endif
        }

      if(rep == 0)
        {
          myMPI_Alltoall(Sndpm_count, sizeof(size_t), MPI_BYTE, Rcvpm_count, sizeof(size_t), MPI_BYTE, Communicator);

          nimport = 0, nexport = 0, Rcvpm_offset[0] = 0, Sndpm_offset[0] = 0;
          for(int j = 0; j < NTask; j++)
            {
              nexport += Sndpm_count[j];
              nimport += Rcvpm_count[j];

              if(j > 0)
                {
                  Sndpm_offset[j] = Sndpm_offset[j - 1] + Sndpm_count[j - 1];
                  Rcvpm_offset[j] = Rcvpm_offset[j - 1] + Rcvpm_count[j - 1];
                }
            }

          /* allocate import and export buffer */
          partin  = (partbuf *)Mem.mymalloc_movable(&partin, "partin", nimport * sizeof(partbuf));
          partout = (partbuf *)Mem.mymalloc("partout", nexport * sizeof(partbuf));
        }
    }

  /* produce a flag if any of the send sizes is above our transfer limit, in this case we will
   * transfer the data in chunks.
   */
  int flag_big = 0, flag_big_all;
  for(int i = 0; i < NTask; i++)
    if(Sndpm_count[i] * sizeof(partbuf) > MPI_MESSAGE_SIZELIMIT_IN_BYTES)
      flag_big = 1;

  MPI_Allreduce(&flag_big, &flag_big_all, 1, MPI_INT, MPI_MAX, Communicator);

  /* exchange particle data */
  myMPI_Alltoallv(partout, Sndpm_count, Sndpm_offset, partin, Rcvpm_count, Rcvpm_offset, sizeof(partbuf), flag_big_all, Communicator);

  Mem.myfree(partout);

  /* allocate cleared density field */
  rhogrid = (fft_real *)Mem.mymalloc_movable_clear(&rhogrid, "rhogrid", maxfftsize * sizeof(fft_real));

#ifndef FFT_COLUMN_BASED
  /* bin particle data onto mesh */

  for(size_t i = 0; i < nimport; i++)
    {
      int slab_x, slab_y, slab_z;
      MyIntPosType rmd_x, rmd_y, rmd_z;

      if(mode == 2)
        {
          slab_x = (partin[i].IntPos[0] * POWERSPEC_FOLDFAC) / INTCELL;
          rmd_x  = (partin[i].IntPos[0] * POWERSPEC_FOLDFAC) % INTCELL;
          slab_y = (partin[i].IntPos[1] * POWERSPEC_FOLDFAC) / INTCELL;
          rmd_y  = (partin[i].IntPos[1] * POWERSPEC_FOLDFAC) % INTCELL;
          slab_z = (partin[i].IntPos[2] * POWERSPEC_FOLDFAC) / INTCELL;
          rmd_z  = (partin[i].IntPos[2] * POWERSPEC_FOLDFAC) % INTCELL;
        }
      else if(mode == 3)
        {
          slab_x = (partin[i].IntPos[0] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) / INTCELL;
          rmd_x  = (partin[i].IntPos[0] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) % INTCELL;
          slab_y = (partin[i].IntPos[1] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) / INTCELL;
          rmd_y  = (partin[i].IntPos[1] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) % INTCELL;
          slab_z = (partin[i].IntPos[2] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) / INTCELL;
          rmd_z  = (partin[i].IntPos[2] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) % INTCELL;
        }
      else
        {
          slab_x = partin[i].IntPos[0] / INTCELL;
          rmd_x  = partin[i].IntPos[0] % INTCELL;
          slab_y = partin[i].IntPos[1] / INTCELL;
          rmd_y  = partin[i].IntPos[1] % INTCELL;
          slab_z = partin[i].IntPos[2] / INTCELL;
          rmd_z  = partin[i].IntPos[2] % INTCELL;
        }

      double dx = rmd_x * (1.0 / INTCELL);
      double dy = rmd_y * (1.0 / INTCELL);
      double dz = rmd_z * (1.0 / INTCELL);

      int slab_xx = slab_x + 1;
      int slab_yy = slab_y + 1;
      int slab_zz = slab_z + 1;

      if(slab_xx >= GRIDX)
        slab_xx = 0;
      if(slab_yy >= GRIDY)
        slab_yy = 0;
      if(slab_zz >= GRIDZ)
        slab_zz = 0;

#ifdef LEAN
      double mass = All.PartMass;
#else
      double mass = partin[i].Mass;
#endif

      if(myplan.slab_to_task[slab_x] == ThisTask)
        {
          slab_x -= myplan.first_slab_x_of_task[ThisTask];

          rhogrid[FI(slab_x, slab_y, slab_z)] += (mass * (1.0 - dx) * (1.0 - dy) * (1.0 - dz));
          rhogrid[FI(slab_x, slab_y, slab_zz)] += (mass * (1.0 - dx) * (1.0 - dy) * (dz));

          rhogrid[FI(slab_x, slab_yy, slab_z)] += (mass * (1.0 - dx) * (dy) * (1.0 - dz));
          rhogrid[FI(slab_x, slab_yy, slab_zz)] += (mass * (1.0 - dx) * (dy) * (dz));
        }

      if(myplan.slab_to_task[slab_xx] == ThisTask)
        {
          slab_xx -= myplan.first_slab_x_of_task[ThisTask];

          rhogrid[FI(slab_xx, slab_y, slab_z)] += (mass * (dx) * (1.0 - dy) * (1.0 - dz));
          rhogrid[FI(slab_xx, slab_y, slab_zz)] += (mass * (dx) * (1.0 - dy) * (dz));

          rhogrid[FI(slab_xx, slab_yy, slab_z)] += (mass * (dx) * (dy) * (1.0 - dz));
          rhogrid[FI(slab_xx, slab_yy, slab_zz)] += (mass * (dx) * (dy) * (dz));
        }
    }

#else

  int first_col = myplan.firstcol_XY;
  int last_col  = myplan.firstcol_XY + myplan.ncol_XY - 1;

  for(size_t i = 0; i < nimport; i++)
    {
      int slab_x, slab_y;
      MyIntPosType rmd_x, rmd_y;
      if(mode == 2)
        {
          slab_x = (partin[i].IntPos[0] * POWERSPEC_FOLDFAC) / INTCELL;
          rmd_x  = (partin[i].IntPos[0] * POWERSPEC_FOLDFAC) % INTCELL;
          slab_y = (partin[i].IntPos[1] * POWERSPEC_FOLDFAC) / INTCELL;
          rmd_y  = (partin[i].IntPos[1] * POWERSPEC_FOLDFAC) % INTCELL;
        }
      else if(mode == 3)
        {
          slab_x = (partin[i].IntPos[0] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) / INTCELL;
          rmd_x  = (partin[i].IntPos[0] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) % INTCELL;
          slab_y = (partin[i].IntPos[1] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) / INTCELL;
          rmd_y  = (partin[i].IntPos[1] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) % INTCELL;
        }
      else
        {
          slab_x = partin[i].IntPos[0] / INTCELL;
          rmd_x  = partin[i].IntPos[0] % INTCELL;
          slab_y = partin[i].IntPos[1] / INTCELL;
          rmd_y  = partin[i].IntPos[1] % INTCELL;
        }

      double dx = rmd_x * (1.0 / INTCELL);
      double dy = rmd_y * (1.0 / INTCELL);

      int slab_xx = slab_x + 1;
      int slab_yy = slab_y + 1;

      if(slab_xx >= GRIDX)
        slab_xx = 0;

      if(slab_yy >= GRIDY)
        slab_yy = 0;

      int col0 = slab_x * GRIDY + slab_y;
      int col1 = slab_x * GRIDY + slab_yy;
      int col2 = slab_xx * GRIDY + slab_y;
      int col3 = slab_xx * GRIDY + slab_yy;

#ifdef LEAN
      double mass = All.PartMass;
#else
      double mass = partin[i].Mass;
#endif

      int slab_z;
      MyIntPosType rmd_z;
      if(mode == 2)
        {
          slab_z = (partin[i].IntPos[2] * POWERSPEC_FOLDFAC) / INTCELL;
          rmd_z  = (partin[i].IntPos[2] * POWERSPEC_FOLDFAC) % INTCELL;
        }
      else if(mode == 3)
        {
          slab_z = (partin[i].IntPos[2] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) / INTCELL;
          rmd_z  = (partin[i].IntPos[2] * POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC) % INTCELL;
        }
      else
        {
          slab_z = partin[i].IntPos[2] / INTCELL;
          rmd_z  = partin[i].IntPos[2] % INTCELL;
        }

      double dz = rmd_z * (1.0 / INTCELL);

      int slab_zz = slab_z + 1;

      if(slab_zz >= GRIDZ)
        slab_zz = 0;

      if(col0 >= first_col && col0 <= last_col)
        {
          rhogrid[FCxy(col0, slab_z)] += (mass * (1.0 - dx) * (1.0 - dy) * (1.0 - dz));
          rhogrid[FCxy(col0, slab_zz)] += (mass * (1.0 - dx) * (1.0 - dy) * (dz));
        }

      if(col1 >= first_col && col1 <= last_col)
        {
          rhogrid[FCxy(col1, slab_z)] += (mass * (1.0 - dx) * (dy) * (1.0 - dz));
          rhogrid[FCxy(col1, slab_zz)] += (mass * (1.0 - dx) * (dy) * (dz));
        }

      if(col2 >= first_col && col2 <= last_col)
        {
          rhogrid[FCxy(col2, slab_z)] += (mass * (dx) * (1.0 - dy) * (1.0 - dz));
          rhogrid[FCxy(col2, slab_zz)] += (mass * (dx) * (1.0 - dy) * (dz));
        }

      if(col3 >= first_col && col3 <= last_col)
        {
          rhogrid[FCxy(col3, slab_z)] += (mass * (dx) * (dy) * (1.0 - dz));
          rhogrid[FCxy(col3, slab_zz)] += (mass * (dx) * (dy) * (dz));
        }
    }

#endif
}

/* If dim < 0, this function reads out the potential, otherwise Cartesian force components.
 */
void pm_periodic::pmforce_uniform_optimized_readout_forces_or_potential_xy(fft_real *grid, int dim)
{
  particle_data *P = Sp->P;

#ifdef EVALPOTENTIAL
#ifdef GRAVITY_TALLBOX
  double fac = 1.0 / (((double)GRIDX) * GRIDY * GRIDZ); /* to get potential */
#else
  double fac = 4 * M_PI * (LONG_X * LONG_Y * LONG_Z) / pow(All.BoxSize, 3); /* to get potential */
#endif
#endif

  MyFloat *flistin  = (MyFloat *)Mem.mymalloc("flistin", nimport * sizeof(MyFloat));
  MyFloat *flistout = (MyFloat *)Mem.mymalloc("flistout", nexport * sizeof(MyFloat));

#ifdef FFT_COLUMN_BASED
  int columns         = GRIDX * GRIDY;
  int avg             = (columns - 1) / NTask + 1;
  int exc             = NTask * avg - columns;
  int tasklastsection = NTask - exc;
  int pivotcol        = tasklastsection * avg;
#endif

  for(size_t i = 0; i < nimport; i++)
    {
      flistin[i] = 0;

      int slab_x = partin[i].IntPos[0] / INTCELL;
      int slab_y = partin[i].IntPos[1] / INTCELL;
      int slab_z = partin[i].IntPos[2] / INTCELL;

      MyIntPosType rmd_x = partin[i].IntPos[0] % INTCELL;
      MyIntPosType rmd_y = partin[i].IntPos[1] % INTCELL;
      MyIntPosType rmd_z = partin[i].IntPos[2] % INTCELL;

      double dx = rmd_x * (1.0 / INTCELL);
      double dy = rmd_y * (1.0 / INTCELL);
      double dz = rmd_z * (1.0 / INTCELL);

      int slab_xx = slab_x + 1;
      int slab_yy = slab_y + 1;
      int slab_zz = slab_z + 1;

      if(slab_xx >= GRIDX)
        slab_xx = 0;
      if(slab_yy >= GRIDY)
        slab_yy = 0;
      if(slab_zz >= GRIDZ)
        slab_zz = 0;

#ifndef FFT_COLUMN_BASED
      if(myplan.slab_to_task[slab_x] == ThisTask)
        {
          slab_x -= myplan.first_slab_x_of_task[ThisTask];

          flistin[i] += grid[FI(slab_x, slab_y, slab_z)] * (1.0 - dx) * (1.0 - dy) * (1.0 - dz) +
                        grid[FI(slab_x, slab_y, slab_zz)] * (1.0 - dx) * (1.0 - dy) * (dz) +
                        grid[FI(slab_x, slab_yy, slab_z)] * (1.0 - dx) * (dy) * (1.0 - dz) +
                        grid[FI(slab_x, slab_yy, slab_zz)] * (1.0 - dx) * (dy) * (dz);
        }

      if(myplan.slab_to_task[slab_xx] == ThisTask)
        {
          slab_xx -= myplan.first_slab_x_of_task[ThisTask];

          flistin[i] += grid[FI(slab_xx, slab_y, slab_z)] * (dx) * (1.0 - dy) * (1.0 - dz) +
                        grid[FI(slab_xx, slab_y, slab_zz)] * (dx) * (1.0 - dy) * (dz) +
                        grid[FI(slab_xx, slab_yy, slab_z)] * (dx) * (dy) * (1.0 - dz) +
                        grid[FI(slab_xx, slab_yy, slab_zz)] * (dx) * (dy) * (dz);
        }
#else
      int column0 = slab_x * GRIDY + slab_y;
      int column1 = slab_x * GRIDY + slab_yy;
      int column2 = slab_xx * GRIDY + slab_y;
      int column3 = slab_xx * GRIDY + slab_yy;

      if(column0 >= myplan.firstcol_XY && column0 <= myplan.lastcol_XY)
        {
          flistin[i] += grid[FCxy(column0, slab_z)] * (1.0 - dx) * (1.0 - dy) * (1.0 - dz) +
                        grid[FCxy(column0, slab_zz)] * (1.0 - dx) * (1.0 - dy) * (dz);
        }
      if(column1 >= myplan.firstcol_XY && column1 <= myplan.lastcol_XY)
        {
          flistin[i] +=
              grid[FCxy(column1, slab_z)] * (1.0 - dx) * (dy) * (1.0 - dz) + grid[FCxy(column1, slab_zz)] * (1.0 - dx) * (dy) * (dz);
        }

      if(column2 >= myplan.firstcol_XY && column2 <= myplan.lastcol_XY)
        {
          flistin[i] +=
              grid[FCxy(column2, slab_z)] * (dx) * (1.0 - dy) * (1.0 - dz) + grid[FCxy(column2, slab_zz)] * (dx) * (1.0 - dy) * (dz);
        }

      if(column3 >= myplan.firstcol_XY && column3 <= myplan.lastcol_XY)
        {
          flistin[i] += grid[FCxy(column3, slab_z)] * (dx) * (dy) * (1.0 - dz) + grid[FCxy(column3, slab_zz)] * (dx) * (dy) * (dz);
        }
#endif
    }

  /* exchange the potential component data */
  int flag_big = 0, flag_big_all;
  for(int i = 0; i < NTask; i++)
    if(Sndpm_count[i] * sizeof(MyFloat) > MPI_MESSAGE_SIZELIMIT_IN_BYTES)
      flag_big = 1;

  /* produce a flag if any of the send sizes is above our transfer limit, in this case we will
   * transfer the data in chunks.
   */
  MPI_Allreduce(&flag_big, &flag_big_all, 1, MPI_INT, MPI_MAX, Communicator);

  /* exchange data */
  myMPI_Alltoallv(flistin, Rcvpm_count, Rcvpm_offset, flistout, Sndpm_count, Sndpm_offset, sizeof(MyFloat), flag_big_all,
                  Communicator);

  /* each thread needs to do the loop to clear its send_count[] array */
  for(int j = 0; j < NTask; j++)
    Sndpm_count[j] = 0;

  for(int idx = 0; idx < NSource; idx++)
    {
      int i = Sp->get_active_index(idx);

      int slab_x  = P[i].IntPos[0] / INTCELL;
      int slab_xx = slab_x + 1;

      if(slab_xx >= GRIDX)
        slab_xx = 0;

#ifndef FFT_COLUMN_BASED
      int task0 = myplan.slab_to_task[slab_x];
      int task1 = myplan.slab_to_task[slab_xx];

      double value = flistout[Sndpm_offset[task0] + Sndpm_count[task0]++];

      if(task0 != task1)
        value += flistout[Sndpm_offset[task1] + Sndpm_count[task1]++];
#else
      int slab_y  = P[i].IntPos[1] / INTCELL;
      int slab_yy = slab_y + 1;

      if(slab_yy >= GRIDY)
        slab_yy = 0;

      int column0 = slab_x * GRIDY + slab_y;
      int column1 = slab_x * GRIDY + slab_yy;
      int column2 = slab_xx * GRIDY + slab_y;
      int column3 = slab_xx * GRIDY + slab_yy;

      int task0, task1, task2, task3;

      if(column0 < pivotcol)
        task0 = column0 / avg;
      else
        task0 = (column0 - pivotcol) / (avg - 1) + tasklastsection;

      if(column1 < pivotcol)
        task1 = column1 / avg;
      else
        task1 = (column1 - pivotcol) / (avg - 1) + tasklastsection;

      if(column2 < pivotcol)
        task2 = column2 / avg;
      else
        task2 = (column2 - pivotcol) / (avg - 1) + tasklastsection;

      if(column3 < pivotcol)
        task3 = column3 / avg;
      else
        task3 = (column3 - pivotcol) / (avg - 1) + tasklastsection;

      double value = flistout[Sndpm_offset[task0] + Sndpm_count[task0]++];

      if(task1 != task0)
        value += flistout[Sndpm_offset[task1] + Sndpm_count[task1]++];

      if(task2 != task1 && task2 != task0)
        value += flistout[Sndpm_offset[task2] + Sndpm_count[task2]++];

      if(task3 != task0 && task3 != task1 && task3 != task2)
        value += flistout[Sndpm_offset[task3] + Sndpm_count[task3]++];
#endif

#if !defined(HIERARCHICAL_GRAVITY) && defined(TREEPM_NOTIMESPLIT)
      if(!Sp->TimeBinSynchronized[Sp->P[i].TimeBinGrav])
        continue;
#endif

      if(dim < 0)
        {
#ifdef EVALPOTENTIAL
#if defined(PERIODIC) && !defined(TREEPM_NOTIMESPLIT)
          Sp->P[i].PM_Potential += value * fac;
#else
          Sp->P[i].Potential += value * fac;
#endif
#endif
        }
      else
        {
#if defined(PERIODIC) && !defined(TREEPM_NOTIMESPLIT)
          Sp->P[i].GravPM[dim] += value;
#else
          Sp->P[i].GravAccel[dim] += value;
#endif
        }
    }

  Mem.myfree(flistout);
  Mem.myfree(flistin);
}

#ifdef FFT_COLUMN_BASED
void pm_periodic::pmforce_uniform_optimized_readout_forces_or_potential_xz(fft_real *grid, int dim)
{
  if(dim != 1)
    Terminate("bummer");
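
  /* A note on this readout path (my reading of the code, not from the original
   * sources): the grid passed in here is stored in XZ-column order, in which
   * all y-values of a column lie contiguously (see the FCxz macro), and the
   * function is only invoked for the y-force component (dim == 1), as enforced
   * by the check above.
   */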

  size_t *send_count  = (size_t *)Mem.mymalloc("send_count", NTask * sizeof(size_t));
  size_t *send_offset = (size_t *)Mem.mymalloc("send_offset", NTask * sizeof(size_t));
  size_t *recv_count  = (size_t *)Mem.mymalloc("recv_count", NTask * sizeof(size_t));
  size_t *recv_offset = (size_t *)Mem.mymalloc("recv_offset", NTask * sizeof(size_t));

  struct partbuf
  {
    MyIntPosType IntPos[3];
  };

  partbuf *partin, *partout;
  size_t nimport = 0, nexport = 0;

  particle_data *P = Sp->P;

  int columns         = GRIDX * GRID2;
  int avg             = (columns - 1) / NTask + 1;
  int exc             = NTask * avg - columns;
  int tasklastsection = NTask - exc;
  int pivotcol        = tasklastsection * avg;
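
  /* Note that the column count here is GRIDX * GRID2 rather than
   * GRIDX * GRIDZ: the z-dimension of the real-space field includes the FFT
   * padding cells (GRID2 = 2 * (GRIDZ/2 + 1)), and these padded columns take
   * part in the decomposition as well.
   */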

  /* determine the slabs/columns each particle accesses */
  for(int rep = 0; rep < 2; rep++)
    {
      for(int j = 0; j < NTask; j++)
        send_count[j] = 0;

      for(int idx = 0; idx < NSource; idx++)
        {
          int i = Sp->get_active_index(idx);

          if(P[i].Ti_Current != All.Ti_Current)
            Sp->drift_particle(&Sp->P[i], &Sp->SphP[i], All.Ti_Current);

          int slab_x  = P[i].IntPos[0] / INTCELL;
          int slab_xx = slab_x + 1;

          if(slab_xx >= GRIDX)
            slab_xx = 0;

          int slab_z  = P[i].IntPos[2] / INTCELL;
          int slab_zz = slab_z + 1;

          if(slab_zz >= GRIDZ)
            slab_zz = 0;

          int column0 = slab_x * GRID2 + slab_z;
          int column1 = slab_x * GRID2 + slab_zz;
          int column2 = slab_xx * GRID2 + slab_z;
          int column3 = slab_xx * GRID2 + slab_zz;

          int task0, task1, task2, task3;

          if(column0 < pivotcol)
            task0 = column0 / avg;
          else
            task0 = (column0 - pivotcol) / (avg - 1) + tasklastsection;

          if(column1 < pivotcol)
            task1 = column1 / avg;
          else
            task1 = (column1 - pivotcol) / (avg - 1) + tasklastsection;

          if(column2 < pivotcol)
            task2 = column2 / avg;
          else
            task2 = (column2 - pivotcol) / (avg - 1) + tasklastsection;

          if(column3 < pivotcol)
            task3 = column3 / avg;
          else
            task3 = (column3 - pivotcol) / (avg - 1) + tasklastsection;

          if(rep == 0)
            {
              send_count[task0]++;
              if(task1 != task0)
                send_count[task1]++;
              if(task2 != task1 && task2 != task0)
                send_count[task2]++;
              if(task3 != task0 && task3 != task1 && task3 != task2)
                send_count[task3]++;
            }
          else
            {
              size_t ind0 = send_offset[task0] + send_count[task0]++;
              for(int j = 0; j < 3; j++)
                partout[ind0].IntPos[j] = P[i].IntPos[j];

              if(task1 != task0)
                {
                  size_t ind1 = send_offset[task1] + send_count[task1]++;
                  for(int j = 0; j < 3; j++)
                    partout[ind1].IntPos[j] = P[i].IntPos[j];
                }
              if(task2 != task1 && task2 != task0)
                {
                  size_t ind2 = send_offset[task2] + send_count[task2]++;

                  for(int j = 0; j < 3; j++)
                    partout[ind2].IntPos[j] = P[i].IntPos[j];
                }
              if(task3 != task0 && task3 != task1 && task3 != task2)
                {
                  size_t ind3 = send_offset[task3] + send_count[task3]++;

                  for(int j = 0; j < 3; j++)
                    partout[ind3].IntPos[j] = P[i].IntPos[j];
                }
            }
        }

      if(rep == 0)
        {
          myMPI_Alltoall(send_count, sizeof(size_t), MPI_BYTE, recv_count, sizeof(size_t), MPI_BYTE, Communicator);

          nimport = 0, nexport = 0;
          recv_offset[0] = send_offset[0] = 0;

          for(int j = 0; j < NTask; j++)
            {
              nexport += send_count[j];
              nimport += recv_count[j];

              if(j > 0)
                {
                  send_offset[j] = send_offset[j - 1] + send_count[j - 1];
                  recv_offset[j] = recv_offset[j - 1] + recv_count[j - 1];
                }
            }

          /* allocate import and export buffer */
          partin  = (partbuf *)Mem.mymalloc("partin", nimport * sizeof(partbuf));
          partout = (partbuf *)Mem.mymalloc("partout", nexport * sizeof(partbuf));
        }
    }

  /* produce a flag if any of the send sizes is above our transfer limit, in this case we will
   * transfer the data in chunks.
   */
  int flag_big = 0, flag_big_all;
  for(int i = 0; i < NTask; i++)
    if(send_count[i] * sizeof(partbuf) > MPI_MESSAGE_SIZELIMIT_IN_BYTES)
      flag_big = 1;

  MPI_Allreduce(&flag_big, &flag_big_all, 1, MPI_INT, MPI_MAX, Communicator);

  /* exchange particle data */
  myMPI_Alltoallv(partout, send_count, send_offset, partin, recv_count, recv_offset, sizeof(partbuf), flag_big_all, Communicator);

  Mem.myfree(partout);

  MyFloat *flistin  = (MyFloat *)Mem.mymalloc("flistin", nimport * sizeof(MyFloat));
  MyFloat *flistout = (MyFloat *)Mem.mymalloc("flistout", nexport * sizeof(MyFloat));

  for(size_t i = 0; i < nimport; i++)
    {
      flistin[i] = 0;

      int slab_x = partin[i].IntPos[0] / INTCELL;
      int slab_y = partin[i].IntPos[1] / INTCELL;
      int slab_z = partin[i].IntPos[2] / INTCELL;

      MyIntPosType rmd_x = partin[i].IntPos[0] % INTCELL;
      MyIntPosType rmd_y = partin[i].IntPos[1] % INTCELL;
      MyIntPosType rmd_z = partin[i].IntPos[2] % INTCELL;

      double dx = rmd_x * (1.0 / INTCELL);
      double dy = rmd_y * (1.0 / INTCELL);
      double dz = rmd_z * (1.0 / INTCELL);

      int slab_xx = slab_x + 1;
      int slab_yy = slab_y + 1;
      int slab_zz = slab_z + 1;

      if(slab_xx >= GRIDX)
        slab_xx = 0;
      if(slab_yy >= GRIDY)
        slab_yy = 0;
      if(slab_zz >= GRIDZ)
        slab_zz = 0;

      int column0 = slab_x * GRID2 + slab_z;
      int column1 = slab_x * GRID2 + slab_zz;
      int column2 = slab_xx * GRID2 + slab_z;
      int column3 = slab_xx * GRID2 + slab_zz;

      if(column0 >= myplan.firstcol_XZ && column0 <= myplan.lastcol_XZ)
        {
          flistin[i] += grid[FCxz(column0, slab_y)] * (1.0 - dx) * (1.0 - dy) * (1.0 - dz) +
                        grid[FCxz(column0, slab_yy)] * (1.0 - dx) * (dy) * (1.0 - dz);
        }
      if(column1 >= myplan.firstcol_XZ && column1 <= myplan.lastcol_XZ)
        {
          flistin[i] +=
              grid[FCxz(column1, slab_y)] * (1.0 - dx) * (1.0 - dy) * (dz) + grid[FCxz(column1, slab_yy)] * (1.0 - dx) * (dy) * (dz);
        }

      if(column2 >= myplan.firstcol_XZ && column2 <= myplan.lastcol_XZ)
        {
          flistin[i] +=
              grid[FCxz(column2, slab_y)] * (dx) * (1.0 - dy) * (1.0 - dz) + grid[FCxz(column2, slab_yy)] * (dx) * (dy) * (1.0 - dz);
        }

      if(column3 >= myplan.firstcol_XZ && column3 <= myplan.lastcol_XZ)
        {
          flistin[i] += grid[FCxz(column3, slab_y)] * (dx) * (1.0 - dy) * (dz) + grid[FCxz(column3, slab_yy)] * (dx) * (dy) * (dz);
        }
    }

  /* produce a flag if any of the send sizes is above our transfer limit, in this case we will
   * transfer the data in chunks.
   */
  flag_big = 0;
  for(int i = 0; i < NTask; i++)
    if(send_count[i] * sizeof(MyFloat) > MPI_MESSAGE_SIZELIMIT_IN_BYTES)
      flag_big = 1;

  MPI_Allreduce(&flag_big, &flag_big_all, 1, MPI_INT, MPI_MAX, Communicator);

  /* exchange data */
  myMPI_Alltoallv(flistin, recv_count, recv_offset, flistout, send_count, send_offset, sizeof(MyFloat), flag_big_all, Communicator);

  for(int j = 0; j < NTask; j++)
    send_count[j] = 0;

  /* now assign to original points */
  for(int idx = 0; idx < NSource; idx++)
    {
      int i = Sp->get_active_index(idx);

      int slab_x  = P[i].IntPos[0] / INTCELL;
      int slab_xx = slab_x + 1;

      if(slab_xx >= GRIDX)
        slab_xx = 0;

      int slab_z  = P[i].IntPos[2] / INTCELL;
      int slab_zz = slab_z + 1;

      if(slab_zz >= GRIDZ)
        slab_zz = 0;

      int column0 = slab_x * GRID2 + slab_z;
      int column1 = slab_x * GRID2 + slab_zz;
      int column2 = slab_xx * GRID2 + slab_z;
      int column3 = slab_xx * GRID2 + slab_zz;

      int task0, task1, task2, task3;

      if(column0 < pivotcol)
        task0 = column0 / avg;
      else
        task0 = (column0 - pivotcol) / (avg - 1) + tasklastsection;

      if(column1 < pivotcol)
        task1 = column1 / avg;
      else
        task1 = (column1 - pivotcol) / (avg - 1) + tasklastsection;

      if(column2 < pivotcol)
        task2 = column2 / avg;
      else
        task2 = (column2 - pivotcol) / (avg - 1) + tasklastsection;

      if(column3 < pivotcol)
        task3 = column3 / avg;
      else
        task3 = (column3 - pivotcol) / (avg - 1) + tasklastsection;

      double value = flistout[send_offset[task0] + send_count[task0]++];

      if(task1 != task0)
        value += flistout[send_offset[task1] + send_count[task1]++];

      if(task2 != task1 && task2 != task0)
        value += flistout[send_offset[task2] + send_count[task2]++];

      if(task3 != task0 && task3 != task1 && task3 != task2)
        value += flistout[send_offset[task3] + send_count[task3]++];

#if !defined(HIERARCHICAL_GRAVITY) && defined(TREEPM_NOTIMESPLIT)
      if(!Sp->TimeBinSynchronized[Sp->P[i].TimeBinGrav])
        continue;
#endif

#if defined(PERIODIC) && !defined(TREEPM_NOTIMESPLIT)
      Sp->P[i].GravPM[dim] += value;
#else
      Sp->P[i].GravAccel[dim] += value;
#endif
    }

  Mem.myfree(flistout);
  Mem.myfree(flistin);
  Mem.myfree(partin);
  Mem.myfree(recv_offset);
  Mem.myfree(recv_count);
  Mem.myfree(send_offset);
  Mem.myfree(send_count);
}

void pm_periodic::pmforce_uniform_optimized_readout_forces_or_potential_zy(fft_real *grid, int dim)
{
  if(dim != 0)
    Terminate("bummer");
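
  /* Analogous to the xz-variant above, but for the ZY-column layout, in which
   * all x-values of a column lie contiguously (see the FCzy macro); it is only
   * invoked for the x-force component (dim == 0), as enforced by the check
   * above.
   */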

  size_t *send_count  = (size_t *)Mem.mymalloc("send_count", NTask * sizeof(size_t));
  size_t *send_offset = (size_t *)Mem.mymalloc("send_offset", NTask * sizeof(size_t));
  size_t *recv_count  = (size_t *)Mem.mymalloc("recv_count", NTask * sizeof(size_t));
  size_t *recv_offset = (size_t *)Mem.mymalloc("recv_offset", NTask * sizeof(size_t));

  struct partbuf
  {
    MyIntPosType IntPos[3];
  };

  partbuf *partin, *partout;
  size_t nimport = 0, nexport = 0;

  particle_data *P = Sp->P;

  int columns         = GRID2 * GRIDY;
  int avg             = (columns - 1) / NTask + 1;
  int exc             = NTask * avg - columns;
  int tasklastsection = NTask - exc;
  int pivotcol        = tasklastsection * avg;

  /* determine the slabs/columns each particle accesses */
  for(int rep = 0; rep < 2; rep++)
    {
      for(int j = 0; j < NTask; j++)
        send_count[j] = 0;

      for(int idx = 0; idx < NSource; idx++)
        {
          int i = Sp->get_active_index(idx);

          if(P[i].Ti_Current != All.Ti_Current)
            Sp->drift_particle(&Sp->P[i], &Sp->SphP[i], All.Ti_Current);

          int slab_z  = P[i].IntPos[2] / INTCELL;
          int slab_zz = slab_z + 1;

          if(slab_zz >= GRIDZ)
            slab_zz = 0;

          int slab_y  = P[i].IntPos[1] / INTCELL;
          int slab_yy = slab_y + 1;

          if(slab_yy >= GRIDY)
            slab_yy = 0;

          int column0 = slab_z * GRIDY + slab_y;
          int column1 = slab_z * GRIDY + slab_yy;
          int column2 = slab_zz * GRIDY + slab_y;
          int column3 = slab_zz * GRIDY + slab_yy;

          int task0, task1, task2, task3;

          if(column0 < pivotcol)
            task0 = column0 / avg;
          else
            task0 = (column0 - pivotcol) / (avg - 1) + tasklastsection;

          if(column1 < pivotcol)
            task1 = column1 / avg;
          else
            task1 = (column1 - pivotcol) / (avg - 1) + tasklastsection;

          if(column2 < pivotcol)
            task2 = column2 / avg;
          else
            task2 = (column2 - pivotcol) / (avg - 1) + tasklastsection;

          if(column3 < pivotcol)
            task3 = column3 / avg;
          else
            task3 = (column3 - pivotcol) / (avg - 1) + tasklastsection;

          if(rep == 0)
            {
              send_count[task0]++;
              if(task1 != task0)
                send_count[task1]++;
              if(task2 != task1 && task2 != task0)
                send_count[task2]++;
              if(task3 != task0 && task3 != task1 && task3 != task2)
                send_count[task3]++;
            }
          else
            {
              size_t ind0 = send_offset[task0] + send_count[task0]++;
              for(int j = 0; j < 3; j++)
                partout[ind0].IntPos[j] = P[i].IntPos[j];

              if(task1 != task0)
                {
                  size_t ind1 = send_offset[task1] + send_count[task1]++;
                  for(int j = 0; j < 3; j++)
                    partout[ind1].IntPos[j] = P[i].IntPos[j];
                }
              if(task2 != task1 && task2 != task0)
                {
                  size_t ind2 = send_offset[task2] + send_count[task2]++;

                  for(int j = 0; j < 3; j++)
                    partout[ind2].IntPos[j] = P[i].IntPos[j];
                }
              if(task3 != task0 && task3 != task1 && task3 != task2)
                {
                  size_t ind3 = send_offset[task3] + send_count[task3]++;

                  for(int j = 0; j < 3; j++)
                    partout[ind3].IntPos[j] = P[i].IntPos[j];
                }
            }
        }

      if(rep == 0)
        {
          myMPI_Alltoall(send_count, sizeof(size_t), MPI_BYTE, recv_count, sizeof(size_t), MPI_BYTE, Communicator);

          nimport = 0, nexport = 0;
          recv_offset[0] = send_offset[0] = 0;

          for(int j = 0; j < NTask; j++)
            {
              nexport += send_count[j];
              nimport += recv_count[j];

              if(j > 0)
                {
                  send_offset[j] = send_offset[j - 1] + send_count[j - 1];
                  recv_offset[j] = recv_offset[j - 1] + recv_count[j - 1];
                }
            }

          /* allocate import and export buffer */
          partin  = (partbuf *)Mem.mymalloc("partin", nimport * sizeof(partbuf));
          partout = (partbuf *)Mem.mymalloc("partout", nexport * sizeof(partbuf));
        }
    }

  /* produce a flag if any of the send sizes is above our transfer limit, in this case we will
   * transfer the data in chunks.
   */
  int flag_big = 0, flag_big_all;
  for(int i = 0; i < NTask; i++)
    if(send_count[i] * sizeof(partbuf) > MPI_MESSAGE_SIZELIMIT_IN_BYTES)
      flag_big = 1;

  MPI_Allreduce(&flag_big, &flag_big_all, 1, MPI_INT, MPI_MAX, Communicator);

  /* exchange particle data */
  myMPI_Alltoallv(partout, send_count, send_offset, partin, recv_count, recv_offset, sizeof(partbuf), flag_big_all, Communicator);

  Mem.myfree(partout);

  MyFloat *flistin  = (MyFloat *)Mem.mymalloc("flistin", nimport * sizeof(MyFloat));
  MyFloat *flistout = (MyFloat *)Mem.mymalloc("flistout", nexport * sizeof(MyFloat));

  for(size_t i = 0; i < nimport; i++)
    {
      flistin[i] = 0;

      int slab_x = partin[i].IntPos[0] / INTCELL;
      int slab_y = partin[i].IntPos[1] / INTCELL;
      int slab_z = partin[i].IntPos[2] / INTCELL;

      MyIntPosType rmd_x = partin[i].IntPos[0] % INTCELL;
      MyIntPosType rmd_y = partin[i].IntPos[1] % INTCELL;
      MyIntPosType rmd_z = partin[i].IntPos[2] % INTCELL;

      double dx = rmd_x * (1.0 / INTCELL);
      double dy = rmd_y * (1.0 / INTCELL);
      double dz = rmd_z * (1.0 / INTCELL);

      int slab_xx = slab_x + 1;
      int slab_yy = slab_y + 1;
      int slab_zz = slab_z + 1;

      if(slab_xx >= GRIDX)
        slab_xx = 0;
      if(slab_yy >= GRIDY)
        slab_yy = 0;
      if(slab_zz >= GRIDZ)
        slab_zz = 0;

      int column0 = slab_z * GRIDY + slab_y;
      int column1 = slab_z * GRIDY + slab_yy;
      int column2 = slab_zz * GRIDY + slab_y;
      int column3 = slab_zz * GRIDY + slab_yy;

      if(column0 >= myplan.firstcol_ZY && column0 <= myplan.lastcol_ZY)
        {
          flistin[i] += grid[FCzy(column0, slab_x)] * (1.0 - dx) * (1.0 - dy) * (1.0 - dz) +
                        grid[FCzy(column0, slab_xx)] * (dx) * (1.0 - dy) * (1.0 - dz);
        }
      if(column1 >= myplan.firstcol_ZY && column1 <= myplan.lastcol_ZY)
        {
          flistin[i] +=
              grid[FCzy(column1, slab_x)] * (1.0 - dx) * (dy) * (1.0 - dz) + grid[FCzy(column1, slab_xx)] * (dx) * (dy) * (1.0 - dz);
        }

      if(column2 >= myplan.firstcol_ZY && column2 <= myplan.lastcol_ZY)
        {
          flistin[i] +=
              grid[FCzy(column2, slab_x)] * (1.0 - dx) * (1.0 - dy) * (dz) + grid[FCzy(column2, slab_xx)] * (dx) * (1.0 - dy) * (dz);
        }

      if(column3 >= myplan.firstcol_ZY && column3 <= myplan.lastcol_ZY)
        {
          flistin[i] += grid[FCzy(column3, slab_x)] * (1.0 - dx) * (dy) * (dz) + grid[FCzy(column3, slab_xx)] * (dx) * (dy) * (dz);
        }
    }

  /* produce a flag if any of the send sizes is above our transfer limit, in this case we will
   * transfer the data in chunks.
   */
  flag_big = 0;
  for(int i = 0; i < NTask; i++)
    if(send_count[i] * sizeof(MyFloat) > MPI_MESSAGE_SIZELIMIT_IN_BYTES)
      flag_big = 1;

  MPI_Allreduce(&flag_big, &flag_big_all, 1, MPI_INT, MPI_MAX, Communicator);

  /* exchange data */
  myMPI_Alltoallv(flistin, recv_count, recv_offset, flistout, send_count, send_offset, sizeof(MyFloat), flag_big_all, Communicator);

  for(int j = 0; j < NTask; j++)
    send_count[j] = 0;

  /* now assign to original points */
  for(int idx = 0; idx < NSource; idx++)
    {
      int i = Sp->get_active_index(idx);

      int slab_z  = P[i].IntPos[2] / INTCELL;
      int slab_zz = slab_z + 1;

      if(slab_zz >= GRIDZ)
        slab_zz = 0;

      int slab_y  = P[i].IntPos[1] / INTCELL;
      int slab_yy = slab_y + 1;

      if(slab_yy >= GRIDY)
        slab_yy = 0;

      int column0 = slab_z * GRIDY + slab_y;
      int column1 = slab_z * GRIDY + slab_yy;
      int column2 = slab_zz * GRIDY + slab_y;
      int column3 = slab_zz * GRIDY + slab_yy;

      int task0, task1, task2, task3;

      if(column0 < pivotcol)
        task0 = column0 / avg;
      else
        task0 = (column0 - pivotcol) / (avg - 1) + tasklastsection;

      if(column1 < pivotcol)
        task1 = column1 / avg;
      else
        task1 = (column1 - pivotcol) / (avg - 1) + tasklastsection;

      if(column2 < pivotcol)
        task2 = column2 / avg;
      else
        task2 = (column2 - pivotcol) / (avg - 1) + tasklastsection;

      if(column3 < pivotcol)
        task3 = column3 / avg;
      else
        task3 = (column3 - pivotcol) / (avg - 1) + tasklastsection;

      double value = flistout[send_offset[task0] + send_count[task0]++];

      if(task1 != task0)
        value += flistout[send_offset[task1] + send_count[task1]++];

      if(task2 != task1 && task2 != task0)
        value += flistout[send_offset[task2] + send_count[task2]++];

      if(task3 != task0 && task3 != task1 && task3 != task2)
        value += flistout[send_offset[task3] + send_count[task3]++];

#if !defined(HIERARCHICAL_GRAVITY) && defined(TREEPM_NOTIMESPLIT)
      if(!Sp->TimeBinSynchronized[Sp->P[i].TimeBinGrav])
        continue;
#endif

#if defined(PERIODIC) && !defined(TREEPM_NOTIMESPLIT)
      Sp->P[i].GravPM[dim] += value;
#else
      Sp->P[i].GravAccel[dim] += value;
#endif
    }

  Mem.myfree(flistout);
  Mem.myfree(flistin);
  Mem.myfree(partin);
  Mem.myfree(recv_offset);
  Mem.myfree(recv_count);
  Mem.myfree(send_offset);
  Mem.myfree(send_count);
}
1787#endif
1788
1789#endif
1790
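/* [Editor's note] The repeated column-to-task computations above all follow one
 * convention: the first 'pivotcol' columns are spread over tasks owning 'avg'
 * columns each, and the remaining columns over tasks owning 'avg - 1' columns,
 * starting at rank 'tasklastsection'. A sketch of that mapping factored into a
 * helper (the function name is an editorial invention, not part of the source):
 */
static inline int column_to_task(int column, int pivotcol, int avg, int tasklastsection)
{
  if(column < pivotcol)
    return column / avg; /* section of tasks holding 'avg' columns each */
  else
    return (column - pivotcol) / (avg - 1) + tasklastsection; /* tasks holding 'avg - 1' columns */
}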
1803void pm_periodic::pmforce_periodic(int mode, int *typelist)
1804{
1805 int x, y, z;
1806
1807 double tstart = Logs.second();
1808
1809 if(mode == 0)
1810 mpi_printf("PM-PERIODIC: Starting periodic PM calculation. (Rcut=%g) presently allocated=%g MB\n", Sp->Rcut[0],
1811 Mem.getAllocatedBytesInMB());
1812
1813#ifdef HIERARCHICAL_GRAVITY
1814 NSource = Sp->TimeBinsGravity.NActiveParticles;
1815#else
1816 NSource = Sp->NumPart;
1817#endif
1818
1819#ifndef TREEPM_NOTIMESPLIT
1820 if(NSource != Sp->NumPart)
1821 Terminate("unexpected NSource != Sp->NumPart");
1822#endif
1823
1824#ifndef NUMPART_PER_TASK_LARGE
1825 if((((long long)Sp->NumPart) << 3) >= (((long long)1) << 31))
1826 Terminate("We are dealing with a too large particle number per MPI rank - enabling NUMPART_PER_TASK_LARGE might help.");
1827#endif
1828
1829 double asmth2 = Sp->Asmth[0] * Sp->Asmth[0];
1830 double d = All.BoxSize / PMGRID;
1831 double dhalf = 0.5 * d;
1832
1833#ifdef GRAVITY_TALLBOX
1834 double fac = 1.0 / (((double)GRIDX) * GRIDY * GRIDZ); /* to get potential */
1835#else
1836 double fac = 4 * M_PI * (LONG_X * LONG_Y * LONG_Z) / pow(All.BoxSize, 3); /* to get potential */
1837#endif
1838
1839 fac *= 1 / (2 * d); /* for finite differencing */
1840
1841#ifdef PM_ZOOM_OPTIMIZED
1842 pmforce_zoom_optimized_prepare_density(mode, typelist);
1843#else
1844 pmforce_uniform_optimized_prepare_density(mode, typelist);
1845#endif
1846
 1847 /* note: after the density assignment, we still keep the field 'partin',
 1848 * since we can reuse it later to return the potential and the z-force
 1849 */
1850
1851 /* allocate the memory to hold the FFT fields */
1852
1853 forcegrid = (fft_real *)Mem.mymalloc_movable(&forcegrid, "forcegrid", maxfftsize * sizeof(fft_real));
1854
1855 workspace = forcegrid;
1856
1857#ifndef FFT_COLUMN_BASED
1858 fft_of_rhogrid = (fft_complex *)&rhogrid[0];
1859#else
1860 fft_of_rhogrid = (fft_complex *)&workspace[0];
1861#endif
1862
1863 /* Do the FFT of the density field */
1864#ifndef FFT_COLUMN_BASED
1865 my_slab_based_fft(&myplan, &rhogrid[0], &workspace[0], 1);
1866#else
1867 my_column_based_fft(&myplan, rhogrid, workspace, 1); /* result is in workspace, not in rhogrid ! */
1868#endif
1869
1870 if(mode != 0)
1871 {
1872 pmforce_measure_powerspec(mode - 1, typelist);
1873
1874#if defined(FFT_COLUMN_BASED) && !defined(PM_ZOOM_OPTIMIZED)
1875 Mem.myfree_movable(partin);
1876 partin = NULL;
1877#endif
1878 }
1879 else
1880 {
1881 /* multiply by the Green's function in order to obtain the potential (or forces for spectral differencing) */
1882
1883 double kfacx = 2.0 * M_PI / (GRIDX * d);
1884 double kfacy = 2.0 * M_PI / (GRIDY * d);
1885 double kfacz = 2.0 * M_PI / (GRIDZ * d);
1886
1887#ifdef FFT_COLUMN_BASED
1888 for(large_array_offset ip = 0; ip < myplan.second_transposed_ncells; ip++)
1889 {
1890 large_array_offset ipcell = ip + ((large_array_offset)myplan.second_transposed_firstcol) * GRIDX;
1891 y = ipcell / (GRIDX * GRIDz);
1892 int yr = ipcell % (GRIDX * GRIDz);
1893 z = yr / GRIDX;
1894 x = yr % GRIDX;
1895#else
1896 for(x = 0; x < GRIDX; x++)
1897 for(y = myplan.slabstart_y; y < myplan.slabstart_y + myplan.nslab_y; y++)
1898 for(z = 0; z < GRIDz; z++)
1899 {
1900#endif
1901 int xx, yy, zz;
1902
1903 if(x >= (GRIDX / 2))
1904 xx = x - GRIDX;
1905 else
1906 xx = x;
1907 if(y >= (GRIDY / 2))
1908 yy = y - GRIDY;
1909 else
1910 yy = y;
1911 if(z >= (GRIDZ / 2))
1912 zz = z - GRIDZ;
1913 else
1914 zz = z;
1915
1916 double kx = kfacx * xx;
1917 double ky = kfacy * yy;
1918 double kz = kfacz * zz;
1919
1920 double k2 = kx * kx + ky * ky + kz * kz;
1921
1922 double smth = 1.0, deconv = 1.0;
1923
1924 if(k2 > 0)
1925 {
1926 smth = -exp(-k2 * asmth2) / k2;
1927
1928 /* do deconvolution */
1929
1930 double fx = 1, fy = 1, fz = 1;
1931
1932 if(xx != 0)
1933 {
1934 fx = kx * dhalf;
1935 fx = sin(fx) / fx;
1936 }
1937 if(yy != 0)
1938 {
1939 fy = ky * dhalf;
1940 fy = sin(fy) / fy;
1941 }
1942 if(zz != 0)
1943 {
1944 fz = kz * dhalf;
1945 fz = sin(fz) / fz;
1946 }
1947
1948 double ff = 1 / (fx * fy * fz);
1949 deconv = ff * ff * ff * ff;
1950
1951 smth *= deconv; /* deconvolution */
1952 }
1953
1954#ifndef FFT_COLUMN_BASED
1955 large_array_offset ip = ((large_array_offset)GRIDz) * (GRIDX * (y - myplan.slabstart_y) + x) + z;
1956#endif
1957
1958#ifdef GRAVITY_TALLBOX
1959 double re = fft_of_rhogrid[ip][0] * fft_of_kernel[ip][0] - fft_of_rhogrid[ip][1] * fft_of_kernel[ip][1];
1960 double im = fft_of_rhogrid[ip][0] * fft_of_kernel[ip][1] + fft_of_rhogrid[ip][1] * fft_of_kernel[ip][0];
1961
1962 fft_of_rhogrid[ip][0] = re * deconv * exp(-k2 * asmth2);
1963 fft_of_rhogrid[ip][1] = im * deconv * exp(-k2 * asmth2);
1964#else
1965 fft_of_rhogrid[ip][0] *= smth;
1966 fft_of_rhogrid[ip][1] *= smth;
1967#endif
1968 }
1969
1970#ifndef GRAVITY_TALLBOX
1971#ifdef FFT_COLUMN_BASED
1972 if(myplan.second_transposed_firstcol == 0)
1973 fft_of_rhogrid[0][0] = fft_of_rhogrid[0][1] = 0.0;
1974#else
1975 if(myplan.slabstart_y == 0)
1976 fft_of_rhogrid[0][0] = fft_of_rhogrid[0][1] = 0.0;
1977#endif
1978#endif
1979
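/* [Editor's note] A condensed sketch of the factor applied to each mode above:
 * the long-range Green's function -exp(-k^2 Asmth^2)/k^2 combined with the CIC
 * deconvolution, where each axis contributes a sinc factor sin(k d/2)/(k d/2)
 * and the fourth power accounts for the CIC window entering twice (mass
 * assignment and force interpolation). Editorial helper name, excluded from
 * compilation:
 */
#if 0
static double greens_times_cic_deconv(double kx, double ky, double kz, double asmth2, double dhalf)
{
  double k2 = kx * kx + ky * ky + kz * kz;
  if(k2 <= 0)
    return 0; /* the k = 0 mode is zeroed separately above */

  /* sinc factors of the CIC assignment along each axis */
  double fx = (kx != 0) ? sin(kx * dhalf) / (kx * dhalf) : 1.0;
  double fy = (ky != 0) ? sin(ky * dhalf) / (ky * dhalf) : 1.0;
  double fz = (kz != 0) ? sin(kz * dhalf) / (kz * dhalf) : 1.0;

  double ff = 1.0 / (fx * fy * fz);

  return -exp(-k2 * asmth2) / k2 * (ff * ff * ff * ff);
}
#endif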
1980 /* Do the inverse FFT to get the potential/forces */
1981
1982#ifndef FFT_COLUMN_BASED
1983 my_slab_based_fft(&myplan, &rhogrid[0], &workspace[0], -1);
1984#else
1985 my_column_based_fft(&myplan, workspace, rhogrid, -1);
1986#endif
1987
1988 /* Now rhogrid holds the potential/forces */
1989
1990#ifdef EVALPOTENTIAL
1991#ifdef PM_ZOOM_OPTIMIZED
1992 pmforce_zoom_optimized_readout_forces_or_potential(rhogrid, -1);
1993#else
1994 pmforce_uniform_optimized_readout_forces_or_potential_xy(rhogrid, -1);
1995#endif
1996#endif
1997
1998 /* get the force components by finite differencing of the potential for each dimension,
1999 * and send the results back to the right CPUs
2000 */
2001
2002 /* we do the x component last, because for differencing the potential in the x-direction, we need to construct the
2003 * transpose
2004 */
2005
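/* [Editor's note] The stencil used in all three directions below is the standard
 * fourth-order central difference of the potential; with the 1/(2d) prefactor
 * already folded into 'fac' above, the force along one axis of a periodic 1D
 * array reads as follows (editorial names, excluded from compilation):
 */
#if 0
static inline double fourth_order_force(const fft_real *phi, int i, int n, double fac)
{
  int r = (i + 1) % n, rr = (i + 2) % n;         /* right neighbours, periodic wrap */
  int l = (i - 1 + n) % n, ll = (i - 2 + n) % n; /* left neighbours, periodic wrap */

  return fac * ((4.0 / 3) * (phi[l] - phi[r]) - (1.0 / 6) * (phi[ll] - phi[rr]));
}
#endif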
2006#ifndef FFT_COLUMN_BASED
2007
2008 /* z-direction */
2009 for(y = 0; y < GRIDY; y++)
2010 for(x = 0; x < myplan.nslab_x; x++)
2011 for(z = 0; z < GRIDZ; z++)
2012 {
2013 int zr = z + 1, zl = z - 1, zrr = z + 2, zll = z - 2;
2014 if(zr >= GRIDZ)
2015 zr -= GRIDZ;
2016 if(zrr >= GRIDZ)
2017 zrr -= GRIDZ;
2018 if(zl < 0)
2019 zl += GRIDZ;
2020 if(zll < 0)
2021 zll += GRIDZ;
2022
2023 forcegrid[FI(x, y, z)] = fac * ((4.0 / 3) * (rhogrid[FI(x, y, zl)] - rhogrid[FI(x, y, zr)]) -
2024 (1.0 / 6) * (rhogrid[FI(x, y, zll)] - rhogrid[FI(x, y, zrr)]));
2025 }
2026
2027#ifdef PM_ZOOM_OPTIMIZED
2028 pmforce_zoom_optimized_readout_forces_or_potential(forcegrid, 2);
2029#else
2030 pmforce_uniform_optimized_readout_forces_or_potential_xy(forcegrid, 2);
2031#endif
2032
2033 /* y-direction */
2034 for(y = 0; y < GRIDY; y++)
2035 for(x = 0; x < myplan.nslab_x; x++)
2036 for(z = 0; z < GRIDZ; z++)
2037 {
2038 int yr = y + 1, yl = y - 1, yrr = y + 2, yll = y - 2;
2039 if(yr >= GRIDY)
2040 yr -= GRIDY;
2041 if(yrr >= GRIDY)
2042 yrr -= GRIDY;
2043 if(yl < 0)
2044 yl += GRIDY;
2045 if(yll < 0)
2046 yll += GRIDY;
2047
2048 forcegrid[FI(x, y, z)] = fac * ((4.0 / 3) * (rhogrid[FI(x, yl, z)] - rhogrid[FI(x, yr, z)]) -
2049 (1.0 / 6) * (rhogrid[FI(x, yll, z)] - rhogrid[FI(x, yrr, z)]));
2050 }
2051
2052#ifdef PM_ZOOM_OPTIMIZED
2053 pmforce_zoom_optimized_readout_forces_or_potential(forcegrid, 1);
2054#else
2055 pmforce_uniform_optimized_readout_forces_or_potential_xy(forcegrid, 1);
2056#endif
2057
2058 /* x-direction */
2059 my_slab_transposeA(&myplan, rhogrid, forcegrid); /* compute the transpose of the potential field for finite differencing */
2060 /* note: for the x-direction, we difference the transposed field */
2061
2062 for(x = 0; x < GRIDX; x++)
2063 for(y = 0; y < myplan.nslab_y; y++)
2064 for(z = 0; z < GRIDZ; z++)
2065 {
2066 int xrr = x + 2, xll = x - 2, xr = x + 1, xl = x - 1;
2067 if(xr >= GRIDX)
2068 xr -= GRIDX;
2069 if(xrr >= GRIDX)
2070 xrr -= GRIDX;
2071 if(xl < 0)
2072 xl += GRIDX;
2073 if(xll < 0)
2074 xll += GRIDX;
2075
2076 forcegrid[NI(x, y, z)] = fac * ((4.0 / 3) * (rhogrid[NI(xl, y, z)] - rhogrid[NI(xr, y, z)]) -
2077 (1.0 / 6) * (rhogrid[NI(xll, y, z)] - rhogrid[NI(xrr, y, z)]));
2078 }
2079
2080 my_slab_transposeB(&myplan, forcegrid, rhogrid); /* reverse the transpose from above */
2081
2082#ifdef PM_ZOOM_OPTIMIZED
2083 pmforce_zoom_optimized_readout_forces_or_potential(forcegrid, 0);
2084#else
2085 pmforce_uniform_optimized_readout_forces_or_potential_xy(forcegrid, 0);
2086#endif
2087
2088#else
2089
2090 /* z-direction */
2091 for(large_array_offset i = 0; i < myplan.ncol_XY; i++)
2092 {
2093 fft_real *forcep = &forcegrid[GRID2 * i];
2094 fft_real *potp = &rhogrid[GRID2 * i];
2095
2096 for(int z = 0; z < GRIDZ; z++)
2097 {
2098 int zr = z + 1;
2099 int zl = z - 1;
2100 int zrr = z + 2;
2101 int zll = z - 2;
2102
2103 if(zr >= GRIDZ)
2104 zr -= GRIDZ;
2105 if(zrr >= GRIDZ)
2106 zrr -= GRIDZ;
2107 if(zl < 0)
2108 zl += GRIDZ;
2109 if(zll < 0)
2110 zll += GRIDZ;
2111
2112 forcep[z] = fac * ((4.0 / 3) * (potp[zl] - potp[zr]) - (1.0 / 6) * (potp[zll] - potp[zrr]));
2113 }
2114 }
2115
2116#ifdef PM_ZOOM_OPTIMIZED
2117 pmforce_zoom_optimized_readout_forces_or_potential(forcegrid, 2);
2118#else
2119 pmforce_uniform_optimized_readout_forces_or_potential_xy(forcegrid, 2);
2120
2121 /* at this point we can free partin */
2122 Mem.myfree_movable(partin);
2123 partin = NULL;
2124#endif
2125
2126 /* y-direction */
2127 my_fft_swap23(&myplan, rhogrid, forcegrid); // rhogrid contains potential field, forcegrid the transposed field
2128
2129 /* to allow an in-place update, copy each column to a scratch buffer first, since the stencil reads neighbours that would otherwise already be overwritten */
2130 fft_real *column = (fft_real *)Mem.mymalloc("column", GRIDY * sizeof(fft_real));
2131
2132 for(large_array_offset i = 0; i < myplan.ncol_XZ; i++)
2133 {
2134 memcpy(column, &forcegrid[GRIDY * i], GRIDY * sizeof(fft_real));
2135
2136 fft_real *potp = column;
2137 fft_real *forcep = &forcegrid[GRIDY * i];
2138
2139 for(int y = 0; y < GRIDY; y++)
2140 {
2141 int yr = y + 1;
2142 int yl = y - 1;
2143 int yrr = y + 2;
2144 int yll = y - 2;
2145
2146 if(yr >= GRIDY)
2147 yr -= GRIDY;
2148 if(yrr >= GRIDY)
2149 yrr -= GRIDY;
2150 if(yl < 0)
2151 yl += GRIDY;
2152 if(yll < 0)
2153 yll += GRIDY;
2154
2155 forcep[y] = fac * ((4.0 / 3) * (potp[yl] - potp[yr]) - (1.0 / 6) * (potp[yll] - potp[yrr]));
2156 }
2157 }
2158
2159 Mem.myfree(column);
2160
2161 /* now need to read out from forcegrid in a non-standard way */
2162
2163#ifdef PM_ZOOM_OPTIMIZED
2164 /* need a third field as scratch space */
2165 fft_real *scratch = (fft_real *)Mem.mymalloc("scratch", myplan.fftsize * sizeof(fft_real));
2166
2167 my_fft_swap23back(&myplan, forcegrid, scratch);
2168 pmforce_zoom_optimized_readout_forces_or_potential(scratch, 1);
2169
2170 Mem.myfree(scratch);
2171#else
2172 pmforce_uniform_optimized_readout_forces_or_potential_xz(forcegrid, 1);
2173#endif
2174
2175 /* x-direction */
2176 my_fft_swap13(&myplan, rhogrid, forcegrid); // rhogrid contains potential field
2177
2178 for(large_array_offset i = 0; i < myplan.ncol_ZY; i++)
2179 {
2180 fft_real *forcep = &rhogrid[GRIDX * i];
2181 fft_real *potp = &forcegrid[GRIDX * i];
2182
2183 for(int x = 0; x < GRIDX; x++)
2184 {
2185 int xr = x + 1;
2186 int xl = x - 1;
2187 int xrr = x + 2;
2188 int xll = x - 2;
2189
2190 if(xr >= GRIDX)
2191 xr -= GRIDX;
2192 if(xrr >= GRIDX)
2193 xrr -= GRIDX;
2194 if(xl < 0)
2195 xl += GRIDX;
2196 if(xll < 0)
2197 xll += GRIDX;
2198
2199 forcep[x] = fac * ((4.0 / 3) * (potp[xl] - potp[xr]) - (1.0 / 6) * (potp[xll] - potp[xrr]));
2200 }
2201 }
2202
2203 /* now need to read out from forcegrid in a non-standard way */
2204#ifdef PM_ZOOM_OPTIMIZED
2205 my_fft_swap13back(&myplan, rhogrid, forcegrid);
2206 pmforce_zoom_optimized_readout_forces_or_potential(forcegrid, 0);
2207#else
2208 pmforce_uniform_optimized_readout_forces_or_potential_zy(rhogrid, 0);
2209#endif
2210
2211#endif
2212 }
2213
2214 /* free stuff */
2215
2216 Mem.myfree(forcegrid);
2217 Mem.myfree(rhogrid);
2218
2219#ifdef PM_ZOOM_OPTIMIZED
2220 Mem.myfree(localfield_recvcount);
2221 Mem.myfree(localfield_offset);
2222 Mem.myfree(localfield_sendcount);
2223 Mem.myfree(localfield_first);
2224 Mem.myfree(localfield_data);
2225 Mem.myfree(localfield_globalindex);
2226 Mem.myfree(part);
2227#else
2228#ifndef FFT_COLUMN_BASED
2229 Mem.myfree(partin);
2230#endif
2231 Mem.myfree(Rcvpm_offset);
2232 Mem.myfree(Rcvpm_count);
2233 Mem.myfree(Sndpm_offset);
2234 Mem.myfree(Sndpm_count);
2235#endif
2236
2237 double tend = Logs.second();
2238
2239 if(mode == 0)
2240 mpi_printf("PM-PERIODIC: done. (took %g seconds)\n", Logs.timediff(tstart, tend));
2241}
2242
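/* [Editor's note] Usage, as inferred from the call sites in pmforce_do_powerspec()
 * below: mode == 0 computes the long-range PM force (and potential), while modes
 * 1..3 only deposit the density and measure the unfolded, folded and twice-folded
 * power spectrum, respectively. A sketch of a typical calling sequence (the
 * mode-0 call with a NULL typelist is an assumption):
 *
 *   pmforce_periodic(0, NULL);       // long-range force for all particles
 *   pmforce_periodic(1, typeflag);   // unfolded power spectrum for selected types
 */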
2243#ifdef GRAVITY_TALLBOX
2244
2248void pm_periodic::pmforce_setup_tallbox_kernel(void)
2249{
2250 double d = All.BoxSize / PMGRID;
2251
2252 mpi_printf("PM-PERIODIC: Setting up tallbox kernel (GRIDX=%d, GRIDY=%d, GRIDZ=%d)\n", GRIDX, GRIDY, GRIDZ);
2253
2254 /* now set up kernel and its Fourier transform */
2255
2256 for(int i = 0; i < maxfftsize; i++) /* clear local field */
2257 kernel[i] = 0;
2258
2259#ifndef FFT_COLUMN_BASED
2260 for(int i = myplan.slabstart_x; i < (myplan.slabstart_x + myplan.nslab_x); i++)
2261 for(int j = 0; j < GRIDY; j++)
2262 {
2263#else
2264 for(int c = myplan.firstcol_XY; c < (myplan.firstcol_XY + myplan.ncol_XY); c++)
2265 {
2266 int i = c / GRIDY;
2267 int j = c % GRIDY;
2268#endif
2269
2270 for(int k = 0; k < GRIDZ; k++)
2271 {
2272 int ii, jj, kk;
2273
2274 if(i >= (GRIDX / 2))
2275 ii = i - GRIDX;
2276 else
2277 ii = i;
2278 if(j >= (GRIDY / 2))
2279 jj = j - GRIDY;
2280 else
2281 jj = j;
2282 if(k >= (GRIDZ / 2))
2283 kk = k - GRIDZ;
2284 else
2285 kk = k;
2286
2287 double xx = ii * d;
2288 double yy = jj * d;
2289 double zz = kk * d;
2290
2291 double pot = pmperiodic_tallbox_long_range_potential(xx, yy, zz);
2292
2293#ifndef FFT_COLUMN_BASED
2294 size_t ip = FI(i - myplan.slabstart_x, j, k);
2295#else
2296 size_t ip = FCxy(c, k);
2297#endif
2298 kernel[ip] = pot / All.BoxSize;
2299 }
2300
2301#ifndef FFT_COLUMN_BASED
2302 }
2303#else
2304 }
2305#endif
2306
2307 /* Do the FFT of the kernel */
2308
2309 fft_real *workspc = (fft_real *)Mem.mymalloc("workspc", maxfftsize * sizeof(fft_real));
2310
2311#ifndef FFT_COLUMN_BASED
2312 my_slab_based_fft(&myplan, kernel, workspc, 1);
2313#else
2314 my_column_based_fft(&myplan, kernel, workspc, 1); /* result is in workspace, not in kernel */
2315 memcpy(kernel, workspc, maxfftsize * sizeof(fft_real));
2316#endif
2317
2318 Mem.myfree(workspc);
2319
2320 mpi_printf("PM-PERIODIC: Done setting up tallbox kernel\n");
2321}
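/* [Editor's note] In pmforce_periodic() above, this kernel enters through a
 * pointwise complex multiplication of fft_of_rhogrid with fft_of_kernel, i.e. a
 * convolution in real space. A minimal sketch of that complex product (editorial
 * helper name; fft_complex is a two-element real array, as in FFTW):
 */
static inline void complex_multiply(const fft_complex a, const fft_complex b, fft_complex out)
{
  out[0] = a[0] * b[0] - a[1] * b[1]; /* real part */
  out[1] = a[0] * b[1] + a[1] * b[0]; /* imaginary part */
}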
2322
2323double pm_periodic::pmperiodic_tallbox_long_range_potential(double x, double y, double z)
2324{
2325 x /= All.BoxSize;
2326 y /= All.BoxSize;
2327 z /= All.BoxSize;
2328
2329 double r = sqrt(x * x + y * y + z * z);
2330
2331 double xx, yy, zz;
2332 switch(GRAVITY_TALLBOX)
2333 {
2334 case 0:
2335 xx = y;
2336 yy = z;
2337 zz = x;
2338 break;
2339 case 1:
2340 xx = x;
2341 yy = z;
2342 zz = y;
2343 break;
2344 case 2:
2345 xx = x;
2346 yy = y;
2347 zz = z;
2348 break;
2349 }
2350 x = xx;
2351 y = yy;
2352 z = zz;
2353
2354 /* the third dimension, z, is now the non-periodic one */
2355
2356 double leff = sqrt(BOXX * BOXY);
2357 double alpha = 2.0 / leff;
2358
2359 double sum1 = 0.0;
2360
2361 int qxmax = (int)(10.0 / (BOXX * alpha) + 0.5);
2362 int qymax = (int)(10.0 / (BOXY * alpha) + 0.5);
2363
2364 int nxmax = (int)(4.0 * alpha * BOXX + 0.5);
2365 int nymax = (int)(4.0 * alpha * BOXY + 0.5);
2366
2367 for(int nx = -qxmax; nx <= qxmax; nx++)
2368 for(int ny = -qymax; ny <= qymax; ny++)
2369 {
2370 if(nx != 0 || ny != 0)
2371 {
2372 double dx = x - nx * BOXX;
2373 double dy = y - ny * BOXY;
2374 double r = sqrt(dx * dx + dy * dy + z * z);
2375 if(r > 0)
2376 sum1 += erfc(alpha * r) / r;
2377 }
2378 else
2379 {
2380 // in the nx/ny=0 case, correct for the short range force
2381 double alpha_star = 0.5 / (((double)ASMTH) / PMGRID);
2382 double u = alpha_star * r;
2383 if(r > 0)
2384 sum1 += (erfc(alpha * r) - erfc(u)) / r;
2385 else
2386 sum1 += 2.0 / sqrt(M_PI) * (alpha_star - alpha);
2387 }
2388 }
2389
2390 double alpha2 = alpha * alpha;
2391
2392 double sum2 = 0.0;
2393
2394 for(int nx = -nxmax; nx <= nxmax; nx++)
2395 for(int ny = -nymax; ny <= nymax; ny++)
2396 {
2397 if(nx != 0 || ny != 0)
2398 {
2399 double kx = (2.0 * M_PI / BOXX) * nx;
2400 double ky = (2.0 * M_PI / BOXY) * ny;
2401 double k2 = kx * kx + ky * ky;
2402 double k = sqrt(k2);
2403
2404 if(k * z > 0)
2405 {
2406 double ex = exp(-k * z);
2407 if(ex > 0)
2408 sum2 += cos(kx * x + ky * y) * (erfc(k / (2 * alpha) + alpha * z) / ex + ex * erfc(k / (2 * alpha) - alpha * z)) / k;
2409 }
2410 else
2411 {
2412 double ex = exp(k * z);
2413 if(ex > 0)
2414 sum2 += cos(kx * x + ky * y) * (ex * erfc(k / (2 * alpha) + alpha * z) + erfc(k / (2 * alpha) - alpha * z) / ex) / k;
2415 }
2416 }
2417 }
2418
2419 sum2 *= M_PI / (BOXX * BOXY);
2420
2421 double psi = 2.0 * alpha / sqrt(M_PI) +
2422 (2 * sqrt(M_PI) / (BOXX * BOXY) * (exp(-alpha2 * z * z) / alpha + sqrt(M_PI) * z * erf(alpha * z))) - (sum1 + sum2);
2423
2424 return psi;
2425}
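/* [Editor's note] For orientation, the sum evaluated above is the Ewald-type
 * decomposition of the long-range potential for a geometry periodic in two
 * dimensions (x, y) and open in z:
 *
 *   psi(x, y, z) = 2 alpha / sqrt(pi)
 *                + (2 sqrt(pi) / (BOXX * BOXY)) * (exp(-alpha^2 z^2) / alpha + sqrt(pi) * z * erf(alpha * z))
 *                - (sum1 + sum2),
 *
 * with sum1 the short-range real-space sum over the periodic images and sum2 the
 * corresponding reciprocal-space sum; alpha = 2 / sqrt(BOXX * BOXY) tunes the
 * split between the two sums.
 */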
2426#endif
2427
2428/*----------------------------------------------------------------------------------------------------*/
2429/* Here comes code for the power-spectrum computation */
2430/*----------------------------------------------------------------------------------------------------*/
2431
2432void pm_periodic::calculate_power_spectra(int num)
2433{
2434 int n_type[NTYPES];
2435 long long ntot_type_all[NTYPES];
2436 /* determine global and local particle numbers */
2437 for(int n = 0; n < NTYPES; n++)
2438 n_type[n] = 0;
2439 for(int n = 0; n < Sp->NumPart; n++)
2440 n_type[Sp->P[n].getType()]++;
2441
2442 sumup_large_ints(NTYPES, n_type, ntot_type_all, Communicator);
2443
2444 int typeflag[NTYPES];
2445
2446 for(int i = 0; i < NTYPES; i++)
2447 typeflag[i] = 1;
2448
2449#ifdef HIERARCHICAL_GRAVITY
2450 int flag_extra_allocate = 0;
2451 if(Sp->TimeBinsGravity.ActiveParticleList == NULL)
2452 {
2453 flag_extra_allocate = 1;
2454 Sp->TimeBinsGravity.timebins_allocate();
2455 }
2456
2457 Sp->TimeBinsGravity.NActiveParticles = 0;
2458 for(int i = 0; i < Sp->NumPart; i++)
2459 Sp->TimeBinsGravity.ActiveParticleList[Sp->TimeBinsGravity.NActiveParticles++] = i;
2460#endif
2461
2462 if(ThisTask == 0)
2463 {
2464 char buf[MAXLEN_PATH_EXTRA];
2465 sprintf(buf, "%s/powerspecs", All.OutputDir);
2466 mkdir(buf, 02755);
2467 }
2468
2469 sprintf(power_spec_fname, "%s/powerspecs/powerspec_%03d.txt", All.OutputDir, num);
2470
2471 pmforce_do_powerspec(typeflag); /* calculate power spectrum for all particle types */
2472
2473 /* check whether more than one type is in use */
2474 int count_types = 0;
2475 for(int i = 0; i < NTYPES; i++)
2476 if(ntot_type_all[i] > 0)
2477 count_types++;
2478
2479 if(count_types > 1)
2480 for(int i = 0; i < NTYPES; i++)
2481 {
2482 if(ntot_type_all[i] > 0)
2483 {
2484 for(int j = 0; j < NTYPES; j++)
2485 typeflag[j] = 0;
2486
2487 typeflag[i] = 1;
2488
2489 sprintf(power_spec_fname, "%s/powerspecs/powerspec_type%d_%03d.txt", All.OutputDir, i, num);
2490
2491 pmforce_do_powerspec(typeflag); /* calculate power spectrum for type i */
2492 }
2493 }
2494
2495#ifdef HIERARCHICAL_GRAVITY
2496 if(flag_extra_allocate)
2497 Sp->TimeBinsGravity.timebins_free();
2498#endif
2499}
2500
2501void pm_periodic::pmforce_do_powerspec(int *typeflag)
2502{
2503 mpi_printf("POWERSPEC: Begin power spectrum. (typeflag=[");
2504 for(int i = 0; i < NTYPES; i++)
2505 mpi_printf(" %d ", typeflag[i]);
2506 mpi_printf("])\n");
2507
2508 double tstart = Logs.second();
2509
2510 pmforce_periodic(1, typeflag); /* calculate regular power spectrum for selected particle types */
2511
2512 pmforce_periodic(2, typeflag); /* calculate folded power spectrum for selected particle types */
2513
2514 pmforce_periodic(3, typeflag); /* calculate twice folded power spectrum for selected particle types */
2515
2516 double tend = Logs.second();
2517
2518 mpi_printf("POWERSPEC: End power spectrum. (took %g seconds)\n", Logs.timediff(tstart, tend));
2519}
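/* [Editor's note] Folding shrinks the effective box seen by the mesh, so each
 * fold extends the measurable k-range by a factor POWERSPEC_FOLDFAC; in
 * pmforce_measure_powerspec() below this simply rescales the wavenumber of
 * every mode. A sketch of that rescaling (editorial helper name):
 */
static inline double folded_wavenumber(double k, int fold_level)
{
  for(int f = 0; f < fold_level; f++) /* fold_level = 0, 1 or 2, matching 'flag' below */
    k *= POWERSPEC_FOLDFAC;
  return k;
}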
2520
2521void pm_periodic::pmforce_measure_powerspec(int flag, int *typeflag)
2522{
2523 particle_data *P = Sp->P;
2524
2525 long long CountModes[BINS_PS];
2526 double SumPowerUncorrected[BINS_PS]; /* without binning correction (as for shot noise) */
2527 double PowerUncorrected[BINS_PS]; /* without binning correction */
2528 double DeltaUncorrected[BINS_PS]; /* without binning correction */
2529 double ShotLimit[BINS_PS];
2530 double KWeightSum[BINS_PS];
2531 double Kbin[BINS_PS];
2532
2533 double mass = 0, mass2 = 0, count = 0;
2534 for(int i = 0; i < Sp->NumPart; i++)
2535 if(typeflag[P[i].getType()])
2536 {
2537 mass += Sp->P[i].getMass();
2538 mass2 += Sp->P[i].getMass() * Sp->P[i].getMass();
2539 count += 1.0;
2540 }
2541
2542 MPI_Allreduce(MPI_IN_PLACE, &mass, 1, MPI_DOUBLE, MPI_SUM, Communicator);
2543 MPI_Allreduce(MPI_IN_PLACE, &mass2, 1, MPI_DOUBLE, MPI_SUM, Communicator);
2544 MPI_Allreduce(MPI_IN_PLACE, &count, 1, MPI_DOUBLE, MPI_SUM, Communicator);
2545
2546 double d = All.BoxSize / PMGRID;
2547 double dhalf = 0.5 * d;
2548
2549 double fac = 1.0 / mass;
2550
2551 double K0 = 2 * M_PI / All.BoxSize; /* minimum k */
2552 double K1 = 2 * M_PI / All.BoxSize * (POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC * PMGRID / 2); /* maximum k that can be measured */
2553 double binfac = BINS_PS / (log(K1) - log(K0));
2554
2555 double kfacx = 2.0 * M_PI * LONG_X / All.BoxSize;
2556 double kfacy = 2.0 * M_PI * LONG_Y / All.BoxSize;
2557 double kfacz = 2.0 * M_PI * LONG_Z / All.BoxSize;
2558
2559 for(int i = 0; i < BINS_PS; i++)
2560 {
2561 SumPowerUncorrected[i] = 0;
2562 CountModes[i] = 0;
2563 KWeightSum[i] = 0;
2564 }
2565
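/* [Editor's note] The binning below is logarithmic: BINS_PS bins uniform in
 * ln k between K0 and K1, so a mode of wavenumber k falls into bin
 * floor(log(k / K0) * binfac). A sketch of the mapping (editorial helper name,
 * excluded from compilation):
 */
#if 0
static inline int power_spec_bin(double k, double K0, double binfac)
{
  return (int)(log(k / K0) * binfac); /* valid for K0 <= k < K1 */
}
#endif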
2566#ifdef FFT_COLUMN_BASED
2567 for(large_array_offset ip = 0; ip < myplan.second_transposed_ncells; ip++)
2568 {
2569 large_array_offset ipcell = ip + ((large_array_offset)myplan.second_transposed_firstcol) * GRIDX;
2570 int y = ipcell / (GRIDX * GRIDz);
2571 int yr = ipcell % (GRIDX * GRIDz);
2572 int z = yr / GRIDX;
2573 int x = yr % GRIDX;
2574#else
2575 for(int y = myplan.slabstart_y; y < myplan.slabstart_y + myplan.nslab_y; y++)
2576 for(int x = 0; x < GRIDX; x++)
2577 for(int z = 0; z < GRIDz; z++)
2578 {
2579#endif
2580 int count_double;
2581
2582 if(z >= 1 &&
2583 z < (GRIDZ + 1) / 2) /* these modes need to be counted twice due to the storage scheme for the FFT of a real field */
2584 count_double = 1;
2585 else
2586 count_double = 0;
2587
2588 int xx, yy, zz;
2589
2590 if(x >= (GRIDX / 2))
2591 xx = x - GRIDX;
2592 else
2593 xx = x;
2594
2595 if(y >= (GRIDY / 2))
2596 yy = y - GRIDY;
2597 else
2598 yy = y;
2599
2600 if(z >= (GRIDZ / 2))
2601 zz = z - GRIDZ;
2602 else
2603 zz = z;
2604
2605 double kx = kfacx * xx;
2606 double ky = kfacy * yy;
2607 double kz = kfacz * zz;
2608
2609 double k2 = kx * kx + ky * ky + kz * kz;
2610
2611 if(k2 > 0)
2612 {
2613 /* do deconvolution */
2614 double fx = 1, fy = 1, fz = 1;
2615
2616 if(xx != 0)
2617 {
2618 fx = kx * dhalf;
2619 fx = sin(fx) / fx;
2620 }
2621 if(yy != 0)
2622 {
2623 fy = ky * dhalf;
2624 fy = sin(fy) / fy;
2625 }
2626 if(zz != 0)
2627 {
2628 fz = kz * dhalf;
2629 fz = sin(fz) / fz;
2630 }
2631 double ff = 1 / (fx * fy * fz);
2632 double smth = ff * ff * ff * ff;
 2633 /*
 2634 * Note: The Fourier transform of the density field (rho_hat) must be multiplied by ff^2
 2635 * in order to do the deconvolution. That's why po = |rho_hat|^2 gains a factor of ff^4.
 2636 */
2637 /* end deconvolution */
2638
2639#ifndef FFT_COLUMN_BASED
2640 large_array_offset ip = ((large_array_offset)GRIDz) * (GRIDX * (y - myplan.slabstart_y) + x) + z;
2641#endif
2642
2643 double po = (fft_of_rhogrid[ip][0] * fft_of_rhogrid[ip][0] + fft_of_rhogrid[ip][1] * fft_of_rhogrid[ip][1]);
2644
2645 po *= fac * fac * smth;
2646
2647 double k = sqrt(k2);
2648
2649 if(flag == 1)
2650 k *= POWERSPEC_FOLDFAC;
2651 else if(flag == 2)
2652 k *= POWERSPEC_FOLDFAC * POWERSPEC_FOLDFAC;
2653
2654 if(k >= K0 && k < K1)
2655 {
2656 int bin = log(k / K0) * binfac;
2657
2658 SumPowerUncorrected[bin] += po;
2659 CountModes[bin] += 1;
2660 KWeightSum[bin] += log(k);
2661
2662 if(count_double)
2663 {
2664 SumPowerUncorrected[bin] += po;
2665 CountModes[bin] += 1;
2666 KWeightSum[bin] += log(k);
2667 }
2668 }
2669 }
2670 }
2671
2672 MPI_Allreduce(MPI_IN_PLACE, SumPowerUncorrected, BINS_PS, MPI_DOUBLE, MPI_SUM, Communicator);
2673 MPI_Allreduce(MPI_IN_PLACE, CountModes, BINS_PS, MPI_LONG_LONG, MPI_SUM, Communicator);
2674 MPI_Allreduce(MPI_IN_PLACE, KWeightSum, BINS_PS, MPI_DOUBLE, MPI_SUM, Communicator);
2675
2676 int count_non_zero_bins = 0;
2677 for(int i = 0; i < BINS_PS; i++)
2678 {
2679 if(CountModes[i] > 0)
2680 {
2681 Kbin[i] = exp(KWeightSum[i] / CountModes[i]);
2682 count_non_zero_bins++;
2683 }
2684 else
2685 Kbin[i] = exp((i + 0.5) / binfac + log(K0));
2686
2687 if(CountModes[i] > 0)
2688 PowerUncorrected[i] = SumPowerUncorrected[i] / CountModes[i];
2689 else
2690 PowerUncorrected[i] = 0;
2691
2692 DeltaUncorrected[i] = 4 * M_PI * pow(Kbin[i], 3) / pow(2 * M_PI / All.BoxSize, 3) * PowerUncorrected[i];
2693
2694 ShotLimit[i] = 4 * M_PI * pow(Kbin[i], 3) / pow(2 * M_PI / All.BoxSize, 3) * (mass2 / (mass * mass));
2695 }
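/* [Editor's note] The conversions above use the conventional dimensionless power
 * spectrum, Delta^2(k) = 4 pi k^3 P(k) / (2 pi / BoxSize)^3 in the grid's mode
 * units, and the Poisson shot-noise level for a set of unequal-mass particles,
 * P_shot = (sum m_i^2) / (sum m_i)^2 = mass2 / (mass * mass) in the same
 * normalization, which is what ShotLimit expresses as a Delta^2 value.
 */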
2696
2697 /* store the result */
2698 if(ThisTask == 0)
2699 {
2700 FILE *fd;
2701
2702 if(flag == 0)
2703 {
2704 if(!(fd = fopen(power_spec_fname, "w"))) /* store the unfolded spectrum */
2705 Terminate("can't open file `%s`\n", power_spec_fname);
2706 }
2707 else if(flag == 1 || flag == 2)
2708 {
2709 if(!(fd = fopen(power_spec_fname, "a"))) /* append the file, store the folded spectrum */
2710 Terminate("can't open file `%s`\n", power_spec_fname);
2711 }
2712 else
2713 Terminate("Something wrong.\n");
2714
2715 fprintf(fd, "%g\n", All.Time);
2716 fprintf(fd, "%d\n", count_non_zero_bins);
2717 fprintf(fd, "%g\n", All.BoxSize);
2718 fprintf(fd, "%d\n", (int)(PMGRID));
2720 fprintf(fd, "%g\n", All.ComovingIntegrationOn > 0 ? linear_growth_factor(All.Time, 1.0) : 1.0);
2721
2722 for(int i = 0; i < BINS_PS; i++)
2723 if(CountModes[i] > 0)
2724 fprintf(fd, "%g %g %g %g %g\n", Kbin[i], DeltaUncorrected[i], PowerUncorrected[i], (double)CountModes[i], ShotLimit[i]);
2725
2726 if(flag == 2)
2727 {
2728 fprintf(fd, "%g\n", mass);
2729 fprintf(fd, "%g\n", count);
2730 fprintf(fd, "%g\n", mass * mass / mass2);
2731 }
2732
2733 fclose(fd);
2734 }
2735}
2736
2737#endif