GADGET-4
pm_nonperiodic.cc
Go to the documentation of this file.
1/*******************************************************************************
2 * \copyright This file is part of the GADGET4 N-body/SPH code developed
3 * \copyright by Volker Springel. Copyright (C) 2014-2020 by Volker Springel
4 * \copyright (vspringel@mpa-garching.mpg.de) and all contributing authors.
5 *******************************************************************************/
6
12#include "gadgetconfig.h"
13
14#if defined(PMGRID) && (!defined(PERIODIC) || defined(PLACEHIGHRESREGION))
15
16#include <fftw3.h>
17#include <math.h>
18#include <mpi.h>
19#include <stdio.h>
20#include <stdlib.h>
21#include <string.h>
22
23#include "../data/allvars.h"
24#include "../data/dtypes.h"
25#include "../data/intposconvert.h"
26#include "../data/mymalloc.h"
27#include "../logs/timer.h"
28#include "../main/simulation.h"
29#include "../mpi_utils/mpi_utils.h"
30#include "../pm/pm.h"
31#include "../pm/pm_mpi_fft.h"
32#include "../pm/pm_nonperiodic.h"
33#include "../sort/cxxsort.h"
34#include "../src/gravtree/gravtree.h"
35#include "../src/time_integration/timestep.h"
36#include "../system/system.h"
37
38#define GRID (HRPMGRID)
39#define GRIDz (GRID / 2 + 1)
40#define GRID2 (2 * GRIDz)
41
42#define FI(x, y, z) (((large_array_offset)GRID2) * (GRID * (x) + (y)) + (z))
43#define FC(c, z) (((large_array_offset)GRID2) * ((c)-myplan.firstcol_XY) + (z))
44#define TI(x, y, z) (((large_array_offset)GRID) * ((x) + (y)*myplan.nslab_x) + (z))
45
55void pm_nonperiodic::pm_init_regionsize(void)
56{
57 /* first, find a reference coordinate by selecting an arbitrary particle in the respective regions. For definiteness, we choose the
58 * first particle */
59
60 particle_data *P = Sp->P;
61 int have_low_mesh = NTask, have_high_mesh = NTask; /* default is we don't have a particle */
62
63 if(Sp->NumPart > 0)
64 {
65 for(int j = 0; j < 3; j++)
66 Sp->ReferenceIntPos[LOW_MESH][j] = P[0].IntPos[j];
67
68 have_low_mesh = ThisTask;
69 }
70
71 for(int i = 0; i < Sp->NumPart; i++)
72 {
73#ifdef PLACEHIGHRESREGION
74 if(((1 << P[i].getType()) & (PLACEHIGHRESREGION)))
75 {
76 for(int j = 0; j < 3; j++)
77 Sp->ReferenceIntPos[HIGH_MESH][j] = P[i].IntPos[j];
78
79 have_high_mesh = ThisTask;
80 break;
81 }
82#endif
83 }
84
85 int have_global[4] = {have_low_mesh, ThisTask, have_high_mesh, ThisTask};
86
87 MPI_Allreduce(MPI_IN_PLACE, have_global, 2, MPI_2INT, MPI_MINLOC, Communicator);
88
89 if(have_global[0] >= NTask)
90 Terminate("have_global[0] >= NTask: Don't we have any particle?");
91
92 MPI_Bcast(&Sp->ReferenceIntPos[LOW_MESH][0], 3 * sizeof(MyIntPosType), MPI_BYTE, have_global[1], Communicator);
93
94#ifdef PLACEHIGHRESREGION
95 if(have_global[2] >= NTask)
96 Terminate("have_global[2] >= NTask: Apparently there are no particles in high res region");
97
98 MPI_Bcast(&Sp->ReferenceIntPos[HIGH_MESH][0], 3 * sizeof(MyIntPosType), MPI_BYTE, have_global[3], Communicator);
99#endif
100
101 /* find enclosing rectangle */
102
103 MySignedIntPosType xmin[2][3], xmax[2][3];
104
105 for(int j = 0; j < 3; j++)
106 {
107 xmin[LOW_MESH][j] = xmin[HIGH_MESH][j] = 0;
108 xmax[LOW_MESH][j] = xmax[HIGH_MESH][j] = 0;
109 }
110
111 for(int i = 0; i < Sp->NumPart; i++)
112 {
113 MyIntPosType diff[3] = {P[i].IntPos[0] - Sp->ReferenceIntPos[LOW_MESH][0], P[i].IntPos[1] - Sp->ReferenceIntPos[LOW_MESH][1],
114 P[i].IntPos[2] - Sp->ReferenceIntPos[LOW_MESH][2]};
115
116 MySignedIntPosType *delta = (MySignedIntPosType *)diff;
117
118 for(int j = 0; j < 3; j++)
119 {
120 if(delta[j] > xmax[LOW_MESH][j])
121 xmax[LOW_MESH][j] = delta[j];
122 if(delta[j] < xmin[LOW_MESH][j])
123 xmin[LOW_MESH][j] = delta[j];
124 }
125
126#ifdef PLACEHIGHRESREGION
127 if(((1 << P[i].getType()) & (PLACEHIGHRESREGION)))
128 {
129 MyIntPosType diff[3] = {P[i].IntPos[0] - Sp->ReferenceIntPos[HIGH_MESH][0],
130 P[i].IntPos[1] - Sp->ReferenceIntPos[HIGH_MESH][1],
131 P[i].IntPos[2] - Sp->ReferenceIntPos[HIGH_MESH][2]};
132
133 MySignedIntPosType *delta = (MySignedIntPosType *)diff;
134
135 for(int j = 0; j < 3; j++)
136 {
137 if(delta[j] > xmax[HIGH_MESH][j])
138 xmax[HIGH_MESH][j] = delta[j];
139 if(delta[j] < xmin[HIGH_MESH][j])
140 xmin[HIGH_MESH][j] = delta[j];
141 }
142 }
143#endif
144 }
145
146 MPI_Allreduce(xmin, Sp->Xmintot, 6, MPI_MyIntPosType, MPI_MIN_MySignedIntPosType, Communicator);
147 MPI_Allreduce(xmax, Sp->Xmaxtot, 6, MPI_MyIntPosType, MPI_MAX_MySignedIntPosType, Communicator);
148
149 for(int i = 0; i < 3; i++)
150 {
151 Sp->Xmaxtot[LOW_MESH][i] += 1; /* so that all particles fulfill xmin <= pos < xmax instead of xmin <= pos <= xmax*/
152 Sp->Xmaxtot[HIGH_MESH][i] += 1;
153 }
154
155 MyIntPosType inner_meshsize[2], enclosing_meshsize[2];
156 int flag_recompute_kernel = 0;
157
158 for(int mesh = 0; mesh < 2; mesh++)
159 {
160 inner_meshsize[mesh] = (MyIntPosType)(Sp->Xmaxtot[mesh][0] - Sp->Xmintot[mesh][0]);
161
162 if((MyIntPosType)(Sp->Xmaxtot[mesh][1] - Sp->Xmintot[mesh][1]) > inner_meshsize[mesh])
163 inner_meshsize[mesh] = Sp->Xmaxtot[mesh][1] - Sp->Xmintot[mesh][1];
164
165 if((MyIntPosType)(Sp->Xmaxtot[mesh][2] - Sp->Xmintot[mesh][2]) > inner_meshsize[mesh])
166 inner_meshsize[mesh] = Sp->Xmaxtot[mesh][2] - Sp->Xmintot[mesh][2];
167 }
168
169 for(int mesh = 0; mesh < 2; mesh++)
170 {
171#ifdef PERIODIC
172 if(mesh == LOW_MESH)
173 continue;
174#endif
175#ifndef PLACEHIGHRESREGION
176 if(mesh == HIGH_MESH)
177 continue;
178#endif
179
180 MyIntPosType blocksize = 1;
181 MyIntPosType mask = ~((MyIntPosType)0);
182
183 if(mesh == LOW_MESH)
184 {
185 MyIntPosType refsize = inner_meshsize[mesh] >> 4; /* pick 1/8 as reference size */
186
187 for(int i = 0; i < BITS_FOR_POSITIONS; i++)
188 {
189 if(blocksize >= refsize)
190 break;
191
192 blocksize <<= 1;
193 mask <<= 1;
194 }
195 }
196 else
197 {
198 blocksize = Sp->PlacingBlocksize;
199 }
200
202 "PM-NONPERIODIC: Allowed region for isolated PM mesh (%s): BEFORE (%g|%g|%g) -> (%g|%g|%g) inner meshsize=%g "
203 "blocksize=%g\n",
204 mesh == LOW_MESH ? "coarse" : "fine", Sp->FacIntToCoord * Sp->Xmintot[mesh][0], Sp->FacIntToCoord * Sp->Xmintot[mesh][1],
205 Sp->FacIntToCoord * Sp->Xmintot[mesh][2], Sp->FacIntToCoord * Sp->Xmaxtot[mesh][0], Sp->FacIntToCoord * Sp->Xmaxtot[mesh][1],
206 Sp->FacIntToCoord * Sp->Xmaxtot[mesh][2], Sp->FacIntToCoord * inner_meshsize[mesh], Sp->FacIntToCoord * blocksize);
207
208 enclosing_meshsize[mesh] = 0;
209
210 /* expand the box so that it aligns with blocksize */
211 for(int i = 0; i < 3; i++)
212 {
213 MyIntPosType left, right;
214 if(mesh == LOW_MESH)
215 {
216 /* now round it down */
217 left = ((Sp->Xmintot[mesh][i] + Sp->Xmaxtot[mesh][i]) / 2 - inner_meshsize[mesh] / 2) + Sp->ReferenceIntPos[mesh][i];
218 left &= mask;
219
220 /* now round it up */
221 right = ((Sp->Xmintot[mesh][i] + Sp->Xmaxtot[mesh][i]) / 2 + inner_meshsize[mesh] / 2) + Sp->ReferenceIntPos[mesh][i];
222 right &= mask;
223 right += blocksize;
224 }
225 else
226 {
227 left = (Sp->ReferenceIntPos[HIGH_MESH][i] + Sp->Xmintot[HIGH_MESH][i]) & Sp->PlacingMask;
228 right = left + Sp->PlacingBlocksize;
229 }
230
231 Sp->Xmintot[mesh][i] = left - Sp->ReferenceIntPos[mesh][i];
232 Sp->Xmaxtot[mesh][i] = right - Sp->ReferenceIntPos[mesh][i];
233
234 Sp->Left[mesh][i] = left;
235
236 Sp->MeshSize[mesh][i] = Sp->Xmaxtot[mesh][i] - Sp->Xmintot[mesh][i];
237
238 if(Sp->MeshSize[mesh][i] > enclosing_meshsize[mesh])
239 enclosing_meshsize[mesh] = Sp->MeshSize[mesh][i];
240 }
241
243 "PM-NONPERIODIC: Allowed region for isolated PM mesh (%s): AFTER (%g|%g|%g) -> (%g|%g|%g) enclosing_meshsize=%g "
244 "absolute space (%g|%g|%g) -> (%g|%g|%g)\n",
245 mesh == LOW_MESH ? "coarse" : "fine", Sp->FacIntToCoord * Sp->Xmintot[mesh][0], Sp->FacIntToCoord * Sp->Xmintot[mesh][1],
246 Sp->FacIntToCoord * Sp->Xmintot[mesh][2], Sp->FacIntToCoord * Sp->Xmaxtot[mesh][0], Sp->FacIntToCoord * Sp->Xmaxtot[mesh][1],
247 Sp->FacIntToCoord * Sp->Xmaxtot[mesh][2], Sp->FacIntToCoord * enclosing_meshsize[mesh],
248 Sp->FacIntToCoord * (Sp->Xmintot[mesh][0] + Sp->ReferenceIntPos[mesh][0]),
249 Sp->FacIntToCoord * (Sp->Xmintot[mesh][1] + Sp->ReferenceIntPos[mesh][1]),
250 Sp->FacIntToCoord * (Sp->Xmintot[mesh][2] + Sp->ReferenceIntPos[mesh][2]),
251 Sp->FacIntToCoord * (Sp->Xmaxtot[mesh][0] + Sp->ReferenceIntPos[mesh][0]),
252 Sp->FacIntToCoord * (Sp->Xmaxtot[mesh][1] + Sp->ReferenceIntPos[mesh][1]),
253 Sp->FacIntToCoord * (Sp->Xmaxtot[mesh][2] + Sp->ReferenceIntPos[mesh][2]));
254
255 if(enclosing_meshsize[mesh] != Sp->OldMeshSize[mesh])
256 {
257 flag_recompute_kernel = 1;
258 Sp->OldMeshSize[mesh] = enclosing_meshsize[mesh];
259 }
260
261 /* this will produce enough room for zero-padding and buffer region to
262 allow finite differencing of the potential */
263 Sp->TotalMeshSize[mesh] = 2.0 * enclosing_meshsize[mesh] * (GRID / ((double)(GRID - 10)));
264
265 /* move lower left corner by two cells to allow finite differencing of the potential by a 4-point function */
266 for(int i = 0; i < 3; i++)
267 Sp->Corner[mesh][i] = Sp->Xmintot[mesh][i] - (2.0 * Sp->TotalMeshSize[mesh] / GRID);
268
269 Sp->Asmth[mesh] = ASMTH * (Sp->FacIntToCoord * Sp->TotalMeshSize[mesh]) / GRID;
270 Sp->Rcut[mesh] = RCUT * Sp->Asmth[mesh];
271 }
272
273 static int first_init_done = 0; /* for detecting restart from restartfiles */
274 if(flag_recompute_kernel || first_init_done == 0)
275 {
276#ifndef PERIODIC
277 mpi_printf("PM-NONPERIODIC: Recompute kernel course mesh: Asmth=%g Rcut=%g mesh cell size=%g\n", Sp->Asmth[LOW_MESH],
278 Sp->Rcut[LOW_MESH], Sp->FacIntToCoord * Sp->TotalMeshSize[LOW_MESH] / GRID);
279#endif
280#ifdef PLACEHIGHRESREGION
281 mpi_printf("PM-NONPERIODIC: Recompute kernel fine mesh: Asmth=%g Rcut=%g mesh cell size=%g\n", Sp->Asmth[HIGH_MESH],
282 Sp->Rcut[HIGH_MESH], Sp->FacIntToCoord * Sp->TotalMeshSize[HIGH_MESH] / GRID);
283#endif
284
285 pm_setup_nonperiodic_kernel();
286 first_init_done = 1;
287 }
288}
289
294void pm_nonperiodic::pm_init_nonperiodic(simparticles *Sp_ptr)
295{
296 Sp = Sp_ptr;
297
298 /* Set up the FFTW-3 plan files. */
299 int ndim[1] = {GRID}; /* dimension of the 1D transforms */
300
301 /* temporarily allocate some arrays to make sure that out-of-place plans are created */
302 rhogrid = (fft_real *)Mem.mymalloc("rhogrid", GRID2 * sizeof(fft_real));
303 forcegrid = (fft_real *)Mem.mymalloc("forcegrid", GRID2 * sizeof(fft_real));
304
305#ifdef DOUBLEPRECISION_FFTW
306 int alignflag = 0;
307#else
308 /* for single precision, the start of our FFT columns is presently only guaranteed to be 8-byte aligned */
309 int alignflag = FFTW_UNALIGNED;
310#endif
311#ifndef FFT_COLUMN_BASED
312 int stride = GRIDz;
313#else
314 int stride = 1;
315#endif
316
317 myplan.forward_plan_zdir = FFTW(plan_many_dft_r2c)(1, ndim, 1, rhogrid, 0, 1, GRID2, (fft_complex *)forcegrid, 0, 1, GRIDz,
318 FFTW_ESTIMATE | FFTW_DESTROY_INPUT | alignflag);
319
320 myplan.forward_plan_xdir =
321 FFTW(plan_many_dft)(1, ndim, 1, (fft_complex *)rhogrid, 0, stride, GRIDz * GRID, (fft_complex *)forcegrid, 0, stride,
322 GRIDz * GRID, FFTW_FORWARD, FFTW_ESTIMATE | FFTW_DESTROY_INPUT | alignflag);
323
324 myplan.forward_plan_ydir =
325 FFTW(plan_many_dft)(1, ndim, 1, (fft_complex *)rhogrid, 0, stride, GRIDz * GRID, (fft_complex *)forcegrid, 0, stride,
326 GRIDz * GRID, FFTW_FORWARD, FFTW_ESTIMATE | FFTW_DESTROY_INPUT | alignflag);
327
328 myplan.backward_plan_zdir = FFTW(plan_many_dft_c2r)(1, ndim, 1, (fft_complex *)rhogrid, 0, 1, GRIDz, forcegrid, 0, 1, GRID2,
329 FFTW_ESTIMATE | FFTW_DESTROY_INPUT | alignflag);
330
331 myplan.backward_plan_xdir =
332 FFTW(plan_many_dft)(1, ndim, 1, (fft_complex *)rhogrid, 0, stride, GRIDz * GRID, (fft_complex *)forcegrid, 0, stride,
333 GRIDz * GRID, FFTW_BACKWARD, FFTW_ESTIMATE | FFTW_DESTROY_INPUT | alignflag);
334
335 myplan.backward_plan_ydir =
336 FFTW(plan_many_dft)(1, ndim, 1, (fft_complex *)rhogrid, 0, stride, GRIDz * GRID, (fft_complex *)forcegrid, 0, stride,
337 GRIDz * GRID, FFTW_BACKWARD, FFTW_ESTIMATE | FFTW_DESTROY_INPUT | alignflag);
338
339 Mem.myfree(forcegrid);
340 Mem.myfree(rhogrid);
341
342#ifndef FFT_COLUMN_BASED
343
344 my_slab_based_fft_init(&myplan, GRID, GRID, GRID);
345
346 maxfftsize = myplan.largest_x_slab * GRID * ((size_t)GRID2);
347
348#else
349
350 my_column_based_fft_init(&myplan, GRID, GRID, GRID);
351
352 maxfftsize = myplan.max_datasize;
353
354#endif
355
356 /* now allocate memory to hold the FFT fields */
357
358 size_t bytes, bytes_tot = 0;
359
360#if !defined(PERIODIC)
361 kernel[0] = (fft_real *)Mem.mymalloc("kernel[0]", bytes = maxfftsize * sizeof(fft_real));
362 bytes_tot += bytes;
363 fft_of_kernel[0] = (fft_complex *)kernel[0];
364#endif
365
366#if defined(PLACEHIGHRESREGION)
367 kernel[1] = (fft_real *)Mem.mymalloc("kernel[1]", bytes = maxfftsize * sizeof(fft_real));
368 bytes_tot += bytes;
369 fft_of_kernel[1] = (fft_complex *)kernel[1];
370#endif
371
372 mpi_printf("\nPM-NONPERIODIC: Allocated %g MByte for FFT kernel(s).\n\n", bytes_tot / (1024.0 * 1024.0));
373}
374
375#ifdef PM_ZOOM_OPTIMIZED
376
377void pm_nonperiodic::pmforce_nonperiodic_zoom_optimized_prepare_density(int grnr)
378{
379 MPI_Status status;
380
381 particle_data *P = Sp->P;
382
383 double to_slab_fac = GRID / ((double)Sp->TotalMeshSize[grnr]);
384
385 part = (part_slab_data *)Mem.mymalloc("part", 8 * (NSource * sizeof(part_slab_data)));
386 large_numpart_type *part_sortindex =
387 (large_numpart_type *)Mem.mymalloc("part_sortindex", 8 * (NSource * sizeof(large_numpart_type)));
388
389 int ngrid = 0;
390
391#ifdef FFT_COLUMN_BASED
392 int columns = GRIDX * GRIDY;
393 int avg = (columns - 1) / NTask + 1;
394 int exc = NTask * avg - columns;
395 int tasklastsection = NTask - exc;
396 int pivotcol = tasklastsection * avg;
397#endif
398
399 /* determine the cells each particle accesses */
400 for(int idx = 0; idx < NSource; idx++)
401 {
402 int i = Sp->get_active_index(idx);
403
404 if(P[i].Ti_Current != All.Ti_Current)
405 Sp->drift_particle(&P[i], &Sp->SphP[i], All.Ti_Current);
406
407 MyIntPosType diff[3] = {P[i].IntPos[0] - Sp->ReferenceIntPos[grnr][0], P[i].IntPos[1] - Sp->ReferenceIntPos[grnr][1],
408 P[i].IntPos[2] - Sp->ReferenceIntPos[grnr][2]};
409
410 MySignedIntPosType *delta = (MySignedIntPosType *)diff;
411
412 if(delta[0] < Sp->Xmintot[grnr][0] || delta[0] >= Sp->Xmaxtot[grnr][0])
413 continue;
414
415 if(delta[1] < Sp->Xmintot[grnr][1] || delta[1] >= Sp->Xmaxtot[grnr][1])
416 continue;
417
418 if(delta[2] < Sp->Xmintot[grnr][2] || delta[2] >= Sp->Xmaxtot[grnr][2])
419 continue;
420
421 int slab_x = (int)(to_slab_fac * (delta[0] - Sp->Corner[grnr][0]));
422 int slab_y = (int)(to_slab_fac * (delta[1] - Sp->Corner[grnr][1]));
423 int slab_z = (int)(to_slab_fac * (delta[2] - Sp->Corner[grnr][2]));
424 int myngrid;
425
426 myngrid = ngrid;
427 ngrid += 1;
428
429 large_numpart_type index_on_grid = ((large_numpart_type)myngrid) * 8;
430
431 for(int xx = 0; xx < 2; xx++)
432 for(int yy = 0; yy < 2; yy++)
433 for(int zz = 0; zz < 2; zz++)
434 {
435 int slab_xx = slab_x + xx;
436 int slab_yy = slab_y + yy;
437 int slab_zz = slab_z + zz;
438
439 if(slab_xx >= GRID)
440 slab_xx -= GRID;
441 if(slab_yy >= GRID)
442 slab_yy -= GRID;
443 if(slab_zz >= GRID)
444 slab_zz -= GRID;
445
446 large_array_offset offset = FI(slab_xx, slab_yy, slab_zz);
447
448 part[index_on_grid].partindex = (i << 3) + (xx << 2) + (yy << 1) + zz;
449 part[index_on_grid].globalindex = offset;
450 part_sortindex[index_on_grid] = index_on_grid;
451 index_on_grid++;
452 }
453 }
454
455 /* note: num_on_grid will be 8 times larger than the particle number, but num_field_points will generally be much smaller */
456 num_on_grid = ((large_numpart_type)ngrid) * 8;
457
458 /* bring the part-field into the order of the accessed cells. This allows the removal of duplicates */
459 mycxxsort(part_sortindex, part_sortindex + num_on_grid, pm_nonperiodic_sortindex_comparator(part));
460
461 large_array_offset num_field_points;
462
463 if(num_on_grid > 0)
464 num_field_points = 1;
465 else
466 num_field_points = 0;
467
468 /* determine the number of unique field points */
469 for(large_numpart_type i = 1; i < num_on_grid; i++)
470 {
471 if(part[part_sortindex[i]].globalindex != part[part_sortindex[i - 1]].globalindex)
472 num_field_points++;
473 }
474
475 /* allocate the local field */
476 localfield_globalindex = (large_array_offset *)Mem.mymalloc_movable(&localfield_globalindex, "localfield_globalindex",
477 num_field_points * sizeof(large_array_offset));
478 localfield_data = (fft_real *)Mem.mymalloc_movable(&localfield_data, "localfield_data", num_field_points * sizeof(fft_real));
479 localfield_first = (size_t *)Mem.mymalloc_movable(&localfield_first, "localfield_first", NTask * sizeof(size_t));
480 localfield_sendcount = (size_t *)Mem.mymalloc_movable(&localfield_sendcount, "localfield_sendcount", NTask * sizeof(size_t));
481 localfield_offset = (size_t *)Mem.mymalloc_movable(&localfield_offset, "localfield_offset", NTask * sizeof(size_t));
482 localfield_recvcount = (size_t *)Mem.mymalloc_movable(&localfield_recvcount, "localfield_recvcount", NTask * sizeof(size_t));
483
484 for(int i = 0; i < NTask; i++)
485 {
486 localfield_first[i] = 0;
487 localfield_sendcount[i] = 0;
488 }
489
490 /* establish the cross link between the part[ ]-array and the local list of
491 * mesh points. Also, count on which CPU the needed field points are stored.
492 */
493 num_field_points = 0;
494 for(large_numpart_type i = 0; i < num_on_grid; i++)
495 {
496 if(i > 0)
497 if(part[part_sortindex[i]].globalindex != part[part_sortindex[i - 1]].globalindex)
498 num_field_points++;
499
500 part[part_sortindex[i]].localindex = num_field_points;
501
502 if(i > 0)
503 if(part[part_sortindex[i]].globalindex == part[part_sortindex[i - 1]].globalindex)
504 continue;
505
506 localfield_globalindex[num_field_points] = part[part_sortindex[i]].globalindex;
507
508#ifndef FFT_COLUMN_BASED
509 int slab = part[part_sortindex[i]].globalindex / (GRID * GRID2);
510 int task = myplan.slab_to_task[slab];
511#else
512 int task, column = part[part_sortindex[i]].globalindex / (GRID2);
513
514 if(column < pivotcol)
515 task = column / avg;
516 else
517 task = (column - pivotcol) / (avg - 1) + tasklastsection;
518#endif
519
520 if(localfield_sendcount[task] == 0)
521 localfield_first[task] = num_field_points;
522
523 localfield_sendcount[task]++;
524 }
525 num_field_points++;
526
527 localfield_offset[0] = 0;
528 for(int i = 1; i < NTask; i++)
529 localfield_offset[i] = localfield_offset[i - 1] + localfield_sendcount[i - 1];
530
531 Mem.myfree_movable(part_sortindex);
532 part_sortindex = NULL;
533
534 /* now bin the local particle data onto the mesh list */
535 for(large_numpart_type i = 0; i < num_field_points; i++)
536 localfield_data[i] = 0;
537
538 for(large_numpart_type i = 0; i < num_on_grid; i += 8)
539 {
540 int pindex = (part[i].partindex >> 3);
541
542 MyIntPosType diff[3] = {P[pindex].IntPos[0] - Sp->ReferenceIntPos[grnr][0], P[pindex].IntPos[1] - Sp->ReferenceIntPos[grnr][1],
543 P[pindex].IntPos[2] - Sp->ReferenceIntPos[grnr][2]};
544
545 MySignedIntPosType *delta = (MySignedIntPosType *)diff;
546
547 double dx = to_slab_fac * (delta[0] - Sp->Corner[grnr][0]);
548 double dy = to_slab_fac * (delta[1] - Sp->Corner[grnr][1]);
549 double dz = to_slab_fac * (delta[2] - Sp->Corner[grnr][2]);
550
551 int slab_x = (int)(dx);
552 int slab_y = (int)(dy);
553 int slab_z = (int)(dz);
554
555 dx -= slab_x;
556 dy -= slab_y;
557 dz -= slab_z;
558
559 double weight = P[pindex].getMass();
560
561 localfield_data[part[i + 0].localindex] += weight * (1.0 - dx) * (1.0 - dy) * (1.0 - dz);
562 localfield_data[part[i + 1].localindex] += weight * (1.0 - dx) * (1.0 - dy) * dz;
563 localfield_data[part[i + 2].localindex] += weight * (1.0 - dx) * dy * (1.0 - dz);
564 localfield_data[part[i + 3].localindex] += weight * (1.0 - dx) * dy * dz;
565 localfield_data[part[i + 4].localindex] += weight * (dx) * (1.0 - dy) * (1.0 - dz);
566 localfield_data[part[i + 5].localindex] += weight * (dx) * (1.0 - dy) * dz;
567 localfield_data[part[i + 6].localindex] += weight * (dx)*dy * (1.0 - dz);
568 localfield_data[part[i + 7].localindex] += weight * (dx)*dy * dz;
569 }
570
571 rhogrid = (fft_real *)Mem.mymalloc("rhogrid", maxfftsize * sizeof(fft_real));
572
573 /* clear local FFT-mesh density field */
574 for(large_array_offset ii = 0; ii < maxfftsize; ii++)
575 rhogrid[ii] = 0;
576
577 /* exchange data and add contributions to the local mesh-path */
578 myMPI_Alltoall(localfield_sendcount, sizeof(size_t), MPI_BYTE, localfield_recvcount, sizeof(size_t), MPI_BYTE, Communicator);
579
580 for(int level = 0; level < (1 << PTask); level++) /* note: for level=0, target is the same task */
581 {
582 int recvTask = ThisTask ^ level;
583
584 if(recvTask < NTask)
585 {
586 if(level > 0)
587 {
588 import_data = (fft_real *)Mem.mymalloc("import_data", localfield_recvcount[recvTask] * sizeof(fft_real));
589 import_globalindex = (large_array_offset *)Mem.mymalloc("import_globalindex",
590 localfield_recvcount[recvTask] * sizeof(large_array_offset));
591
592 if(localfield_sendcount[recvTask] > 0 || localfield_recvcount[recvTask] > 0)
593 {
594 myMPI_Sendrecv(localfield_data + localfield_offset[recvTask], localfield_sendcount[recvTask] * sizeof(fft_real),
595 MPI_BYTE, recvTask, TAG_NONPERIOD_A, import_data, localfield_recvcount[recvTask] * sizeof(fft_real),
596 MPI_BYTE, recvTask, TAG_NONPERIOD_A, Communicator, &status);
597
598 myMPI_Sendrecv(localfield_globalindex + localfield_offset[recvTask],
599 localfield_sendcount[recvTask] * sizeof(large_array_offset), MPI_BYTE, recvTask, TAG_NONPERIOD_B,
600 import_globalindex, localfield_recvcount[recvTask] * sizeof(large_array_offset), MPI_BYTE, recvTask,
601 TAG_NONPERIOD_B, Communicator, &status);
602 }
603 }
604 else
605 {
606 import_data = localfield_data + localfield_offset[ThisTask];
607 import_globalindex = localfield_globalindex + localfield_offset[ThisTask];
608 }
609
610 /* note: here every element in rhogrid is only accessed once, so there should be no race condition */
611 for(large_numpart_type i = 0; i < localfield_recvcount[recvTask]; i++)
612 {
613 /* determine offset in local FFT slab */
614#ifndef FFT_COLUMN_BASED
615 large_array_offset offset =
616 import_globalindex[i] - myplan.first_slab_x_of_task[ThisTask] * GRID * ((large_array_offset)GRID2);
617#else
618 large_array_offset offset = import_globalindex[i] - myplan.firstcol_XY * ((large_array_offset)GRID2);
619#endif
620 rhogrid[offset] += import_data[i];
621 }
622
623 if(level > 0)
624 {
625 Mem.myfree(import_globalindex);
626 Mem.myfree(import_data);
627 }
628 }
629 }
630}
631
632/* Function to read out the force component corresponding to spatial dimension 'dim'.
633 * If dim is negative, potential values are read out and assigned to particles.
634 */
635void pm_nonperiodic::pmforce_nonperiodic_zoom_optimized_readout_forces_or_potential(int grnr, int dim)
636{
637#ifdef EVALPOTENTIAL
638 double fac = 1.0 / (Sp->FacIntToCoord * Sp->TotalMeshSize[grnr]) / pow(GRID, 3); /* to get potential */
639#endif
640
641 particle_data *P = Sp->P;
642
643 MPI_Status status;
644
645 fft_real *grid;
646
647 if(dim < 0)
648 grid = rhogrid;
649 else
650 grid = forcegrid;
651
652 double to_slab_fac = GRID / ((double)Sp->TotalMeshSize[grnr]);
653
654 for(int level = 0; level < (1 << PTask); level++) /* note: for level=0, target is the same task */
655 {
656 int recvTask = ThisTask ^ level;
657
658 if(recvTask < NTask)
659 {
660 if(level > 0)
661 {
662 import_data = (fft_real *)Mem.mymalloc("import_data", localfield_recvcount[recvTask] * sizeof(fft_real));
663 import_globalindex = (large_array_offset *)Mem.mymalloc("import_globalindex",
664 localfield_recvcount[recvTask] * sizeof(large_array_offset));
665
666 if(localfield_sendcount[recvTask] > 0 || localfield_recvcount[recvTask] > 0)
667 {
668 myMPI_Sendrecv(localfield_globalindex + localfield_offset[recvTask],
669 localfield_sendcount[recvTask] * sizeof(large_array_offset), MPI_BYTE, recvTask, TAG_NONPERIOD_C,
670 import_globalindex, localfield_recvcount[recvTask] * sizeof(large_array_offset), MPI_BYTE, recvTask,
671 TAG_NONPERIOD_C, Communicator, &status);
672 }
673 }
674 else
675 {
676 import_data = localfield_data + localfield_offset[ThisTask];
677 import_globalindex = localfield_globalindex + localfield_offset[ThisTask];
678 }
679
680 for(large_numpart_type i = 0; i < localfield_recvcount[recvTask]; i++)
681 {
682#ifndef FFT_COLUMN_BASED
683 large_array_offset offset =
684 import_globalindex[i] - myplan.first_slab_x_of_task[ThisTask] * GRID * ((large_array_offset)GRID2);
685#else
686 large_array_offset offset = import_globalindex[i] - myplan.firstcol_XY * ((large_array_offset)GRID2);
687#endif
688 import_data[i] = grid[offset];
689 }
690
691 if(level > 0)
692 {
693 myMPI_Sendrecv(import_data, localfield_recvcount[recvTask] * sizeof(fft_real), MPI_BYTE, recvTask, TAG_NONPERIOD_A,
694 localfield_data + localfield_offset[recvTask], localfield_sendcount[recvTask] * sizeof(fft_real),
695 MPI_BYTE, recvTask, TAG_NONPERIOD_A, Communicator, &status);
696
697 Mem.myfree(import_globalindex);
698 Mem.myfree(import_data);
699 }
700 }
701 }
702
703 /* read out the force/potential values, which all have been assembled in localfield_data */
704
705 int ngrid = (num_on_grid >> 3);
706
707 for(int k = 0; k < ngrid; k++)
708 {
709 large_numpart_type j = (((large_numpart_type)k) << 3);
710
711 int i = (part[j].partindex >> 3);
712
713#if !defined(HIERARCHICAL_GRAVITY) && defined(TREEPM_NOTIMESPLIT)
714 if(!Sp->TimeBinSynchronized[P[i].TimeBinGrav])
715 continue;
716#endif
717
718 MyIntPosType diff[3] = {P[i].IntPos[0] - Sp->ReferenceIntPos[grnr][0], P[i].IntPos[1] - Sp->ReferenceIntPos[grnr][1],
719 P[i].IntPos[2] - Sp->ReferenceIntPos[grnr][2]};
720
721 MySignedIntPosType *delta = (MySignedIntPosType *)diff;
722
723 double dx = to_slab_fac * (delta[0] - Sp->Corner[grnr][0]);
724 double dy = to_slab_fac * (delta[1] - Sp->Corner[grnr][1]);
725 double dz = to_slab_fac * (delta[2] - Sp->Corner[grnr][2]);
726
727 int slab_x = (int)(dx);
728 int slab_y = (int)(dy);
729 int slab_z = (int)(dz);
730
731 dx -= slab_x;
732 dy -= slab_y;
733 dz -= slab_z;
734
735 double value = +localfield_data[part[j + 0].localindex] * (1.0 - dx) * (1.0 - dy) * (1.0 - dz) +
736 localfield_data[part[j + 1].localindex] * (1.0 - dx) * (1.0 - dy) * dz +
737 localfield_data[part[j + 2].localindex] * (1.0 - dx) * dy * (1.0 - dz) +
738 localfield_data[part[j + 3].localindex] * (1.0 - dx) * dy * dz +
739 localfield_data[part[j + 4].localindex] * (dx) * (1.0 - dy) * (1.0 - dz) +
740 localfield_data[part[j + 5].localindex] * (dx) * (1.0 - dy) * dz +
741 localfield_data[part[j + 6].localindex] * (dx)*dy * (1.0 - dz) +
742 localfield_data[part[j + 7].localindex] * (dx)*dy * dz;
743
744 if(dim < 0)
745 {
746#ifdef EVALPOTENTIAL
747 P[i].Potential += value * fac;
748#endif
749 }
750 else
751 {
752 Sp->P[i].GravAccel[dim] += value;
753 }
754 }
755}
756
757#else
758
759void pm_nonperiodic::pmforce_nonperiodic_uniform_optimized_prepare_density(int grnr)
760{
761 double to_slab_fac = GRID / ((double)Sp->TotalMeshSize[grnr]);
762
763 Sndpm_count = (size_t *)Mem.mymalloc("Sndpm_count", NTask * sizeof(size_t));
764 Sndpm_offset = (size_t *)Mem.mymalloc("Sndpm_offset", NTask * sizeof(size_t));
765 Rcvpm_count = (size_t *)Mem.mymalloc("Rcvpm_count", NTask * sizeof(size_t));
766 Rcvpm_offset = (size_t *)Mem.mymalloc("Rcvpm_offset", NTask * sizeof(size_t));
767
768#ifdef FFT_COLUMN_BASED
769 int columns = GRIDX * GRIDY;
770 int avg = (columns - 1) / NTask + 1;
771 int exc = NTask * avg - columns;
772 int tasklastsection = NTask - exc;
773 int pivotcol = tasklastsection * avg;
774#endif
775
776 /* determine the slabs/columns each particles accesses */
777 {
778 size_t *send_count = Sndpm_count;
779
780 for(int j = 0; j < NTask; j++)
781 send_count[j] = 0;
782
783 for(int idx = 0; idx < NSource; idx++)
784 {
785 int i = Sp->get_active_index(idx);
786
787 if(Sp->P[i].Ti_Current != All.Ti_Current)
788 Sp->drift_particle(&Sp->P[i], &Sp->SphP[i], All.Ti_Current);
789
790 MyIntPosType diff[3] = {Sp->P[i].IntPos[0] - Sp->ReferenceIntPos[grnr][0], Sp->P[i].IntPos[1] - Sp->ReferenceIntPos[grnr][1],
791 Sp->P[i].IntPos[2] - Sp->ReferenceIntPos[grnr][2]};
792
793 MySignedIntPosType *delta = (MySignedIntPosType *)diff;
794
795 if(delta[0] < Sp->Xmintot[grnr][0] || delta[0] >= Sp->Xmaxtot[grnr][0])
796 continue;
797
798 if(delta[1] < Sp->Xmintot[grnr][1] || delta[1] >= Sp->Xmaxtot[grnr][1])
799 continue;
800
801 if(delta[2] < Sp->Xmintot[grnr][2] || delta[2] >= Sp->Xmaxtot[grnr][2])
802 continue;
803
804 int slab_x = (int)(to_slab_fac * (delta[0] - Sp->Corner[grnr][0]));
805 int slab_xx = slab_x + 1;
806
807#ifndef FFT_COLUMN_BASED
808 int task0 = myplan.slab_to_task[slab_x];
809 int task1 = myplan.slab_to_task[slab_xx];
810
811 send_count[task0]++;
812 if(task0 != task1)
813 send_count[task1]++;
814#else
815 int slab_y = (int)(to_slab_fac * (delta[1] - Sp->Corner[grnr][1]));
816 int slab_yy = slab_y + 1;
817
818 int column0 = slab_x * GRID + slab_y;
819 int column1 = slab_x * GRID + slab_yy;
820 int column2 = slab_xx * GRID + slab_y;
821 int column3 = slab_xx * GRID + slab_yy;
822
823 int task0, task1, task2, task3;
824
825 if(column0 < pivotcol)
826 task0 = column0 / avg;
827 else
828 task0 = (column0 - pivotcol) / (avg - 1) + tasklastsection;
829
830 if(column1 < pivotcol)
831 task1 = column1 / avg;
832 else
833 task1 = (column1 - pivotcol) / (avg - 1) + tasklastsection;
834
835 if(column2 < pivotcol)
836 task2 = column2 / avg;
837 else
838 task2 = (column2 - pivotcol) / (avg - 1) + tasklastsection;
839
840 if(column3 < pivotcol)
841 task3 = column3 / avg;
842 else
843 task3 = (column3 - pivotcol) / (avg - 1) + tasklastsection;
844
845 send_count[task0]++;
846 if(task1 != task0)
847 send_count[task1]++;
848 if(task2 != task1 && task2 != task0)
849 send_count[task2]++;
850 if(task3 != task0 && task3 != task1 && task3 != task2)
851 send_count[task3]++;
852#endif
853 }
854 }
855
856 /* collect thread-specific offset table and collect the results from the other threads */
857 Sndpm_offset[0] = 0;
858 for(int i = 1; i < NTask; i++)
859 {
860 int ind = i;
861 int ind_prev = i - 1;
862
863 Sndpm_offset[ind] = Sndpm_offset[ind_prev] + Sndpm_count[ind_prev];
864 }
865
866 myMPI_Alltoall(Sndpm_count, sizeof(size_t), MPI_BYTE, Rcvpm_count, sizeof(size_t), MPI_BYTE, Communicator);
867
868 nimport = 0, nexport = 0, Rcvpm_offset[0] = 0, Sndpm_offset[0] = 0;
869 for(int j = 0; j < NTask; j++)
870 {
871 nexport += Sndpm_count[j];
872 nimport += Rcvpm_count[j];
873
874 if(j > 0)
875 {
876 Sndpm_offset[j] = Sndpm_offset[j - 1] + Sndpm_count[j - 1];
877 Rcvpm_offset[j] = Rcvpm_offset[j - 1] + Rcvpm_count[j - 1];
878 }
879 }
880
881 /* allocate import and export buffer */
882 partin = (partbuf *)Mem.mymalloc("partin", nimport * sizeof(partbuf));
883 partout = (partbuf *)Mem.mymalloc("partout", nexport * sizeof(partbuf));
884
885 {
886 size_t *send_count = Sndpm_count;
887 size_t *send_offset = Sndpm_offset;
888
889 for(int j = 0; j < NTask; j++)
890 send_count[j] = 0;
891
892 /* fill export buffer */
893 for(int idx = 0; idx < NSource; idx++)
894 {
895 int i = Sp->get_active_index(idx);
896
897 MyIntPosType diff[3] = {Sp->P[i].IntPos[0] - Sp->ReferenceIntPos[grnr][0], Sp->P[i].IntPos[1] - Sp->ReferenceIntPos[grnr][1],
898 Sp->P[i].IntPos[2] - Sp->ReferenceIntPos[grnr][2]};
899
900 MySignedIntPosType *delta = (MySignedIntPosType *)diff;
901
902 if(delta[0] < Sp->Xmintot[grnr][0] || delta[0] >= Sp->Xmaxtot[grnr][0])
903 continue;
904
905 if(delta[1] < Sp->Xmintot[grnr][1] || delta[1] >= Sp->Xmaxtot[grnr][1])
906 continue;
907
908 if(delta[2] < Sp->Xmintot[grnr][2] || delta[2] >= Sp->Xmaxtot[grnr][2])
909 continue;
910
911 int slab_x = (int)(to_slab_fac * (delta[0] - Sp->Corner[grnr][0]));
912 int slab_xx = slab_x + 1;
913
914#ifndef FFT_COLUMN_BASED
915 int task0 = myplan.slab_to_task[slab_x];
916 int task1 = myplan.slab_to_task[slab_xx];
917
918 size_t ind0 = send_offset[task0] + send_count[task0]++;
919 partout[ind0].Mass = Sp->P[i].getMass();
920 for(int j = 0; j < 3; j++)
921 partout[ind0].IntPos[j] = Sp->P[i].IntPos[j];
922
923 if(task0 != task1)
924 {
925 size_t ind1 = send_offset[task1] + send_count[task1]++;
926 partout[ind1].Mass = Sp->P[i].getMass();
927 for(int j = 0; j < 3; j++)
928 partout[ind1].IntPos[j] = Sp->P[i].IntPos[j];
929 }
930#else
931 int slab_y = (int)(to_slab_fac * (delta[1] - Sp->Corner[grnr][1]));
932 int slab_yy = slab_y + 1;
933
934 int column0 = slab_x * GRID + slab_y;
935 int column1 = slab_x * GRID + slab_yy;
936 int column2 = slab_xx * GRID + slab_y;
937 int column3 = slab_xx * GRID + slab_yy;
938
939 int task0, task1, task2, task3;
940
941 if(column0 < pivotcol)
942 task0 = column0 / avg;
943 else
944 task0 = (column0 - pivotcol) / (avg - 1) + tasklastsection;
945
946 if(column1 < pivotcol)
947 task1 = column1 / avg;
948 else
949 task1 = (column1 - pivotcol) / (avg - 1) + tasklastsection;
950
951 if(column2 < pivotcol)
952 task2 = column2 / avg;
953 else
954 task2 = (column2 - pivotcol) / (avg - 1) + tasklastsection;
955
956 if(column3 < pivotcol)
957 task3 = column3 / avg;
958 else
959 task3 = (column3 - pivotcol) / (avg - 1) + tasklastsection;
960
961 size_t ind0 = send_offset[task0] + send_count[task0]++;
962 partout[ind0].Mass = Sp->P[i].getMass();
963 for(int j = 0; j < 3; j++)
964 partout[ind0].IntPos[j] = Sp->P[i].IntPos[j];
965
966 if(task1 != task0)
967 {
968 size_t ind1 = send_offset[task1] + send_count[task1]++;
969 partout[ind1].Mass = Sp->P[i].getMass();
970 for(int j = 0; j < 3; j++)
971 partout[ind1].IntPos[j] = Sp->P[i].IntPos[j];
972 }
973 if(task2 != task1 && task2 != task0)
974 {
975 size_t ind2 = send_offset[task2] + send_count[task2]++;
976 partout[ind2].Mass = Sp->P[i].getMass();
977 for(int j = 0; j < 3; j++)
978 partout[ind2].IntPos[j] = Sp->P[i].IntPos[j];
979 }
980 if(task3 != task0 && task3 != task1 && task3 != task2)
981 {
982 size_t ind3 = send_offset[task3] + send_count[task3]++;
983 partout[ind3].Mass = Sp->P[i].getMass();
984 for(int j = 0; j < 3; j++)
985 partout[ind3].IntPos[j] = Sp->P[i].IntPos[j];
986 }
987#endif
988 }
989 }
990
991 int flag_big = 0, flag_big_all;
992 for(int i = 0; i < NTask; i++)
993 if(Sndpm_count[i] * sizeof(partbuf) > MPI_MESSAGE_SIZELIMIT_IN_BYTES)
994 flag_big = 1;
995
996 /* produce a flag if any of the send sizes is above our transfer limit, in this case we will
997 * transfer the data in chunks.
998 */
999 MPI_Allreduce(&flag_big, &flag_big_all, 1, MPI_INT, MPI_MAX, Communicator);
1000
1001 /* exchange particle data */
1002 myMPI_Alltoallv(partout, Sndpm_count, Sndpm_offset, partin, Rcvpm_count, Rcvpm_offset, sizeof(partbuf), flag_big_all, Communicator);
1003
1004 Mem.myfree(partout);
1005
1006 /* allocate density field */
1007 rhogrid = (fft_real *)Mem.mymalloc("rhogrid", maxfftsize * sizeof(fft_real));
1008
1009 /* clear local FFT-mesh density field */
1010 for(size_t ii = 0; ii < maxfftsize; ii++)
1011 rhogrid[ii] = 0;
1012
1013#ifndef FFT_COLUMN_BASED
1014 /* bin particle data onto mesh, in multi-threaded fashion */
1015 {
1016 int first_y, count_y;
1017 subdivide_evenly(GRID, 1, 0, &first_y, &count_y);
1018 int last_y = first_y + count_y - 1;
1019
1020 for(size_t i = 0; i < nimport; i++)
1021 {
1022 MyIntPosType diff[3] = {partin[i].IntPos[0] - Sp->ReferenceIntPos[grnr][0], partin[i].IntPos[1] - Sp->ReferenceIntPos[grnr][1],
1023 partin[i].IntPos[2] - Sp->ReferenceIntPos[grnr][2]};
1024
1025 MySignedIntPosType *delta = (MySignedIntPosType *)diff;
1026
1027 double dy = to_slab_fac * (delta[1] - Sp->Corner[grnr][1]);
1028 int slab_y = (int)(dy);
1029 dy -= slab_y;
1030
1031 int slab_yy = slab_y + 1;
1032 int flag_slab_y, flag_slab_yy;
1033
1034 if(slab_y >= first_y && slab_y <= last_y)
1035 flag_slab_y = 1;
1036 else
1037 flag_slab_y = 0;
1038
1039 if(slab_yy >= first_y && slab_yy <= last_y)
1040 flag_slab_yy = 1;
1041 else
1042 flag_slab_yy = 0;
1043
1044 if(flag_slab_y || flag_slab_yy)
1045 {
1046 double mass = partin[i].Mass;
1047
1048 double dx = to_slab_fac * (delta[0] - Sp->Corner[grnr][0]);
1049 double dz = to_slab_fac * (delta[2] - Sp->Corner[grnr][2]);
1050 int slab_x = (int)(dx);
1051 int slab_z = (int)(dz);
1052 dx -= slab_x;
1053 dz -= slab_z;
1054
1055 int slab_xx = slab_x + 1;
1056 int slab_zz = slab_z + 1;
1057
1058 int flag_slab_x, flag_slab_xx;
1059
1060 if(myplan.slab_to_task[slab_x] == ThisTask)
1061 {
1062 slab_x -= myplan.first_slab_x_of_task[ThisTask];
1063 flag_slab_x = 1;
1064 }
1065 else
1066 flag_slab_x = 0;
1067
1068 if(myplan.slab_to_task[slab_xx] == ThisTask)
1069 {
1070 slab_xx -= myplan.first_slab_x_of_task[ThisTask];
1071 flag_slab_xx = 1;
1072 }
1073 else
1074 flag_slab_xx = 0;
1075
1076 if(flag_slab_x)
1077 {
1078 if(flag_slab_y)
1079 {
1080 rhogrid[FI(slab_x, slab_y, slab_z)] += (mass * (1.0 - dx) * (1.0 - dy) * (1.0 - dz));
1081 rhogrid[FI(slab_x, slab_y, slab_zz)] += (mass * (1.0 - dx) * (1.0 - dy) * (dz));
1082 }
1083
1084 if(flag_slab_yy)
1085 {
1086 rhogrid[FI(slab_x, slab_yy, slab_z)] += (mass * (1.0 - dx) * (dy) * (1.0 - dz));
1087 rhogrid[FI(slab_x, slab_yy, slab_zz)] += (mass * (1.0 - dx) * (dy) * (dz));
1088 }
1089 }
1090
1091 if(flag_slab_xx)
1092 {
1093 if(flag_slab_y)
1094 {
1095 rhogrid[FI(slab_xx, slab_y, slab_z)] += (mass * (dx) * (1.0 - dy) * (1.0 - dz));
1096 rhogrid[FI(slab_xx, slab_y, slab_zz)] += (mass * (dx) * (1.0 - dy) * (dz));
1097 }
1098
1099 if(flag_slab_yy)
1100 {
1101 rhogrid[FI(slab_xx, slab_yy, slab_z)] += (mass * (dx) * (dy) * (1.0 - dz));
1102 rhogrid[FI(slab_xx, slab_yy, slab_zz)] += (mass * (dx) * (dy) * (dz));
1103 }
1104 }
1105 }
1106 }
1107 }
1108
1109#else
1110
1111 struct data_cols
1112 {
1113 int col0, col1, col2, col3;
1114 double dx, dy;
1115 };
1116
1117 data_cols *aux = (data_cols *)Mem.mymalloc("aux", nimport * sizeof(data_cols));
1118
1119 for(int i = 0; i < nimport; i++)
1120 {
1121 MyIntPosType diff[3] = {partin[i].IntPos[0] - Sp->ReferenceIntPos[grnr][0], partin[i].IntPos[1] - Sp->ReferenceIntPos[grnr][1],
1122 partin[i].IntPos[2] - Sp->ReferenceIntPos[grnr][2]};
1123
1124 MySignedIntPosType *delta = (MySignedIntPosType *)diff;
1125
1126 aux[i].dx = to_slab_fac * (delta[0] - Sp->Corner[grnr][0]);
1127 aux[i].dy = to_slab_fac * (delta[1] - Sp->Corner[grnr][1]);
1128
1129 int slab_x = (int)(aux[i].dx);
1130 int slab_y = (int)(aux[i].dy);
1131
1132 aux[i].dx -= slab_x;
1133 aux[i].dy -= slab_y;
1134
1135 int slab_xx = slab_x + 1;
1136 int slab_yy = slab_y + 1;
1137
1138 aux[i].col0 = slab_x * GRID + slab_y;
1139 aux[i].col1 = slab_x * GRID + slab_yy;
1140 aux[i].col2 = slab_xx * GRID + slab_y;
1141 aux[i].col3 = slab_xx * GRID + slab_yy;
1142 }
1143
1144 {
1145 int first_col, last_col, count_col;
1146 subdivide_evenly(myplan.ncol_XY, 1, 0, &first_col, &count_col);
1147 last_col = first_col + count_col - 1;
1148 first_col += myplan.firstcol_XY;
1149 last_col += myplan.firstcol_XY;
1150
1151 for(int i = 0; i < nimport; i++)
1152 {
1153 int flag0, flag1, flag2, flag3;
1154 int col0 = aux[i].col0;
1155 int col1 = aux[i].col1;
1156 int col2 = aux[i].col2;
1157 int col3 = aux[i].col3;
1158
1159 if(col0 >= first_col && col0 <= last_col)
1160 flag0 = 1;
1161 else
1162 flag0 = 0;
1163
1164 if(col1 >= first_col && col1 <= last_col)
1165 flag1 = 1;
1166 else
1167 flag1 = 0;
1168
1169 if(col2 >= first_col && col2 <= last_col)
1170 flag2 = 1;
1171 else
1172 flag2 = 0;
1173
1174 if(col3 >= first_col && col3 <= last_col)
1175 flag3 = 1;
1176 else
1177 flag3 = 0;
1178
1179 if(flag0 || flag1 || flag2 || flag3)
1180 {
1181 double mass = partin[i].Mass;
1182
1183 double dx = aux[i].dx;
1184 double dy = aux[i].dy;
1185
1186 MySignedIntPosType deltaz = (MySignedIntPosType)(partin[i].IntPos[2] - Sp->ReferenceIntPos[grnr][2]);
1187
1188 double dz = to_slab_fac * (deltaz - Sp->Corner[grnr][2]);
1189 int slab_z = (int)(dz);
1190 dz -= slab_z;
1191 int slab_zz = slab_z + 1;
1192
1193 if(flag0)
1194 {
1195 rhogrid[FC(col0, slab_z)] += (mass * (1.0 - dx) * (1.0 - dy) * (1.0 - dz));
1196 rhogrid[FC(col0, slab_zz)] += (mass * (1.0 - dx) * (1.0 - dy) * (dz));
1197 }
1198
1199 if(flag1)
1200 {
1201 rhogrid[FC(col1, slab_z)] += (mass * (1.0 - dx) * (dy) * (1.0 - dz));
1202 rhogrid[FC(col1, slab_zz)] += (mass * (1.0 - dx) * (dy) * (dz));
1203 }
1204
1205 if(flag2)
1206 {
1207 rhogrid[FC(col2, slab_z)] += (mass * (dx) * (1.0 - dy) * (1.0 - dz));
1208 rhogrid[FC(col2, slab_zz)] += (mass * (dx) * (1.0 - dy) * (dz));
1209 }
1210
1211 if(flag3)
1212 {
1213 rhogrid[FC(col3, slab_z)] += (mass * (dx) * (dy) * (1.0 - dz));
1214 rhogrid[FC(col3, slab_zz)] += (mass * (dx) * (dy) * (dz));
1215 }
1216 }
1217 }
1218 }
1219
1220 Mem.myfree(aux);
1221
1222#endif
1223}
1224
1225/* If dim<0, this function reads out the potential, otherwise Cartesian force components.
1226 */
1227void pm_nonperiodic::pmforce_nonperiodic_uniform_optimized_readout_forces_or_potential(int grnr, int dim)
1228{
1229#ifdef EVALPOTENTIAL
1230 double fac = 1.0 / (Sp->FacIntToCoord * Sp->TotalMeshSize[grnr]) / pow(GRID, 3); /* to get potential */
1231#endif
1232
1233 double to_slab_fac = GRID / ((double)Sp->TotalMeshSize[grnr]);
1234
1235 double *flistin = (double *)Mem.mymalloc("flistin", nimport * sizeof(double));
1236 double *flistout = (double *)Mem.mymalloc("flistout", nexport * sizeof(double));
1237
1238 fft_real *grid;
1239
1240 if(dim < 0)
1241 grid = rhogrid;
1242 else
1243 grid = forcegrid;
1244
1245#ifdef FFT_COLUMN_BASED
1246 int columns = GRIDX * GRIDY;
1247 int avg = (columns - 1) / NTask + 1;
1248 int exc = NTask * avg - columns;
1249 int tasklastsection = NTask - exc;
1250 int pivotcol = tasklastsection * avg;
1251#endif
1252
1253 for(size_t i = 0; i < nimport; i++)
1254 {
1255 flistin[i] = 0;
1256
1257 MyIntPosType diff[3] = {partin[i].IntPos[0] - Sp->ReferenceIntPos[grnr][0], partin[i].IntPos[1] - Sp->ReferenceIntPos[grnr][1],
1258 partin[i].IntPos[2] - Sp->ReferenceIntPos[grnr][2]};
1259
1260 MySignedIntPosType *delta = (MySignedIntPosType *)diff;
1261
1262 double dx = to_slab_fac * (delta[0] - Sp->Corner[grnr][0]);
1263 double dy = to_slab_fac * (delta[1] - Sp->Corner[grnr][1]);
1264 double dz = to_slab_fac * (delta[2] - Sp->Corner[grnr][2]);
1265
1266 int slab_x = (int)(dx);
1267 int slab_y = (int)(dy);
1268 int slab_z = (int)(dz);
1269
1270 dx -= slab_x;
1271 dy -= slab_y;
1272 dz -= slab_z;
1273
1274 int slab_xx = slab_x + 1;
1275 int slab_yy = slab_y + 1;
1276 int slab_zz = slab_z + 1;
1277
1278#ifndef FFT_COLUMN_BASED
1279 if(myplan.slab_to_task[slab_x] == ThisTask)
1280 {
1281 slab_x -= myplan.first_slab_x_of_task[ThisTask];
1282
1283 flistin[i] += +grid[FI(slab_x, slab_y, slab_z)] * (1.0 - dx) * (1.0 - dy) * (1.0 - dz) +
1284 grid[FI(slab_x, slab_y, slab_zz)] * (1.0 - dx) * (1.0 - dy) * (dz) +
1285 grid[FI(slab_x, slab_yy, slab_z)] * (1.0 - dx) * (dy) * (1.0 - dz) +
1286 grid[FI(slab_x, slab_yy, slab_zz)] * (1.0 - dx) * (dy) * (dz);
1287 }
1288
1289 if(myplan.slab_to_task[slab_xx] == ThisTask)
1290 {
1291 slab_xx -= myplan.first_slab_x_of_task[ThisTask];
1292
1293 flistin[i] += +grid[FI(slab_xx, slab_y, slab_z)] * (dx) * (1.0 - dy) * (1.0 - dz) +
1294 grid[FI(slab_xx, slab_y, slab_zz)] * (dx) * (1.0 - dy) * (dz) +
1295 grid[FI(slab_xx, slab_yy, slab_z)] * (dx) * (dy) * (1.0 - dz) +
1296 grid[FI(slab_xx, slab_yy, slab_zz)] * (dx) * (dy) * (dz);
1297 }
1298#else
1299 int column0 = slab_x * GRID + slab_y;
1300 int column1 = slab_x * GRID + slab_yy;
1301 int column2 = slab_xx * GRID + slab_y;
1302 int column3 = slab_xx * GRID + slab_yy;
1303
1304 if(column0 >= myplan.firstcol_XY && column0 <= myplan.lastcol_XY)
1305 {
1306 flistin[i] += +grid[FC(column0, slab_z)] * (1.0 - dx) * (1.0 - dy) * (1.0 - dz) +
1307 grid[FC(column0, slab_zz)] * (1.0 - dx) * (1.0 - dy) * (dz);
1308 }
1309 if(column1 >= myplan.firstcol_XY && column1 <= myplan.lastcol_XY)
1310 {
1311 flistin[i] +=
1312 +grid[FC(column1, slab_z)] * (1.0 - dx) * (dy) * (1.0 - dz) + grid[FC(column1, slab_zz)] * (1.0 - dx) * (dy) * (dz);
1313 }
1314
1315 if(column2 >= myplan.firstcol_XY && column2 <= myplan.lastcol_XY)
1316 {
1317 flistin[i] +=
1318 +grid[FC(column2, slab_z)] * (dx) * (1.0 - dy) * (1.0 - dz) + grid[FC(column2, slab_zz)] * (dx) * (1.0 - dy) * (dz);
1319 }
1320
1321 if(column3 >= myplan.firstcol_XY && column3 <= myplan.lastcol_XY)
1322 {
1323 flistin[i] += +grid[FC(column3, slab_z)] * (dx) * (dy) * (1.0 - dz) + grid[FC(column3, slab_zz)] * (dx) * (dy) * (dz);
1324 }
1325#endif
1326 }
1327
1328 /* exchange the potential component data */
1329 int flag_big = 0, flag_big_all;
1330 for(int i = 0; i < NTask; i++)
1331 if(Sndpm_count[i] * sizeof(double) > MPI_MESSAGE_SIZELIMIT_IN_BYTES)
1332 flag_big = 1;
1333
1334 /* produce a flag if any of the send sizes is above our transfer limit, in this case we will
1335 * transfer the data in chunks.
1336 */
1337 MPI_Allreduce(&flag_big, &flag_big_all, 1, MPI_INT, MPI_MAX, Communicator);
1338
1339 /* exchange data */
1340 myMPI_Alltoallv(flistin, Rcvpm_count, Rcvpm_offset, flistout, Sndpm_count, Sndpm_offset, sizeof(double), flag_big_all, Communicator);
1341
1342 /* now assign them to the correct particles */
1343
1344 size_t *send_count = Sndpm_count;
1345 size_t *send_offset = Sndpm_offset;
1346
1347 for(int j = 0; j < NTask; j++)
1348 send_count[j] = 0;
1349
1350 for(int idx = 0; idx < NSource; idx++)
1351 {
1352 int i = Sp->get_active_index(idx);
1353
1354 MyIntPosType diff[3] = {Sp->P[i].IntPos[0] - Sp->ReferenceIntPos[grnr][0], Sp->P[i].IntPos[1] - Sp->ReferenceIntPos[grnr][1],
1355 Sp->P[i].IntPos[2] - Sp->ReferenceIntPos[grnr][2]};
1356
1357 MySignedIntPosType *delta = (MySignedIntPosType *)diff;
1358
1359 if(delta[0] < Sp->Xmintot[grnr][0] || delta[0] >= Sp->Xmaxtot[grnr][0])
1360 continue;
1361
1362 if(delta[1] < Sp->Xmintot[grnr][1] || delta[1] >= Sp->Xmaxtot[grnr][1])
1363 continue;
1364
1365 if(delta[2] < Sp->Xmintot[grnr][2] || delta[2] >= Sp->Xmaxtot[grnr][2])
1366 continue;
1367
1368 int slab_x = (int)(to_slab_fac * (delta[0] - Sp->Corner[grnr][0]));
1369 int slab_xx = slab_x + 1;
1370
1371#ifndef FFT_COLUMN_BASED
1372 int task0 = myplan.slab_to_task[slab_x];
1373 int task1 = myplan.slab_to_task[slab_xx];
1374
1375 double value = flistout[send_offset[task0] + send_count[task0]++];
1376
1377 if(task0 != task1)
1378 value += flistout[send_offset[task1] + send_count[task1]++];
1379#else
1380 int slab_y = (int)(to_slab_fac * (delta[1] - Sp->Corner[grnr][1]));
1381 int slab_yy = slab_y + 1;
1382
1383 int column0 = slab_x * GRID + slab_y;
1384 int column1 = slab_x * GRID + slab_yy;
1385 int column2 = slab_xx * GRID + slab_y;
1386 int column3 = slab_xx * GRID + slab_yy;
1387
1388 int task0, task1, task2, task3;
1389
1390 if(column0 < pivotcol)
1391 task0 = column0 / avg;
1392 else
1393 task0 = (column0 - pivotcol) / (avg - 1) + tasklastsection;
1394
1395 if(column1 < pivotcol)
1396 task1 = column1 / avg;
1397 else
1398 task1 = (column1 - pivotcol) / (avg - 1) + tasklastsection;
1399
1400 if(column2 < pivotcol)
1401 task2 = column2 / avg;
1402 else
1403 task2 = (column2 - pivotcol) / (avg - 1) + tasklastsection;
1404
1405 if(column3 < pivotcol)
1406 task3 = column3 / avg;
1407 else
1408 task3 = (column3 - pivotcol) / (avg - 1) + tasklastsection;
1409
1410 double value = flistout[send_offset[task0] + send_count[task0]++];
1411
1412 if(task1 != task0)
1413 value += flistout[send_offset[task1] + send_count[task1]++];
1414
1415 if(task2 != task1 && task2 != task0)
1416 value += flistout[send_offset[task2] + send_count[task2]++];
1417
1418 if(task3 != task0 && task3 != task1 && task3 != task2)
1419 value += flistout[send_offset[task3] + send_count[task3]++];
1420#endif
1421
1422#if !defined(HIERARCHICAL_GRAVITY) && defined(TREEPM_NOTIMESPLIT)
1423 if(!Sp->TimeBinSynchronized[Sp->P[i].TimeBinGrav])
1424 continue;
1425#endif
1426
1427 if(dim < 0)
1428 {
1429#ifdef EVALPOTENTIAL
1430 Sp->P[i].Potential += value * fac;
1431#endif
1432 }
1433 else
1434 {
1435 Sp->P[i].GravAccel[dim] += value;
1436 }
1437 }
1438
1439 Mem.myfree(flistout);
1440 Mem.myfree(flistin);
1441}
1442#endif
1443
1451int pm_nonperiodic::pmforce_nonperiodic(int grnr)
1452{
1453 double tstart = Logs.second();
1454
1455 mpi_printf("PM-NONPERIODIC: Starting non-periodic PM calculation (Rcut=%g, grid=%d) presently allocated=%g MB.\n", Sp->Rcut[grnr],
1456 grnr, Mem.getAllocatedBytesInMB());
1457
1458 if(grnr == 1 && Sp->Rcut[1] > Sp->Rcut[0])
1459 Terminate(
1460 "We have Sp->Rcut[1]=%g > Sp->Rcut[0]=%g, which means that the high-res cut-off is larger than the normal one... this is "
1461 "not good.",
1462 Sp->Rcut[1], Sp->Rcut[0]);
1463
1464#ifdef HIERARCHICAL_GRAVITY
1465 NSource = Sp->TimeBinsGravity.NActiveParticles;
1466#else
1467 NSource = Sp->NumPart;
1468#endif
1469
1470#ifndef TREEPM_NOTIMESPLIT
1471 if(NSource != Sp->NumPart)
1472 Terminate("unexpected NSource != Sp->NumPart");
1473#endif
1474
1475#ifndef NUMPART_PER_TASK_LARGE
1476 if((((long long)Sp->NumPart) << 3) >= (((long long)1) << 31))
1477 Terminate("We are dealing with a too large particle number per MPI rank - enabling NUMPART_PER_TASK_LARGE might help.");
1478#endif
1479
1480 double fac = 1.0 / (Sp->FacIntToCoord * Sp->TotalMeshSize[grnr]) / pow(GRID, 3); /* to get potential */
1481 fac *= 1 / (2 * (Sp->FacIntToCoord * Sp->TotalMeshSize[grnr]) / GRID); /* for finite differencing */
1482
1483#ifdef PM_ZOOM_OPTIMIZED
1484 pmforce_nonperiodic_zoom_optimized_prepare_density(grnr);
1485#else
1486 pmforce_nonperiodic_uniform_optimized_prepare_density(grnr);
1487#endif
1488
1489 /* allocate the memory to hold the FFT fields */
1490 forcegrid = (fft_real *)Mem.mymalloc("forcegrid", maxfftsize * sizeof(fft_real));
1491
1492 workspace = forcegrid;
1493
1494#ifndef FFT_COLUMN_BASED
1495 fft_of_rhogrid = (fft_complex *)&rhogrid[0];
1496#else
1497 fft_of_rhogrid = (fft_complex *)&workspace[0];
1498#endif
1499
1500 /* Do the FFT of the density field */
1501#ifndef FFT_COLUMN_BASED
1502 my_slab_based_fft(&myplan, &rhogrid[0], &workspace[0], 1);
1503#else
1504 my_column_based_fft(&myplan, rhogrid, workspace, 1); /* result is in workspace, not in rhogrid ! */
1505#endif
1506
1507 /* multiply with kernel in Fourier space */
1508 /* multiply with the Fourier transform of the Green's function (kernel) */
1509
1510 /* multiply with Green's function in order to obtain the potential */
1511
1512#ifdef FFT_COLUMN_BASED
1513 for(large_array_offset ip = 0; ip < myplan.second_transposed_ncells; ip++)
1514 {
1515#else
1516 for(int x = 0; x < GRID; x++)
1517 for(int y = myplan.slabstart_y; y < myplan.slabstart_y + myplan.nslab_y; y++)
1518 for(int z = 0; z < GRIDz; z++)
1519 {
1520#endif
1521
1522#ifndef FFT_COLUMN_BASED
1523 large_array_offset ip = ((large_array_offset)GRIDz) * (GRID * (y - myplan.slabstart_y) + x) + z;
1524#endif
1525
1526 double re = fft_of_rhogrid[ip][0] * fft_of_kernel[grnr][ip][0] - fft_of_rhogrid[ip][1] * fft_of_kernel[grnr][ip][1];
1527 double im = fft_of_rhogrid[ip][0] * fft_of_kernel[grnr][ip][1] + fft_of_rhogrid[ip][1] * fft_of_kernel[grnr][ip][0];
1528
1529 fft_of_rhogrid[ip][0] = re;
1530 fft_of_rhogrid[ip][1] = im;
1531 }
1532
1533 /* Do the inverse FFT to get the potential */
1534
1535#ifndef FFT_COLUMN_BASED
1536 my_slab_based_fft(&myplan, rhogrid, workspace, -1);
1537#else
1538 my_column_based_fft(&myplan, workspace, rhogrid, -1);
1539#endif
1540
1541 /* Now rhogrid holds the potential */
1542
1543#ifdef EVALPOTENTIAL
1544#ifdef PM_ZOOM_OPTIMIZED
1545 pmforce_nonperiodic_zoom_optimized_readout_forces_or_potential(grnr, -1);
1546#else
1547 pmforce_nonperiodic_uniform_optimized_readout_forces_or_potential(grnr, -1);
1548#endif
1549#endif
1550
1551 /* get the force components by finite differencing of the potential for each dimension,
1552 * and send the results back to the right CPUs
1553 */
1554 for(int dim = 2; dim >= 0; dim--) /* Calculate each component of the force. */
1555 {
1556 /* we do the x component last, because for differencing the potential in the x-direction, we need to construct the transpose */
1557#ifndef FFT_COLUMN_BASED
1558 if(dim == 0)
1559 my_slab_transposeA(&myplan, rhogrid, forcegrid); /* compute the transpose of the potential field for finite differencing */
1560
1561 for(int y = 2; y < GRID / 2 - 2; y++)
1562 for(int x = 0; x < myplan.nslab_x; x++)
1563 if(x + myplan.slabstart_x >= 2 && x + myplan.slabstart_x < GRID / 2 - 2)
1564 for(int z = 2; z < GRID / 2 - 2; z++)
1565 {
1566 int yrr = y, yll = y, yr = y, yl = y;
1567 int zrr = z, zll = z, zr = z, zl = z;
1568
1569 switch(dim)
1570 {
1571 case 0: /* note: for the x-direction, we difference the transposed direction (y) */
1572 case 1:
1573 yr = y + 1;
1574 yl = y - 1;
1575 yrr = y + 2;
1576 yll = y - 2;
1577
1578 break;
1579 case 2:
1580 zr = z + 1;
1581 zl = z - 1;
1582 zrr = z + 2;
1583 zll = z - 2;
1584
1585 break;
1586 }
1587
1588 if(dim == 0)
1589 forcegrid[TI(x, y, z)] = fac * ((4.0 / 3) * (rhogrid[TI(x, yl, zl)] - rhogrid[TI(x, yr, zr)]) -
1590 (1.0 / 6) * (rhogrid[TI(x, yll, zll)] - rhogrid[TI(x, yrr, zrr)]));
1591 else
1592 forcegrid[FI(x, y, z)] = fac * ((4.0 / 3) * (rhogrid[FI(x, yl, zl)] - rhogrid[FI(x, yr, zr)]) -
1593 (1.0 / 6) * (rhogrid[FI(x, yll, zll)] - rhogrid[FI(x, yrr, zrr)]));
1594 }
1595
1596 if(dim == 0)
1597 my_slab_transposeB(&myplan, forcegrid, rhogrid); /* reverse the transpose from above */
1598#else
1599 fft_real *scratch;
1600
1601 if(dim != 2)
1602 {
1603 scratch = (fft_real *)Mem.mymalloc("scratch", myplan.fftsize * sizeof(fft_real)); /* need a third field as scratch space */
1604 memcpy(scratch, rhogrid, myplan.fftsize * sizeof(fft_real));
1605
1606 if(dim == 1)
1607 my_fft_swap23(&myplan, scratch, forcegrid);
1608 else
1609 my_fft_swap13(&myplan, scratch, forcegrid);
1610 }
1611
1612 int ncols;
1613 if(dim == 2)
1614 ncols = myplan.ncol_XY;
1615 else if(dim == 1)
1616 ncols = myplan.ncol_XZ;
1617 else
1618 ncols = myplan.ncol_ZY;
1619
1620 for(int i = 0; i < ncols; i++)
1621 {
1622 fft_real *forcep, *potp;
1623
1624 if(dim != 2)
1625 {
1626 forcep = &scratch[GRID * i];
1627 potp = &forcegrid[GRID * i];
1628 }
1629 else
1630 {
1631 forcep = &forcegrid[GRID2 * i];
1632 potp = &rhogrid[GRID2 * i];
1633 }
1634
1635 for(int z = 2; z < GRID / 2 - 2; z++)
1636 {
1637 int zr = z + 1;
1638 int zl = z - 1;
1639 int zrr = z + 2;
1640 int zll = z - 2;
1641
1642 forcep[z] = fac * ((4.0 / 3) * (potp[zl] - potp[zr]) - (1.0 / 6) * (potp[zll] - potp[zrr]));
1643 }
1644 }
1645
1646 if(dim != 2)
1647 {
1648 if(dim == 1)
1649 my_fft_swap23back(&myplan, scratch, forcegrid);
1650 else
1651 my_fft_swap13back(&myplan, scratch, forcegrid);
1652
1653 Mem.myfree(scratch);
1654 }
1655#endif
1656
1657#ifdef PM_ZOOM_OPTIMIZED
1658 pmforce_nonperiodic_zoom_optimized_readout_forces_or_potential(grnr, dim);
1659#else
1660 pmforce_nonperiodic_uniform_optimized_readout_forces_or_potential(grnr, dim);
1661#endif
1662 }
1663
1664 /* free stuff */
1665 Mem.myfree(forcegrid);
1666 Mem.myfree(rhogrid);
1667
1668#ifdef PM_ZOOM_OPTIMIZED
1669 Mem.myfree(localfield_recvcount);
1670 Mem.myfree(localfield_offset);
1671 Mem.myfree(localfield_sendcount);
1672 Mem.myfree(localfield_first);
1673 Mem.myfree(localfield_data);
1674 Mem.myfree(localfield_globalindex);
1675 Mem.myfree(part);
1676#else
1677 Mem.myfree(partin);
1678 Mem.myfree(Rcvpm_offset);
1679 Mem.myfree(Rcvpm_count);
1680 Mem.myfree(Sndpm_offset);
1681 Mem.myfree(Sndpm_count);
1682#endif
1683
1684 double tend = Logs.second();
1685
1686 mpi_printf("PM-NONPERIODIC: done. (took %g seconds)\n", Logs.timediff(tstart, tend));
1687
1688 return 0;
1689}
1690
1694void pm_nonperiodic::pm_setup_nonperiodic_kernel(void)
1695{
1696 mpi_printf("PM-NONPERIODIC: Setting up non-periodic PM kernel(s) (GRID=%d) presently allocated=%g MB).\n", (int)GRID,
1698
1699 /* now set up kernel and its Fourier transform */
1700
1701#if !defined(PERIODIC)
1702 for(size_t i = 0; i < maxfftsize; i++) /* clear local field */
1703 kernel[0][i] = 0;
1704
1705#ifndef FFT_COLUMN_BASED
1706 for(int i = myplan.slabstart_x; i < (myplan.slabstart_x + myplan.nslab_x); i++)
1707 for(int j = 0; j < GRID; j++)
1708 {
1709#else
1710 for(int c = myplan.firstcol_XY; c < (myplan.firstcol_XY + myplan.ncol_XY); c++)
1711 {
1712 int i = c / GRID;
1713 int j = c % GRID;
1714#endif
1715 for(int k = 0; k < GRID; k++)
1716 {
1717 double xx = ((double)i) / GRID;
1718 double yy = ((double)j) / GRID;
1719 double zz = ((double)k) / GRID;
1720
1721 if(xx >= 0.5)
1722 xx -= 1.0;
1723 if(yy >= 0.5)
1724 yy -= 1.0;
1725 if(zz >= 0.5)
1726 zz -= 1.0;
1727
1728 double r = sqrt(xx * xx + yy * yy + zz * zz);
1729
1730 double u = 0.5 * r / (((double)ASMTH) / GRID);
1731
1732 double fac = 1 - erfc(u);
1733
1734#ifndef FFT_COLUMN_BASED
1735 size_t ip = FI(i - myplan.slabstart_x, j, k);
1736#else
1737 size_t ip = FC(c, k);
1738#endif
1739 if(r > 0)
1740 kernel[0][ip] = -fac / r;
1741 else
1742 kernel[0][ip] = -1 / (sqrt(M_PI) * (((double)ASMTH) / GRID));
1743 }
1744 }
1745
1746 {
1747 fft_real *workspc = (fft_real *)Mem.mymalloc("workspc", maxfftsize * sizeof(fft_real));
1748 /* Do the FFT of the kernel */
1749#ifndef FFT_COLUMN_BASED
1750 my_slab_based_fft(&myplan, kernel[0], workspc, 1);
1751#else
1752 my_column_based_fft(&myplan, kernel[0], workspc, 1); /* result is in workspace, not in kernel */
1753 memcpy(kernel[0], workspc, maxfftsize * sizeof(fft_real));
1754#endif
1755 Mem.myfree(workspc);
1756 }
1757
1758#endif
1759
1760#if defined(PLACEHIGHRESREGION)
1761
1762 for(int i = 0; i < maxfftsize; i++) /* clear local field */
1763 kernel[1][i] = 0;
1764
1765#ifndef FFT_COLUMN_BASED
1766 for(int i = myplan.slabstart_x; i < (myplan.slabstart_x + myplan.nslab_x); i++)
1767 for(int j = 0; j < GRID; j++)
1768 {
1769#else
1770 for(int c = myplan.firstcol_XY; c < (myplan.firstcol_XY + myplan.ncol_XY); c++)
1771 {
1772 int i = c / GRID;
1773 int j = c % GRID;
1774#endif
1775 for(int k = 0; k < GRID; k++)
1776 {
1777 double xx = ((double)i) / GRID;
1778 double yy = ((double)j) / GRID;
1779 double zz = ((double)k) / GRID;
1780
1781 if(xx >= 0.5)
1782 xx -= 1.0;
1783 if(yy >= 0.5)
1784 yy -= 1.0;
1785 if(zz >= 0.5)
1786 zz -= 1.0;
1787
1788 double r = sqrt(xx * xx + yy * yy + zz * zz);
1789
1790 double u = 0.5 * r / (((double)ASMTH) / GRID);
1791
1792 double fac = erfc(u * Sp->Asmth[1] / Sp->Asmth[0]) - erfc(u);
1793
1794#ifndef FFT_COLUMN_BASED
1795 size_t ip = FI(i - myplan.slabstart_x, j, k);
1796#else
1797 size_t ip = FC(c, k);
1798#endif
1799
1800 if(r > 0)
1801 kernel[1][ip] = -fac / r;
1802 else
1803 {
1804 fac = 1 - Sp->Asmth[1] / Sp->Asmth[0];
1805 kernel[1][ip] = -fac / (sqrt(M_PI) * (((double)ASMTH) / GRID));
1806 }
1807 }
1808#ifndef FFT_COLUMN_BASED
1809 }
1810#else
1811 }
1812#endif
1813
1814 {
1815 fft_real *workspc = (fft_real *)Mem.mymalloc("workspc", maxfftsize * sizeof(fft_real));
1816 /* Do the FFT of the kernel */
1817#ifndef FFT_COLUMN_BASED
1818 my_slab_based_fft(&myplan, kernel[1], workspc, 1);
1819#else
1820 my_column_based_fft(&myplan, kernel[1], workspc, 1); /* result is in workspace, not in kernel */
1821 memcpy(kernel[1], workspc, maxfftsize * sizeof(fft_real));
1822#endif
1823 Mem.myfree(workspc);
1824 }
1825
1826#endif
1827
1828 /* deconvolve the Greens function twice with the CIC kernel */
1829#ifdef FFT_COLUMN_BASED
1830
1831 for(large_array_offset ip = 0; ip < myplan.second_transposed_ncells; ip++)
1832 {
1833 large_array_offset ipcell = ip + myplan.transposed_firstcol * GRID;
1834 int y = ipcell / (GRID * GRIDz);
1835 int yr = ipcell % (GRID * GRIDz);
1836 int z = yr / GRID;
1837 int x = yr % GRID;
1838#else
1839 for(int x = 0; x < GRID; x++)
1840 for(int y = myplan.slabstart_y; y < myplan.slabstart_y + myplan.nslab_y; y++)
1841 for(int z = 0; z < GRIDz; z++)
1842 {
1843#endif
1844 double kx, ky, kz;
1845
1846 if(x > GRID / 2)
1847 kx = x - GRID;
1848 else
1849 kx = x;
1850 if(y > GRID / 2)
1851 ky = y - GRID;
1852 else
1853 ky = y;
1854 if(z > GRID / 2)
1855 kz = z - GRID;
1856 else
1857 kz = z;
1858
1859 double k2 = kx * kx + ky * ky + kz * kz;
1860
1861 if(k2 > 0)
1862 {
1863 double fx = 1, fy = 1, fz = 1;
1864
1865 if(kx != 0)
1866 {
1867 fx = (M_PI * kx) / GRID;
1868 fx = sin(fx) / fx;
1869 }
1870 if(ky != 0)
1871 {
1872 fy = (M_PI * ky) / GRID;
1873 fy = sin(fy) / fy;
1874 }
1875 if(kz != 0)
1876 {
1877 fz = (M_PI * kz) / GRID;
1878 fz = sin(fz) / fz;
1879 }
1880
1881 double ff = 1 / (fx * fy * fz);
1882 ff = ff * ff * ff * ff;
1883
1884#ifndef FFT_COLUMN_BASED
1885 large_array_offset ip = ((large_array_offset)GRIDz) * (GRID * (y - myplan.slabstart_y) + x) + z;
1886#endif
1887#if !defined(PERIODIC)
1888 fft_of_kernel[0][ip][0] *= ff;
1889 fft_of_kernel[0][ip][1] *= ff;
1890#endif
1891#if defined(PLACEHIGHRESREGION)
1892 fft_of_kernel[1][ip][0] *= ff;
1893 fft_of_kernel[1][ip][1] *= ff;
1894#endif
1895 }
1896 }
1897
1898 /* end deconvolution */
1899}
1900
1901#endif
global_data_all_processes All
Definition: main.cc:40
double timediff(double t0, double t1)
Definition: logs.cc:488
double second(void)
Definition: logs.cc:471
double getAllocatedBytesInMB(void)
Definition: mymalloc.h:71
void my_column_based_fft_init(fft_plan *plan, int NgridX, int NgridY, int NgridZ)
void my_column_based_fft(fft_plan *plan, void *data, void *workspace, int forward)
void my_fft_swap13back(fft_plan *plan, fft_real *data, fft_real *out)
void my_fft_swap13(fft_plan *plan, fft_real *data, fft_real *out)
void my_slab_transposeB(fft_plan *plan, fft_real *field, fft_real *scratch)
void my_slab_based_fft(fft_plan *plan, void *data, void *workspace, int forward)
void my_fft_swap23(fft_plan *plan, fft_real *data, fft_real *out)
void my_fft_swap23back(fft_plan *plan, fft_real *data, fft_real *out)
void my_slab_transposeA(fft_plan *plan, fft_real *field, fft_real *scratch)
void my_slab_based_fft_init(fft_plan *plan, int NgridX, int NgridY, int NgridZ)
void mpi_printf(const char *fmt,...)
Definition: setcomm.h:55
int ThisTask
Definition: setcomm.h:33
int NTask
Definition: setcomm.h:32
int PTask
Definition: setcomm.h:34
MPI_Comm Communicator
Definition: setcomm.h:31
#define RCUT
Definition: constants.h:293
#define LOW_MESH
Definition: constants.h:33
#define HIGH_MESH
Definition: constants.h:34
#define MPI_MESSAGE_SIZELIMIT_IN_BYTES
Definition: constants.h:53
#define M_PI
Definition: constants.h:56
#define ASMTH
Definition: constants.h:287
double mycxxsort(T *begin, T *end, Tcomp comp)
Definition: cxxsort.h:39
uint32_t MyIntPosType
Definition: dtypes.h:35
int32_t MySignedIntPosType
Definition: dtypes.h:36
#define BITS_FOR_POSITIONS
Definition: dtypes.h:37
logs Logs
Definition: main.cc:43
#define Terminate(...)
Definition: macros.h:15
MPI_Datatype MPI_MyIntPosType
Definition: mpi_vars.cc:24
MPI_Op MPI_MAX_MySignedIntPosType
Definition: mpi_vars.cc:29
MPI_Op MPI_MIN_MySignedIntPosType
Definition: mpi_vars.cc:28
int myMPI_Alltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm)
Definition: myalltoall.cc:301
int myMPI_Sendrecv(void *sendbuf, size_t sendcount, MPI_Datatype sendtype, int dest, int sendtag, void *recvbuf, size_t recvcount, MPI_Datatype recvtype, int source, int recvtag, MPI_Comm comm, MPI_Status *status)
#define TAG_NONPERIOD_B
Definition: mpi_utils.h:45
#define TAG_NONPERIOD_C
Definition: mpi_utils.h:46
void myMPI_Alltoallv(void *sendbuf, size_t *sendcounts, size_t *sdispls, void *recvbuf, size_t *recvcounts, size_t *rdispls, int len, int big_flag, MPI_Comm comm)
Definition: myalltoall.cc:181
#define TAG_NONPERIOD_A
Definition: mpi_utils.h:44
memory Mem
Definition: main.cc:44
expr sin(half arg)
Definition: half.hpp:2816
expr pow(half base, half exp)
Definition: half.hpp:2803
expr erfc(half arg)
Definition: half.hpp:2925
expr sqrt(half arg)
Definition: half.hpp:2777
#define FFTW(x)
Definition: pm_mpi_fft.h:22
integertime Ti_Current
Definition: allvars.h:188
MyDouble getMass(void)
vector< MyFloat > GravAccel
Definition: particle_data.h:55
MyIntPosType IntPos[3]
Definition: particle_data.h:53
void subdivide_evenly(long long N, int pieces, int index_bin, long long *first, int *count)
Definition: system.cc:51