SIONlib  1.7.4
Scalable I/O library for parallel access to task-local files
partest_split_comm.c
1 /****************************************************************************
2 ** SIONLIB http://www.fz-juelich.de/jsc/sionlib **
3 *****************************************************************************
4 ** Copyright (c) 2008-2019 **
5 ** Forschungszentrum Juelich, Juelich Supercomputing Centre **
6 ** **
7 ** See the file COPYRIGHT in the package base directory for details **
8 ****************************************************************************/
9 
10 #define _XOPEN_SOURCE 700
11 
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include <string.h>
15 #include <ctype.h>
16 #include <unistd.h>
17 #include <mpi.h>
18 #include <time.h>
19 #include <math.h>
20 
21 #include "partest_split_comm.h"
22 #include "partest_util.h"
23 
24 #ifdef _SION_BGQ
25 #include <firmware/include/personality.h>
26 #include <spi/include/kernel/process.h>
27 #include <spi/include/kernel/location.h>
28 #ifdef __GNUC__
29 #include <hwi/include/bqc/A2_inlines.h> /* for GetTimebase() */
30 #endif
31 #include <hwi/include/common/uci.h>
32 #include <mpix.h>
33 #endif
34 
35 #ifdef _SION_BGP
36 #include <common/bgp_personality.h>
37 #include <common/bgp_personality_inlines.h>
38 #endif
39 
40 #ifdef _SION_FX
41 #include <mpi-ext.h>
42 #endif
43 
44 #ifdef _SION_AIX
45 #include <unistd.h>
46 #endif
47 
48 #ifdef _SION_LINUX
49 #include <unistd.h>
50 #endif
51 
52 
53 int split_communicator(_test_communicators * communicators, int bluegene, int bluegene_np, int bluegene_sort, int numfiles, int read_task_offset, int verbose)
54 {
55  int proc_per_file;
56 
57  communicators->work_size = communicators->work_rank = -2;
58  communicators->local_size = communicators->local_rank = -2;
59 
60 
61 
62 #ifdef _SION_BGP
63  if (bluegene) { /* order MPI-tasks by I/O-node */
64  _BGP_Personality_t personality;
65  MPI_Comm commSame, commDiff;
66  int sizeSame, sizeDiff;
67  int rankSame, rankDiff;
68  char location[128];
69  unsigned procid, x, y, z, t;
70  char cbuffer[MAXCHARLEN];
71 
72  /* get location information */
73  Kernel_GetPersonality(&personality, sizeof(personality));
74  BGP_Personality_getLocationString(&personality, location);
75  procid = Kernel_PhysicalProcessorID();
76  MPIX_rank2torus(communicators->all_rank, &x, &y, &z, &t);
77 
78  /* task of communicator working with different I/O-nodes */
79  MPIX_Pset_diff_comm_create(&commDiff);
80  MPI_Comm_size(commDiff, &sizeDiff);
81  MPI_Comm_rank(commDiff, &rankDiff);
82  communicators->ionode_number = rankDiff;
83 
84  /* communicator consists of all task working with the same I/O-node */
85  MPIX_Pset_same_comm_create(&commSame);
86  MPI_Comm_size(commSame, &sizeSame);
87  MPI_Comm_rank(commSame, &rankSame);
88 
89  /* if -p not specified all proc will write! */
90  if (bluegene_np == 0) {
91  bluegene_np = sizeSame;
92  }
93 
94  /* Get a communicator with all writing tasks => new global communicator */
95  MPI_Comm_split(communicators->all, (rankSame < bluegene_np), communicators->all_rank, &communicators->work);
96  MPI_Comm_size(communicators->work, &communicators->work_size);
97  MPI_Comm_rank(communicators->work, &communicators->work_rank);
98  if (rankSame >= bluegene_np) {
99  /* not working task */
100  communicators->work_size = communicators->work_rank = -1;
101  communicators->local_size = communicators->local_rank = -1;
102  }
103 
104  /* If only one file will be used => dont split further */
105  /* if numfile > 1 sion will generate correct local communicator */
106  if (numfiles >= 1) {
107  communicators->local = communicators->work;
108  }
109  else if(numfiles<0) {
110  if(numfiles==-1) {
111  /* Split the common communicator for each IO node to get a local comm with only the writing tasks for this IO Node */
112  MPI_Comm_split(commSame, (rankSame < bluegene_np), communicators->all_rank, &communicators->local);
113  } else {
114  /* local communicator contains only one task per IO-node */
115  /* bluegene_np has to be 512 */
116  communicators->local=commDiff;
117  }
118  }
119  MPI_Comm_size(communicators->local, &communicators->local_size);
120  MPI_Comm_rank(communicators->local, &communicators->local_rank);
121 
122  /* determine filenumber */
123  if (numfiles < 1) {
124  /* one file per I/O-node */
125  if(numfiles==-1) communicators->file_number = rankDiff;
126  else communicators->file_number = rankSame;
127  }
128  else {
129  communicators->file_number = -1;
130  }
131 
132  /* print log message about location, ... */
133  sprintf(cbuffer, "");
134  if (rankSame < bluegene_np) {
135  if (verbose) {
136  sprintf(cbuffer, "BGP[%05d] diff_comm: %4d of %4d same_comm: %5d of %5d file_comm: %5d of %5d %s phys_xyzt(%ud,%ud,%ud,%ud)\n",
137  communicators->all_rank, rankDiff + 1, sizeDiff, rankSame + 1, sizeSame, communicators->local_rank + 1, communicators->local_size,
138  location, x, y, z, t);
139  }
140  }
141  collective_print_gather(cbuffer, communicators->work);
142 
143  }
144 #endif
145 
146 #ifdef _SION_BGQ
147  if (bluegene) { /* order MPI-tasks by I/O-node */
148  Personality_t personality;
149  MPI_Comm commSame, commDiff;
150  MPIX_Hardware_t hw;
151  int sizeSame, sizeDiff;
152  int rankSame, rankDiff;
153  int baserank;
154  int factor, bridgeid,core, hwthread,procid;
155  int dist_to_bridge, isonbridge;
156  char cbuffer[MAXCHARLEN];
157  char location[64];
158  BG_UniversalComponentIdentifier uci;
159  unsigned int row, col, mp, nb, cc;
160  double starttime;
161  int key, color, baseid;
162 
163  if(communicators->all_rank==0) {
164  starttime=MPI_Wtime();
165  printf("partest_split_comm[%d]: starting at Wt=%10.3fs\n",communicators->all_rank,starttime);
166  }
167 
168  /* get location information */
169  Kernel_GetPersonality(&personality, sizeof(Personality_t));
170  MPIX_Hardware(&hw);
171  uci = personality.Kernel_Config.UCI;
172  bg_decodeComputeCardOnNodeBoardUCI(uci, &row, &col, &mp, &nb, &cc);
173 
174  procid = Kernel_ProcessorID(); /* 0-63 */
175  core = Kernel_ProcessorCoreID(); /* 0-15 */
176  hwthread = Kernel_ProcessorThreadID(); /* 0-3 */
177 
178  sprintf(location, "R%x%x-M%ud-N%02x-J%02x <%d,%d,%d,%d,%d> p%02dc%02dt%1d", row, col, mp, nb, cc,
179  personality.Network_Config.Acoord, personality.Network_Config.Bcoord,
180  personality.Network_Config.Ccoord, personality.Network_Config.Dcoord,
181  personality.Network_Config.Ecoord,
182  procid,core,hwthread);
183 
184  if (
185  ( personality.Network_Config.Acoord==personality.Network_Config.cnBridge_A ) &&
186  ( personality.Network_Config.Bcoord==personality.Network_Config.cnBridge_B ) &&
187  ( personality.Network_Config.Ccoord==personality.Network_Config.cnBridge_C ) &&
188  ( personality.Network_Config.Dcoord==personality.Network_Config.cnBridge_D ) &&
189  ( personality.Network_Config.Ecoord==personality.Network_Config.cnBridge_E )
190  )
191  {
192  isonbridge=1;
193  } else {
194  isonbridge=0;
195  }
196 
197  dist_to_bridge=MPIX_IO_distance();
198 
199  /* following could be replaced by MPIX_IO_link_id() */
200  factor=1;
201  bridgeid = personality.Network_Config.cnBridge_E;
202  factor *= personality.Network_Config.Enodes;
203  bridgeid += personality.Network_Config.cnBridge_D*factor;
204  factor *= personality.Network_Config.Dnodes;
205  bridgeid += personality.Network_Config.cnBridge_C*factor;
206  factor *= personality.Network_Config.Cnodes;
207  bridgeid += personality.Network_Config.cnBridge_B*factor;
208  factor *= personality.Network_Config.Bnodes;
209  bridgeid += personality.Network_Config.cnBridge_A*factor;
210 
211 
212  if(bluegene==2) {
213  /* per ION */
214  /* communicator consists of all task working with the same I/O-bridge */
215  if(bluegene_sort==0) {
216  key = MPIX_IO_distance();
217  } else {
218  key = communicators->all_rank;
219  }
220 
221  MPI_Comm_split(communicators->all, bridgeid, key, &commSame);
222  MPI_Comm_size(commSame, &sizeSame);
223  MPI_Comm_rank(commSame, &rankSame);
224  baseid=bridgeid;
225 
226  /* communicator consists of all task working with the different I/O-nodes */
227  MPI_Comm_split(communicators->all, rankSame, bridgeid, &commDiff);
228  MPI_Comm_size(commDiff, &sizeDiff);
229  MPI_Comm_rank(commDiff, &rankDiff);
230 
231  communicators->ionode_number = rankDiff;
232  } else {
233  /* similar to MPIX_Pset_same_comm_create but with ION-id */
234  MPI_Comm commSameIOB;
235  int sizeSameIOB;
236  int rankSameIOB;
237  int ionid;
238 
239  /* only needed for ionid distribution */
240  key = communicators->all_rank;
241  MPI_Comm_split(communicators->all, bridgeid, key, &commSameIOB);
242  MPI_Comm_size(commSameIOB, &sizeSameIOB);
243  MPI_Comm_rank(commSameIOB, &rankSameIOB);
244 
245  if(communicators->all_rank==0) {
246  printf("partest_split_comm[%d]: after split commSameIOB deltaWt=%10.3fs\n",communicators->all_rank,MPI_Wtime()-starttime);
247  }
248 
249  if(rankSameIOB==0) {
250  /* ionid=MPIX_IO_node_id() & 0xFFFF; */
251  ionid=MPIX_IO_node_id();
252  }
253  MPI_Bcast(&ionid, 1, MPI_INT, 0, commSameIOB);
254 
255  if(communicators->all_rank==0) {
256  printf("partest_split_comm[%d]: after get ionid deltaWt=%10.3fs\n",communicators->all_rank,MPI_Wtime()-starttime);
257  }
258 
259  color = (int) ionid;
260  if(bluegene_sort==0) {
261  key = MPIX_IO_distance();
262  } else {
263  key = communicators->all_rank;
264  }
265  int rc=0;
266  rc=MPI_Comm_split(communicators->all,color,key,&commSame);
267  MPI_Comm_size(commSame, &sizeSame);
268  MPI_Comm_rank(commSame, &rankSame);
269  baseid=ionid;
270 
271  if(communicators->all_rank==0) {
272  printf("partest_split_comm[%d]: after split commSame deltaWt=%10.3fs\n",communicators->all_rank,MPI_Wtime()-starttime);
273  }
274 
275  /* distribute global rank of first task in samecomm */
276  if(rankSame==0) baserank=communicators->all_rank;
277  MPI_Bcast(&baserank, 1, MPI_INT, 0, commSame);
278 
279  if(communicators->all_rank==0) {
280  printf("partest_split_comm[%d]: after bcast baserank deltaWt=%10.3fs\n",communicators->all_rank,MPI_Wtime()-starttime);
281  }
282 
283  /* similar to MPIX_Pset_diff_comm_create but with ION-id, but without: ... *hw.ppn+hw.coreID */
284  color = rankSame;
285  key = baserank; /* rank of task 0 of samecomm in MPI_COMM_WORLD */
286  MPI_Comm_split(communicators->all,color,key,&commDiff);
287  MPI_Comm_size(commDiff, &sizeDiff);
288  MPI_Comm_rank(commDiff, &rankDiff);
289 
290  if(communicators->all_rank==0) {
291  printf("partest_split_comm[%d]: after split commDiff deltaWt=%10.3fs\n",communicators->all_rank,MPI_Wtime()-starttime);
292  }
293 
294  }
295 
296  /* printf("WF: %02d of %02d here rSame=%2d rDiff=%2d bg_np=%2d --> factor=%2d, bridgeid=%2d %s -> bridge(%d,%d,%d,%d,%d)\n",communicators->all_rank,communicators->all_size, rankSame, rankDiff, bluegene_np, factor, bridgeid,location, */
297  /* personality.Network_Config.cnBridge_A, personality.Network_Config.cnBridge_B, personality.Network_Config.cnBridge_C, personality.Network_Config.cnBridge_D, personality.Network_Config.cnBridge_E); */
298 
299  /* if -p not specified all proc will write! */
300  if (bluegene_np == 0) {
301  bluegene_np = sizeSame;
302  }
303 
304  /* Get a communicator with all writing tasks => new global communicator */
305  /* TODO: better to MPI_UNDEFINED when rankSame >= bluegene_np */
306  MPI_Comm_split(communicators->all, (rankSame < bluegene_np), communicators->all_rank, &communicators->work);
307  MPI_Comm_size(communicators->work, &communicators->work_size);
308  MPI_Comm_rank(communicators->work, &communicators->work_rank);
309  if (rankSame >= bluegene_np) {
310  /* not working task */
311  communicators->work_size = communicators->work_rank = -1;
312  communicators->local_size = communicators->local_rank = -1;
313  }
314  if(communicators->all_rank==0) {
315  printf("partest_split_comm[%d]: after split work deltaWt=%10.3fs\n",communicators->all_rank,MPI_Wtime()-starttime);
316  }
317 
318  /* If only one file will be used => dont split further */
319  /* if numfile > 1 sion will generate correct local communicator */
320  if (numfiles >= 1) {
321  communicators->local = communicators->work;
322  }
323  else if(numfiles<0) {
324  if(numfiles==-1) {
325 #ifdef SPLITCASCADE
326  /* Split the common communicator for each IO node to get a local comm with only the writing tasks for this IO Node */
327  MPI_Comm_split(commSame, (rankSame < bluegene_np), rankSame, &communicators->local);
328 #else
329  /* Split the common communicator for each IO node to get a local comm with only the writing tasks for this IO Node */
330  color=(rankSame < bluegene_np)?baseid:MPI_UNDEFINED;
331  MPI_Comm_split(communicators->all, color, rankSame, &communicators->local);
332 #endif
333  if (rankSame < bluegene_np) {
334  MPI_Comm_size(communicators->local, &communicators->local_size);
335  MPI_Comm_rank(communicators->local, &communicators->local_rank);
336  }
337 
338  if(communicators->all_rank==0) {
339  printf("partest_split_comm[%d]: after split local deltaWt=%10.3fs\n",communicators->all_rank,MPI_Wtime()-starttime);
340  }
341 
342  } else {
343  /* local communicator contains only one task per IO-node */
344  /* bluegene_np has to be 512 */
345  communicators->local=commDiff;
346  MPI_Comm_size(communicators->local, &communicators->local_size);
347  MPI_Comm_rank(communicators->local, &communicators->local_rank);
348  }
349  }
350 
351  /* determine filenumber */
352  if (numfiles < 1) {
353  /* one file per I/O-node */
354  if(numfiles==-1) communicators->file_number = rankDiff;
355  else communicators->file_number = rankSame;
356  }
357  else {
358  communicators->file_number = -1;
359  }
360 
361  if(communicators->all_rank==0) {
362  printf("partest_split_comm[%d]: before print verbose deltaWt=%10.3fs\n",communicators->all_rank,MPI_Wtime()-starttime);
363  }
364  /* print log message about location, ... */
365  if (verbose) {
366  sprintf(cbuffer, "");
367  if (rankSame < bluegene_np) {
368  sprintf(cbuffer, "BGQ[%05d] diff_comm: %4d of %4d same_comm: %5d of %5d file_comm: %5d of %5d %s bridge=%d, dist=%d\n",
369  communicators->all_rank, rankDiff + 1, sizeDiff, rankSame + 1, sizeSame, communicators->local_rank + 1,
370  communicators->local_size, location,isonbridge,dist_to_bridge);
371  }
372  collective_print_gather(cbuffer, communicators->work);
373  }
374  if(communicators->all_rank==0) {
375  printf("partest_split_comm[%d]: after print verbose/end deltaWt=%10.3fs\n",communicators->all_rank,MPI_Wtime()-starttime);
376  }
377 
378  } /* if (bluegene) */
379 #endif
380 
381 #ifdef _SION_AIX
382  /* no communicator adjustment */
383 #endif
384 
385 #ifdef _SION_DARWIN
386  if (verbose) {
387  char location[256];
388  gethostname(location, 256);
389  char cbuffer[MAXCHARLEN];
390  sprintf(cbuffer, "DARWIN[%03d] diff_comm: %4d of %4d same_comm: %4d of %4d file_comm: %4d of %4d %s\n",
391  communicators->all_rank, communicators->all_rank, communicators->all_size,
392  communicators->work_rank, communicators->work_size, communicators->local_rank, communicators->local_size, location);
393  collective_print_gather(cbuffer, communicators->all);
394  }
395 
396 #endif
397 
398 #if defined( _SION_LINUX) && (!defined(_SION_FX))
399  if (verbose) {
400  char location[256];
401  gethostname(location, 256);
402  char cbuffer[MAXCHARLEN];
403  sprintf(cbuffer, "LINUX[%03d] diff_comm: %4d of %4d same_comm: %4d of %4d file_comm: %4d of %4d %s\n",
404  communicators->all_rank, communicators->all_rank, communicators->all_size,
405  communicators->work_rank, communicators->work_size, communicators->local_rank, communicators->local_size, location);
406  collective_print_gather(cbuffer, communicators->all);
407  }
408 
409 #endif
410 
411 #ifdef _SION_AIX
412  if (verbose) {
413  char location[256];
414  gethostname(location, 256);
415  int sizeSame = 0, sizeDiff = 0;
416  char cbuffer[MAXCHARLEN];
417  sprintf(cbuffer, "AIX[%03d] diff_comm: %4d of %4d same_comm: %4d of %4d file_comm: %4d of %4d %s\n",
418  communicators->all_rank, communicators->all_rank, communicators->all_size,
419  communicators->work_rank, communicators->work_size, communicators->local_rank, communicators->local_size, location);
420  collective_print_gather(cbuffer, communicators->all);
421  }
422 
423 #endif
424 
425 #ifdef _SION_FX
426  if (bluegene) { /* order MPI-tasks by I/O-node */
427  int rank, x, y, z, a, b, c, rc;
428  char location[256];
429  char cbuffer[MAXCHARLEN];
430  int ionodeid;
431  MPI_Comm commSame, commDiff;
432  int sizeSame = 0, sizeDiff;
433  int rankSame = 0, rankDiff;
434 
435  gethostname(location, 256);
436  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
437  rc=FJMPI_Topology_sys_rank2xyzabc(rank, &x, &y, &z, &a, &b, &c);
438 
439  ionodeid=x * 65536 + y;
440 
441  if(bluegene>0) {
442  /* per ION */
443  /* communicator consists of all task working with the same I/O-node */
444  MPI_Comm_split(communicators->all, ionodeid, communicators->all_rank, &commSame);
445  MPI_Comm_size(commSame, &sizeSame);
446  MPI_Comm_rank(commSame, &rankSame);
447 
448  /* communicator consists of all task working with the different I/O-nodes */
449  MPI_Comm_split(communicators->all, rankSame, ionodeid, &commDiff);
450  MPI_Comm_size(commDiff, &sizeDiff);
451  MPI_Comm_rank(commDiff, &rankDiff);
452 
453  communicators->ionode_number = rankDiff;
454 
455  } else {
456  bluegene_np = sizeSame;
457  }
458 
459  /* Get a communicator with all writing tasks => new global communicator */
460  MPI_Comm_split(communicators->all, (rankSame < bluegene_np), communicators->all_rank, &communicators->work);
461  MPI_Comm_size(communicators->work, &communicators->work_size);
462  MPI_Comm_rank(communicators->work, &communicators->work_rank);
463  if (rankSame >= bluegene_np) {
464  /* not working task */
465  communicators->work_size = communicators->work_rank = -1;
466  communicators->local_size = communicators->local_rank = -1;
467  }
468 
469  /* If only one file will be used => dont split further */
470  /* if numfile > 1 sion will generate correct local communicator */
471  if (numfiles >= 1) {
472  communicators->local = communicators->work;
473  }
474  else if(numfiles<0) {
475  if(numfiles==-1) {
476  /* Split the common communicator for each IO node to get a local comm with only the writing tasks for this IO Node */
477  MPI_Comm_split(commSame, (rankSame < bluegene_np), communicators->all_rank, &communicators->local);
478  } else {
479  /* local communicator contains only one task per IO-node */
480  /* bluegene_np has to be 512 */
481  communicators->local=commDiff;
482  }
483  }
484  MPI_Comm_size(communicators->local, &communicators->local_size);
485  MPI_Comm_rank(communicators->local, &communicators->local_rank);
486 
487  /* determine filenumber */
488  if (numfiles < 1) {
489  /* one file per I/O-node */
490  if(numfiles==-1) communicators->file_number = rankDiff;
491  else communicators->file_number = rankSame;
492  }
493  else {
494  communicators->file_number = -1;
495  }
496 
497  /* print log message about location, ... */
498  sprintf(cbuffer, "");
499  if (rankSame < bluegene_np) {
500  sprintf(cbuffer, "FX[%03d] diff_comm: %4d of %4d same_comm: %4d of %4d file_comm: %4d of %4d %dx%dx%d %dx%dx%d %s ioid=%d ion=%d rc=%d numfiles=%d\n",
501 
502  communicators->all_rank, rankDiff, sizeDiff, rankSame, sizeSame,
503  communicators->local_rank, communicators->local_size,
504  x,y,z,a,b,c, location, ionodeid, communicators->ionode_number, rc, numfiles);
505  }
506  collective_print_gather(cbuffer, communicators->all);
507 
508  }
509 #endif
510 
511 
512  /* initial set of communicators not changed? */
513  if (communicators->work_size == -2) {
514  /* all task do work */
515  communicators->work = communicators->all;
516  MPI_Comm_size(communicators->work, &communicators->work_size);
517  MPI_Comm_rank(communicators->work, &communicators->work_rank);
518  }
519  /* local communicators */
520  if (communicators->local_size == -2) {
521  if (numfiles == 1) {
522  communicators->local = communicators->work;
523  }
524  /* set a default distribution on files, will be computed again by sion_open */
525  if (numfiles < 1) {
526  numfiles = communicators->work_size / 2;
527  if (numfiles == 0)
528  numfiles = 1;
529  }
530  proc_per_file = communicators->work_size / numfiles;
531 
532  /* remaining tasks are write/read to/from the last file */
533  if (communicators->work_rank >= (numfiles * proc_per_file)) {
534  communicators->file_number = numfiles - 1;
535  }
536  else {
537  communicators->file_number = communicators->work_rank / proc_per_file;
538  }
539 
540  MPI_Comm_split(communicators->work, communicators->file_number, communicators->all_rank, &communicators->local);
541 
542  MPI_Comm_size(communicators->local, &communicators->local_size);
543  MPI_Comm_rank(communicators->local, &communicators->local_rank);
544 
545  communicators->ionode_number = communicators->file_number;
546 
547  }
548 
549  /* shift working tasks */
550  if (communicators->work_size != -1) {
551  /* only if task in communicator work */
552  int newtasknr;
553  newtasknr=(communicators->work_rank+read_task_offset)%communicators->work_size;
554  MPI_Comm_split(communicators->work, 0, newtasknr, &communicators->workread);
555 
556  MPI_Comm_size(communicators->workread, &communicators->workread_size);
557  MPI_Comm_rank(communicators->workread, &communicators->workread_rank);
558  /* printf("WF: %d %d %% %d-> %d (%d %d)\n",
559  communicators->work_rank,read_task_offset,
560  communicators->work_size,newtasknr,
561  communicators->workread_rank,communicators->workread_size);*/
562 
563  } else {
564  /* this rtask will not be used for reading */
565  communicators->workread_size = communicators->workread_rank = -1;
566  communicators->local_size = communicators->local_rank = -1;
567  }
568 
569  return(1);
570 }
571