SIONlib  1.7.1
Scalable I/O library for parallel access to task-local files
partest_split_comm.c
1 /****************************************************************************
2 ** SIONLIB http://www.fz-juelich.de/jsc/sionlib **
3 *****************************************************************************
4 ** Copyright (c) 2008-2016 **
5 ** Forschungszentrum Juelich, Juelich Supercomputing Centre **
6 ** **
7 ** See the file COPYRIGHT in the package base directory for details **
8 ****************************************************************************/
9 #include <stdio.h>
10 #include <stdlib.h>
11 #include <string.h>
12 #include <ctype.h>
13 #include <unistd.h>
14 #include <mpi.h>
15 #include <time.h>
16 #include <math.h>
17 
18 #include "partest_split_comm.h"
19 #include "partest_util.h"
20 
21 #ifdef _SION_BGQ
22 #include <firmware/include/personality.h>
23 #include <spi/include/kernel/process.h>
24 #include <spi/include/kernel/location.h>
25 #ifdef __GNUC__
26 #include <hwi/include/bqc/A2_inlines.h> /* for GetTimebase() */
27 #endif
28 #include <hwi/include/common/uci.h>
29 #include <mpix.h>
30 #endif
31 
32 #ifdef _SION_BGP
33 #include <common/bgp_personality.h>
34 #include <common/bgp_personality_inlines.h>
35 #endif
36 
37 #ifdef _SION_FX
38 #include <mpi-ext.h>
39 #endif
40 
41 #ifdef _SION_XT
42 #include <unistd.h>
43 #include <pmi.h>
44 #include <rca_lib.h>
45 #endif
46 
47 #ifdef _SION_AIX
48 #include <unistd.h>
49 #endif
50 
51 #ifdef _SION_LINUX
52 #include <unistd.h>
53 #endif
54 
55 
56 int split_communicator(_test_communicators * communicators, int bluegene, int bluegene_np, int bluegene_sort, int numfiles, int read_task_offset, int verbose)
57 {
58  int proc_per_file;
59 
60  communicators->work_size = communicators->work_rank = -2;
61  communicators->local_size = communicators->local_rank = -2;
62 
63 
64 
65 #ifdef _SION_BGP
66  if (bluegene) { /* order MPI-tasks by I/O-node */
67  _BGP_Personality_t personality;
68  MPI_Comm commSame, commDiff;
69  int sizeSame, sizeDiff;
70  int rankSame, rankDiff;
71  char location[128];
72  unsigned procid, x, y, z, t;
73  char cbuffer[MAXCHARLEN];
74 
75  /* get location information */
76  Kernel_GetPersonality(&personality, sizeof(personality));
77  BGP_Personality_getLocationString(&personality, location);
78  procid = Kernel_PhysicalProcessorID();
79  MPIX_rank2torus(communicators->all_rank, &x, &y, &z, &t);
80 
81  /* task of communicator working with different I/O-nodes */
82  MPIX_Pset_diff_comm_create(&commDiff);
83  MPI_Comm_size(commDiff, &sizeDiff);
84  MPI_Comm_rank(commDiff, &rankDiff);
85  communicators->ionode_number = rankDiff;
86 
87  /* communicator consists of all task working with the same I/O-node */
88  MPIX_Pset_same_comm_create(&commSame);
89  MPI_Comm_size(commSame, &sizeSame);
90  MPI_Comm_rank(commSame, &rankSame);
91 
92  /* if -p not specified all proc will write! */
93  if (bluegene_np == 0) {
94  bluegene_np = sizeSame;
95  }
96 
97  /* Get a communicator with all writing tasks => new global communicator */
98  MPI_Comm_split(communicators->all, (rankSame < bluegene_np), communicators->all_rank, &communicators->work);
99  MPI_Comm_size(communicators->work, &communicators->work_size);
100  MPI_Comm_rank(communicators->work, &communicators->work_rank);
101  if (rankSame >= bluegene_np) {
102  /* not working task */
103  communicators->work_size = communicators->work_rank = -1;
104  communicators->local_size = communicators->local_rank = -1;
105  }
106 
107  /* If only one file will be used => dont split further */
108  /* if numfile > 1 sion will generate correct local communicator */
109  if (numfiles >= 1) {
110  communicators->local = communicators->work;
111  }
112  else if(numfiles<0) {
113  if(numfiles==-1) {
114  /* Split the common communicator for each IO node to get a local comm with only the writing tasks for this IO Node */
115  MPI_Comm_split(commSame, (rankSame < bluegene_np), communicators->all_rank, &communicators->local);
116  } else {
117  /* local communicator contains only one task per IO-node */
118  /* bluegene_np has to be 512 */
119  communicators->local=commDiff;
120  }
121  }
122  MPI_Comm_size(communicators->local, &communicators->local_size);
123  MPI_Comm_rank(communicators->local, &communicators->local_rank);
124 
125  /* determine filenumber */
126  if (numfiles < 1) {
127  /* one file per I/O-node */
128  if(numfiles==-1) communicators->file_number = rankDiff;
129  else communicators->file_number = rankSame;
130  }
131  else {
132  communicators->file_number = -1;
133  }
134 
135  /* print log message about location, ... */
136  sprintf(cbuffer, "");
137  if (rankSame < bluegene_np) {
138  if (verbose) {
139  sprintf(cbuffer, "BGP[%05d] diff_comm: %4d of %4d same_comm: %5d of %5d file_comm: %5d of %5d %s phys_xyzt(%ud,%ud,%ud,%ud)\n",
140  communicators->all_rank, rankDiff + 1, sizeDiff, rankSame + 1, sizeSame, communicators->local_rank + 1, communicators->local_size,
141  location, x, y, z, t);
142  }
143  }
144  collective_print_gather(cbuffer, communicators->work);
145 
146  }
147 #endif
148 
149 #ifdef _SION_BGQ
150  if (bluegene) { /* order MPI-tasks by I/O-node */
151  Personality_t personality;
152  MPI_Comm commSame, commDiff;
153  MPIX_Hardware_t hw;
154  int sizeSame, sizeDiff;
155  int rankSame, rankDiff;
156  int baserank;
157  int factor, bridgeid,core, hwthread,procid;
158  int dist_to_bridge, isonbridge;
159  char cbuffer[MAXCHARLEN];
160  char location[64];
161  BG_UniversalComponentIdentifier uci;
162  unsigned int row, col, mp, nb, cc;
163  double starttime;
164  int key, color, baseid;
165 
166  if(communicators->all_rank==0) {
167  starttime=MPI_Wtime();
168  printf("partest_split_comm[%d]: starting at Wt=%10.3fs\n",communicators->all_rank,starttime);
169  }
170 
171  /* get location information */
172  Kernel_GetPersonality(&personality, sizeof(Personality_t));
173  MPIX_Hardware(&hw);
174  uci = personality.Kernel_Config.UCI;
175  bg_decodeComputeCardOnNodeBoardUCI(uci, &row, &col, &mp, &nb, &cc);
176 
177  procid = Kernel_ProcessorID(); /* 0-63 */
178  core = Kernel_ProcessorCoreID(); /* 0-15 */
179  hwthread = Kernel_ProcessorThreadID(); /* 0-3 */
180 
181  sprintf(location, "R%x%x-M%ud-N%02x-J%02x <%d,%d,%d,%d,%d> p%02dc%02dt%1d", row, col, mp, nb, cc,
182  personality.Network_Config.Acoord, personality.Network_Config.Bcoord,
183  personality.Network_Config.Ccoord, personality.Network_Config.Dcoord,
184  personality.Network_Config.Ecoord,
185  procid,core,hwthread);
186 
187  if (
188  ( personality.Network_Config.Acoord==personality.Network_Config.cnBridge_A ) &&
189  ( personality.Network_Config.Bcoord==personality.Network_Config.cnBridge_B ) &&
190  ( personality.Network_Config.Ccoord==personality.Network_Config.cnBridge_C ) &&
191  ( personality.Network_Config.Dcoord==personality.Network_Config.cnBridge_D ) &&
192  ( personality.Network_Config.Ecoord==personality.Network_Config.cnBridge_E )
193  )
194  {
195  isonbridge=1;
196  } else {
197  isonbridge=0;
198  }
199 
200  dist_to_bridge=MPIX_IO_distance();
201 
202  /* following could be replaced by MPIX_IO_link_id() */
203  factor=1;
204  bridgeid = personality.Network_Config.cnBridge_E;
205  factor *= personality.Network_Config.Enodes;
206  bridgeid += personality.Network_Config.cnBridge_D*factor;
207  factor *= personality.Network_Config.Dnodes;
208  bridgeid += personality.Network_Config.cnBridge_C*factor;
209  factor *= personality.Network_Config.Cnodes;
210  bridgeid += personality.Network_Config.cnBridge_B*factor;
211  factor *= personality.Network_Config.Bnodes;
212  bridgeid += personality.Network_Config.cnBridge_A*factor;
213 
214 
215  if(bluegene==2) {
216  /* per ION */
217  /* communicator consists of all task working with the same I/O-bridge */
218  if(bluegene_sort==0) {
219  key = MPIX_IO_distance();
220  } else {
221  key = communicators->all_rank;
222  }
223 
224  MPI_Comm_split(communicators->all, bridgeid, key, &commSame);
225  MPI_Comm_size(commSame, &sizeSame);
226  MPI_Comm_rank(commSame, &rankSame);
227  baseid=bridgeid;
228 
229  /* communicator consists of all task working with the different I/O-nodes */
230  MPI_Comm_split(communicators->all, rankSame, bridgeid, &commDiff);
231  MPI_Comm_size(commDiff, &sizeDiff);
232  MPI_Comm_rank(commDiff, &rankDiff);
233 
234  communicators->ionode_number = rankDiff;
235  } else {
236  /* similar to MPIX_Pset_same_comm_create but with ION-id */
237  MPI_Comm commSameIOB;
238  int sizeSameIOB;
239  int rankSameIOB;
240  int ionid;
241 
242  /* only needed for ionid distribution */
243  key = communicators->all_rank;
244  MPI_Comm_split(communicators->all, bridgeid, key, &commSameIOB);
245  MPI_Comm_size(commSameIOB, &sizeSameIOB);
246  MPI_Comm_rank(commSameIOB, &rankSameIOB);
247 
248  if(communicators->all_rank==0) {
249  printf("partest_split_comm[%d]: after split commSameIOB deltaWt=%10.3fs\n",communicators->all_rank,MPI_Wtime()-starttime);
250  }
251 
252  if(rankSameIOB==0) {
253  /* ionid=MPIX_IO_node_id() & 0xFFFF; */
254  ionid=MPIX_IO_node_id();
255  }
256  MPI_Bcast(&ionid, 1, MPI_INT, 0, commSameIOB);
257 
258  if(communicators->all_rank==0) {
259  printf("partest_split_comm[%d]: after get ionid deltaWt=%10.3fs\n",communicators->all_rank,MPI_Wtime()-starttime);
260  }
261 
262  color = (int) ionid;
263  if(bluegene_sort==0) {
264  key = MPIX_IO_distance();
265  } else {
266  key = communicators->all_rank;
267  }
268  int rc=0;
269  rc=MPI_Comm_split(communicators->all,color,key,&commSame);
270  MPI_Comm_size(commSame, &sizeSame);
271  MPI_Comm_rank(commSame, &rankSame);
272  baseid=ionid;
273 
274  if(communicators->all_rank==0) {
275  printf("partest_split_comm[%d]: after split commSame deltaWt=%10.3fs\n",communicators->all_rank,MPI_Wtime()-starttime);
276  }
277 
278  /* distribute global rank of first task in samecomm */
279  if(rankSame==0) baserank=communicators->all_rank;
280  MPI_Bcast(&baserank, 1, MPI_INT, 0, commSame);
281 
282  if(communicators->all_rank==0) {
283  printf("partest_split_comm[%d]: after bcast baserank deltaWt=%10.3fs\n",communicators->all_rank,MPI_Wtime()-starttime);
284  }
285 
286  /* similar to MPIX_Pset_diff_comm_create but with ION-id, but without: ... *hw.ppn+hw.coreID */
287  color = rankSame;
288  key = baserank; /* rank of task 0 of samecomm in MPI_COMM_WORLD */
289  MPI_Comm_split(communicators->all,color,key,&commDiff);
290  MPI_Comm_size(commDiff, &sizeDiff);
291  MPI_Comm_rank(commDiff, &rankDiff);
292 
293  if(communicators->all_rank==0) {
294  printf("partest_split_comm[%d]: after split commDiff deltaWt=%10.3fs\n",communicators->all_rank,MPI_Wtime()-starttime);
295  }
296 
297  }
298 
299  /* printf("WF: %02d of %02d here rSame=%2d rDiff=%2d bg_np=%2d --> factor=%2d, bridgeid=%2d %s -> bridge(%d,%d,%d,%d,%d)\n",communicators->all_rank,communicators->all_size, rankSame, rankDiff, bluegene_np, factor, bridgeid,location, */
300  /* personality.Network_Config.cnBridge_A, personality.Network_Config.cnBridge_B, personality.Network_Config.cnBridge_C, personality.Network_Config.cnBridge_D, personality.Network_Config.cnBridge_E); */
301 
302  /* if -p not specified all proc will write! */
303  if (bluegene_np == 0) {
304  bluegene_np = sizeSame;
305  }
306 
307  /* Get a communicator with all writing tasks => new global communicator */
308  /* TODO: better to MPI_UNDEFINED when rankSame >= bluegene_np */
309  MPI_Comm_split(communicators->all, (rankSame < bluegene_np), communicators->all_rank, &communicators->work);
310  MPI_Comm_size(communicators->work, &communicators->work_size);
311  MPI_Comm_rank(communicators->work, &communicators->work_rank);
312  if (rankSame >= bluegene_np) {
313  /* not working task */
314  communicators->work_size = communicators->work_rank = -1;
315  communicators->local_size = communicators->local_rank = -1;
316  }
317  if(communicators->all_rank==0) {
318  printf("partest_split_comm[%d]: after split work deltaWt=%10.3fs\n",communicators->all_rank,MPI_Wtime()-starttime);
319  }
320 
321  /* If only one file will be used => dont split further */
322  /* if numfile > 1 sion will generate correct local communicator */
323  if (numfiles >= 1) {
324  communicators->local = communicators->work;
325  }
326  else if(numfiles<0) {
327  if(numfiles==-1) {
328 #ifdef SPLITCASCADE
329  /* Split the common communicator for each IO node to get a local comm with only the writing tasks for this IO Node */
330  MPI_Comm_split(commSame, (rankSame < bluegene_np), rankSame, &communicators->local);
331 #else
332  /* Split the common communicator for each IO node to get a local comm with only the writing tasks for this IO Node */
333  color=(rankSame < bluegene_np)?baseid:MPI_UNDEFINED;
334  MPI_Comm_split(communicators->all, color, rankSame, &communicators->local);
335 #endif
336  if (rankSame < bluegene_np) {
337  MPI_Comm_size(communicators->local, &communicators->local_size);
338  MPI_Comm_rank(communicators->local, &communicators->local_rank);
339  }
340 
341  if(communicators->all_rank==0) {
342  printf("partest_split_comm[%d]: after split local deltaWt=%10.3fs\n",communicators->all_rank,MPI_Wtime()-starttime);
343  }
344 
345  } else {
346  /* local communicator contains only one task per IO-node */
347  /* bluegene_np has to be 512 */
348  communicators->local=commDiff;
349  MPI_Comm_size(communicators->local, &communicators->local_size);
350  MPI_Comm_rank(communicators->local, &communicators->local_rank);
351  }
352  }
353 
354  /* determine filenumber */
355  if (numfiles < 1) {
356  /* one file per I/O-node */
357  if(numfiles==-1) communicators->file_number = rankDiff;
358  else communicators->file_number = rankSame;
359  }
360  else {
361  communicators->file_number = -1;
362  }
363 
364  if(communicators->all_rank==0) {
365  printf("partest_split_comm[%d]: before print verbose deltaWt=%10.3fs\n",communicators->all_rank,MPI_Wtime()-starttime);
366  }
367  /* print log message about location, ... */
368  if (verbose) {
369  sprintf(cbuffer, "");
370  if (rankSame < bluegene_np) {
371  sprintf(cbuffer, "BGQ[%05d] diff_comm: %4d of %4d same_comm: %5d of %5d file_comm: %5d of %5d %s bridge=%d, dist=%d\n",
372  communicators->all_rank, rankDiff + 1, sizeDiff, rankSame + 1, sizeSame, communicators->local_rank + 1,
373  communicators->local_size, location,isonbridge,dist_to_bridge);
374  }
375  collective_print_gather(cbuffer, communicators->work);
376  }
377  if(communicators->all_rank==0) {
378  printf("partest_split_comm[%d]: after print verbose/end deltaWt=%10.3fs\n",communicators->all_rank,MPI_Wtime()-starttime);
379  }
380 
381  } /* if (bluegene) */
382 #endif
383 
384 #ifdef _SION_BGL
385  if (bluegene) { /* order MPI-tasks by I/O-node */
386  _BGP_Personality_t personality;
387  MPI_Comm commSame, commDiff;
388  int sizeSame, sizeDiff;
389  int rankSame, rankDiff;
390  char location[BGLPERSONALITY_MAX_LOCATION];
391  unsigned x, y, z, t;
392  BGLPersonality personality;
393  char cbuffer[MAXCHARLEN];
394 
395  /* get location information */
396  rts_get_personality(&personality, sizeof(personality));
397  BGLPersonality_getLocationString(&personality, location);
398  x = BGLPersonality_xCoord(&personality),
399  y = BGLPersonality_yCoord(&personality), z = BGLPersonality_zCoord(&personality), t = rts_get_processor_id()
400 
401  /* task of communicator working with different I/O-nodes */
402  PMI_Pset_diff_comm_create(&commDiff);
403  MPI_Comm_size(commDiff, &sizeDiff);
404  MPI_Comm_rank(commDiff, &rankDiff);
405  *ionode_number = rankDiff;
406 
407  /* communicator consists of all task working with the same I/O-node */
408  PMI_Pset_same_comm_create(&commSame);
409  MPI_Comm_size(commSame, &sizeSame);
410  MPI_Comm_rank(commSame, &rankSame);
411 
412  /* if -p not specified all proc will write! */
413  if (bluegene_np == 0) {
414  bluegene_np = sizeSame;
415  }
416 
417  /* Get a communicator with all writing tasks => new global communicator */
418  MPI_Comm_split(communicators->all, (rankSame < bluegene_np), communicators->all_rank, &communicators->work);
419  MPI_Comm_size(communicators->work, &communicators->work_size);
420  MPI_Comm_rank(communicators->work, &communicators->work_rank);
421 
422 
423  /* If only one file will be used => dont split further */
424  if (numfiles == 1) {
425  communicators->local = communicators->work;
426  }
427  else {
428  /* Split the common communicator for each IO node to get a local comm with only the writing tasks for this IO Node */
429  MPI_Comm_split(commSame, (rankSame < bluegene_np), communicators->all_rank, &communicators->local);
430  }
431  MPI_Comm_size(communicators->local, &communicators->local_size);
432  MPI_Comm_rank(communicators->local, &communicators->local_rank);
433 
434  /* determine filenumber */
435  if (numfiles < 1) {
436  /* one file per I/O-node */
437  communicators->file_number = rankDiff;
438  }
439  else {
440  communicators->file_number = -1;
441  }
442 
443  /* print log message about location, ... */
444  sprintf(cbuffer, "");
445  if (rankSame < bluegene_np) {
446  if (verbose) {
447  sprintf(cbuffer, "BGL[%05d] diff_comm: %4d of %4d same_comm: %5d of %5d file_comm: %5d of %5d %s phys_xyzt(%ud,%ud,%ud,%ud)\n",
448  communicators->all_rank, rankDiff + 1, sizeDiff, rankSame + 1, sizeSame,
449  communicators->local_rank + 1, communicators->local_size, location, x, y, z, t);
450  }
451  }
452  collective_print_gather(cbuffer, communicators->work);
453 
454  }
455 #endif
456 
457 #ifdef _SION_AIX
458  /* no communicator adjustment */
459 #endif
460 
461 #ifdef _SION_DARWIN
462  if (verbose) {
463  char location[256];
464  gethostname(location, 256);
465  char cbuffer[MAXCHARLEN];
466  sprintf(cbuffer, "DARWIN[%03d] diff_comm: %4d of %4d same_comm: %4d of %4d file_comm: %4d of %4d %s\n",
467  communicators->all_rank, communicators->all_rank, communicators->all_size,
468  communicators->work_rank, communicators->work_size, communicators->local_rank, communicators->local_size, location);
469  collective_print_gather(cbuffer, communicators->all);
470  }
471 
472 #endif
473 
474 #if defined( _SION_LINUX) && (!defined(_SION_FX))
475  if (verbose) {
476  char location[256];
477  gethostname(location, 256);
478  char cbuffer[MAXCHARLEN];
479  sprintf(cbuffer, "LINUX[%03d] diff_comm: %4d of %4d same_comm: %4d of %4d file_comm: %4d of %4d %s\n",
480  communicators->all_rank, communicators->all_rank, communicators->all_size,
481  communicators->work_rank, communicators->work_size, communicators->local_rank, communicators->local_size, location);
482  collective_print_gather(cbuffer, communicators->all);
483  }
484 
485 #endif
486 
487 #ifdef _SION_AIX
488  if (verbose) {
489  char location[256];
490  gethostname(location, 256);
491  int sizeSame = 0, sizeDiff = 0;
492  char cbuffer[MAXCHARLEN];
493  sprintf(cbuffer, "AIX[%03d] diff_comm: %4d of %4d same_comm: %4d of %4d file_comm: %4d of %4d %s\n",
494  communicators->all_rank, communicators->all_rank, communicators->all_size,
495  communicators->work_rank, communicators->work_size, communicators->local_rank, communicators->local_size, location);
496  collective_print_gather(cbuffer, communicators->all);
497  }
498 
499 #endif
500 
501 #ifdef _SION_FX
502  if (bluegene) { /* order MPI-tasks by I/O-node */
503  int rank, x, y, z, a, b, c, rc;
504  char location[256];
505  char cbuffer[MAXCHARLEN];
506  int ionodeid;
507  MPI_Comm commSame, commDiff;
508  int sizeSame = 0, sizeDiff;
509  int rankSame = 0, rankDiff;
510 
511  gethostname(location, 256);
512  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
513  rc=FJMPI_Topology_sys_rank2xyzabc(rank, &x, &y, &z, &a, &b, &c);
514 
515  ionodeid=x * 65536 + y;
516 
517  if(bluegene>0) {
518  /* per ION */
519  /* communicator consists of all task working with the same I/O-node */
520  MPI_Comm_split(communicators->all, ionodeid, communicators->all_rank, &commSame);
521  MPI_Comm_size(commSame, &sizeSame);
522  MPI_Comm_rank(commSame, &rankSame);
523 
524  /* communicator consists of all task working with the different I/O-nodes */
525  MPI_Comm_split(communicators->all, rankSame, ionodeid, &commDiff);
526  MPI_Comm_size(commDiff, &sizeDiff);
527  MPI_Comm_rank(commDiff, &rankDiff);
528 
529  communicators->ionode_number = rankDiff;
530 
531  } else {
532  bluegene_np = sizeSame;
533  }
534 
535  /* Get a communicator with all writing tasks => new global communicator */
536  MPI_Comm_split(communicators->all, (rankSame < bluegene_np), communicators->all_rank, &communicators->work);
537  MPI_Comm_size(communicators->work, &communicators->work_size);
538  MPI_Comm_rank(communicators->work, &communicators->work_rank);
539  if (rankSame >= bluegene_np) {
540  /* not working task */
541  communicators->work_size = communicators->work_rank = -1;
542  communicators->local_size = communicators->local_rank = -1;
543  }
544 
545  /* If only one file will be used => dont split further */
546  /* if numfile > 1 sion will generate correct local communicator */
547  if (numfiles >= 1) {
548  communicators->local = communicators->work;
549  }
550  else if(numfiles<0) {
551  if(numfiles==-1) {
552  /* Split the common communicator for each IO node to get a local comm with only the writing tasks for this IO Node */
553  MPI_Comm_split(commSame, (rankSame < bluegene_np), communicators->all_rank, &communicators->local);
554  } else {
555  /* local communicator contains only one task per IO-node */
556  /* bluegene_np has to be 512 */
557  communicators->local=commDiff;
558  }
559  }
560  MPI_Comm_size(communicators->local, &communicators->local_size);
561  MPI_Comm_rank(communicators->local, &communicators->local_rank);
562 
563  /* determine filenumber */
564  if (numfiles < 1) {
565  /* one file per I/O-node */
566  if(numfiles==-1) communicators->file_number = rankDiff;
567  else communicators->file_number = rankSame;
568  }
569  else {
570  communicators->file_number = -1;
571  }
572 
573  /* print log message about location, ... */
574  sprintf(cbuffer, "");
575  if (rankSame < bluegene_np) {
576  sprintf(cbuffer, "FX[%03d] diff_comm: %4d of %4d same_comm: %4d of %4d file_comm: %4d of %4d %dx%dx%d %dx%dx%d %s ioid=%d ion=%d rc=%d numfiles=%d\n",
577 
578  communicators->all_rank, rankDiff, sizeDiff, rankSame, sizeSame,
579  communicators->local_rank, communicators->local_size,
580  x,y,z,a,b,c, location, ionodeid, communicators->ionode_number, rc, numfiles);
581  }
582  collective_print_gather(cbuffer, communicators->all);
583 
584  }
585 #endif
586 
587 
588 #ifdef _SION_XT
589  if (verbose) {
590  char location[256];
591  gethostname(location, 256);
592  int sizeSame = 0, sizeDiff = 0;
593  char cbuffer[MAXCHARLEN];
594  int rc,nid;
595  rca_mesh_coord_t xyz;
596 
597  rc=PMI_Get_nid(communicators->all_rank,&nid);
598 
599  rca_get_meshcoord(nid, &xyz);
600 
601  sprintf(cbuffer, "XT[%03d] diff_comm: %4d of %4d same_comm: %4d of %4d file_comm: %4d of %4d %s nid=%d phys_xyz(%d,%d,%d)\n",
602  communicators->all_rank, communicators->all_rank, communicators->all_size,
603  communicators->work_rank, communicators->work_size, communicators->local_rank, communicators->local_size, location,
604  nid, xyz.mesh_x,xyz.mesh_y,xyz.mesh_z);
605  collective_print_gather(cbuffer, communicators->all);
606  }
607 #endif
608 
609 
610  /* initial set of communicators not changed? */
611  if (communicators->work_size == -2) {
612  /* all task do work */
613  communicators->work = communicators->all;
614  MPI_Comm_size(communicators->work, &communicators->work_size);
615  MPI_Comm_rank(communicators->work, &communicators->work_rank);
616  }
617  /* local communicators */
618  if (communicators->local_size == -2) {
619  if (numfiles == 1) {
620  communicators->local = communicators->work;
621  }
622  /* set a default distribution on files, will be computed again by sion_open */
623  if (numfiles < 1) {
624  numfiles = communicators->work_size / 2;
625  if (numfiles == 0)
626  numfiles = 1;
627  }
628  proc_per_file = communicators->work_size / numfiles;
629 
630  /* remaining tasks are write/read to/from the last file */
631  if (communicators->work_rank >= (numfiles * proc_per_file)) {
632  communicators->file_number = numfiles - 1;
633  }
634  else {
635  communicators->file_number = communicators->work_rank / proc_per_file;
636  }
637 
638  MPI_Comm_split(communicators->work, communicators->file_number, communicators->all_rank, &communicators->local);
639 
640  MPI_Comm_size(communicators->local, &communicators->local_size);
641  MPI_Comm_rank(communicators->local, &communicators->local_rank);
642 
643  communicators->ionode_number = communicators->file_number;
644 
645  }
646 
647  /* shift working tasks */
648  if (communicators->work_size != -1) {
649  /* only if task in communicator work */
650  int newtasknr;
651  newtasknr=(communicators->work_rank+read_task_offset)%communicators->work_size;
652  MPI_Comm_split(communicators->work, 0, newtasknr, &communicators->workread);
653 
654  MPI_Comm_size(communicators->workread, &communicators->workread_size);
655  MPI_Comm_rank(communicators->workread, &communicators->workread_rank);
656  /* printf("WF: %d %d %% %d-> %d (%d %d)\n",
657  communicators->work_rank,read_task_offset,
658  communicators->work_size,newtasknr,
659  communicators->workread_rank,communicators->workread_size);*/
660 
661  } else {
662  /* this rtask will not be used for reading */
663  communicators->workread_size = communicators->workread_rank = -1;
664  communicators->local_size = communicators->local_rank = -1;
665  }
666 
667  return(1);
668 }
669