SIONlib  1.6.2
Scalable I/O library for parallel access to task-local files
ompi_partest.c
/****************************************************************************
**  SIONLIB     http://www.fz-juelich.de/jsc/sionlib                       **
*****************************************************************************
**  Copyright (c) 2008-2016                                                **
**  Forschungszentrum Juelich, Juelich Supercomputing Centre               **
**                                                                         **
**  See the file COPYRIGHT in the package base directory for details       **
****************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <mpi.h>
#include <time.h>
#include <math.h>

#include "sion_debug.h"
#include "sion_printts.h"
#include "sion.h"
#include "ompi_partest.h"
#include "partest_opts.h"

#include <omp.h>


#ifdef _SION_BGP
/* #include <mpix.h> */
#include <common/bgp_personality.h>
#include <common/bgp_personality_inlines.h>
#endif

#ifdef _SION_XT
#include <unistd.h>
#endif

#ifdef _SION_AIX
#include <unistd.h>
#endif

#ifdef _SION_LINUX
#include <unistd.h>
#endif

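/* Helper barriers for the hybrid MPI+OpenMP benchmark: the OpenMP master
 * thread of each MPI task enters the MPI_Barrier while all other threads wait
 * on an OpenMP barrier, so every thread of every task leaves the named
 * program phase together. */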
int barrier_after_start(MPI_Comm comm)
{
  #pragma omp master
  {
    MPI_Barrier(comm);
  }
  #pragma omp barrier
  return (1);
}

int barrier_after_malloc(MPI_Comm comm)
{
  #pragma omp master
  {
    MPI_Barrier(comm);
  }
  #pragma omp barrier

  return (1);
}

int barrier_after_open(MPI_Comm comm)
{
  #pragma omp master
  {
    MPI_Barrier(comm);
  }
  #pragma omp barrier
  return (1);
}

int barrier_after_write(MPI_Comm comm)
{
  #pragma omp master
  {
    MPI_Barrier(comm);
  }
  #pragma omp barrier
  return (1);
}

int barrier_after_read(MPI_Comm comm)
{
  #pragma omp master
  {
    MPI_Barrier(comm);
  }
  #pragma omp barrier
  return (1);
}

int barrier_after_close(MPI_Comm comm)
{
  #pragma omp master
  {
    MPI_Barrier(comm);
  }
  #pragma omp barrier
  return (1);
}

int barrier_before_unlink(MPI_Comm comm)
{
  #pragma omp master
  {
    MPI_Barrier(comm);
  }
  #pragma omp barrier
  return (1);
}

int barrier_after_unlink(MPI_Comm comm)
{
  #pragma omp master
  {
    MPI_Barrier(comm);
  }
  #pragma omp barrier
  return (1);
}

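/* collective_print_gather(): every OpenMP thread copies its message into a
 * shared per-task buffer; the master thread then gathers all per-task buffers
 * onto MPI rank 0, which prints them to stderr. */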
static char * __collective_print_pointer;

int collective_print_gather(char *cbuffer, MPI_Comm comm )
{
  int rank, size, p;
  char *lbuffer;
  int num_threads = omp_get_num_threads();

  #pragma omp barrier

  #pragma omp master
  {
    __collective_print_pointer = (char *) malloc(MAXCHARLEN * num_threads);
  }

  #pragma omp barrier

  memcpy(__collective_print_pointer+(MAXCHARLEN * omp_get_thread_num()),cbuffer,MAXCHARLEN);

  #pragma omp barrier

  #pragma omp master
  {
    MPI_Comm_size(comm, &size);
    MPI_Comm_rank(comm, &rank);
    if(rank==0) lbuffer = (char *) malloc(MAXCHARLEN * num_threads * size);
    else lbuffer = NULL;

    MPI_Gather(__collective_print_pointer, MAXCHARLEN*num_threads, MPI_CHAR, lbuffer, MAXCHARLEN*num_threads, MPI_CHAR, 0, comm);

    if (rank == 0) {
      for (p = 0; p < (size*num_threads); p++) {
        fprintf(stderr, "%s", lbuffer + p * MAXCHARLEN);
      }
    }

    if(rank==0) free(lbuffer);

    free(__collective_print_pointer);
  }
  #pragma omp barrier

  return (1);
}

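/* reduce_omp(): reduces one value per OpenMP thread into a single result on
 * the master thread. Each thread stores its value in a shared array; the
 * master then applies MPI_SUM or MPI_MAX semantics over the thread values for
 * the requested partest data type (no MPI communication is involved). */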
static void * __thread_sync_pointer;

void reduce_omp(void *syncdata, void * out, MPI_Op op, int dtype)
{
  int thread_num = omp_get_thread_num();
  int num_threads = omp_get_num_threads();
  {
    #pragma omp barrier
  }
  #pragma omp master
  {
    switch (dtype) {
    case _PARTEST_SION_INT32:
      {
        __thread_sync_pointer = malloc(sizeof(sion_int32)*num_threads);
      }
      break;
    case _PARTEST_SION_INT64:
      {
        __thread_sync_pointer = malloc(sizeof(sion_int64)*num_threads);
      }
      break;
    case _PARTEST_DOUBLE:
      {
        __thread_sync_pointer = malloc(sizeof(double)*num_threads);
      }
      break;
    default:
      {
        __thread_sync_pointer = malloc(sizeof(sion_int64)*num_threads);
      }
      break;
    }
  }
  {
    #pragma omp barrier
  }
  switch (dtype) {
  case _PARTEST_SION_INT32:
    {
      ((sion_int32 *)__thread_sync_pointer)[thread_num] = *(sion_int32*)syncdata;
    }
    break;
  case _PARTEST_SION_INT64:
    {
      ((sion_int64 *)__thread_sync_pointer)[thread_num] = *((sion_int64*)syncdata);
    }
    break;
  case _PARTEST_DOUBLE:
    {
      ((double *)__thread_sync_pointer)[thread_num] = *(double*)syncdata;
    }
    break;
  default:
    {
      ((sion_int64 *)__thread_sync_pointer)[thread_num] = *(sion_int64*)syncdata;
    }
    break;
  }
  {
    #pragma omp barrier
  }

  #pragma omp master
  {
    int i;
    switch (dtype) {
    case _PARTEST_SION_INT32:
      {
        if(op == MPI_SUM){
          *((sion_int32 *) out) = 0;
          for(i=0;i<num_threads;i++){
            *((sion_int32 *) out) += ((sion_int32 *)__thread_sync_pointer)[i];
          }
        }else if(op == MPI_MAX){
          *((sion_int32 *) out) = ((sion_int32 *)__thread_sync_pointer)[0];
          for(i=1;i<num_threads;i++){
            if(((sion_int32 *)__thread_sync_pointer)[i] > *((sion_int32 *) out)) *((sion_int32 *) out) = ((sion_int32 *)__thread_sync_pointer)[i];
          }
        }
      }
      break;
    case _PARTEST_SION_INT64:
      {
        if(op == MPI_SUM){
          *((sion_int64 *) out) = 0;
          for(i=0;i<num_threads;i++){
            *((sion_int64 *) out) += ((sion_int64 *)__thread_sync_pointer)[i];
          }
        }else if(op == MPI_MAX){
          *((sion_int64 *) out) = ((sion_int64 *)__thread_sync_pointer)[0];
          for(i=1;i<num_threads;i++){
            if(((sion_int64 *)__thread_sync_pointer)[i] > *((sion_int64 *) out)) *((sion_int64 *) out) = ((sion_int64 *)__thread_sync_pointer)[i];
          }
        }
      }
      break;
    case _PARTEST_DOUBLE:
      {
        if(op == MPI_SUM){
          *((double *) out) = 0;
          for(i=0;i<num_threads;i++){
            *((double *) out) += ((double *)__thread_sync_pointer)[i];
          }
        }else if(op == MPI_MAX){
          *((double *) out) = ((double *)__thread_sync_pointer)[0];
          for(i=1;i<num_threads;i++){
            if(((double *)__thread_sync_pointer)[i] > *((double *) out)) *((double *) out) = ((double *)__thread_sync_pointer)[i];
          }
        }
      }
      break;
    default:
      {
        if(op == MPI_SUM){
          *((sion_int64 *) out) = 0;
          for(i=0;i<num_threads;i++){
            *((sion_int64 *) out) += ((sion_int64 *)__thread_sync_pointer)[i];
          }
        }else if(op == MPI_MAX){
          *((sion_int64 *) out) = ((sion_int64 *)__thread_sync_pointer)[0];
          for(i=1;i<num_threads;i++){
            if(((sion_int64 *)__thread_sync_pointer)[i] > *((sion_int64 *) out)) *((sion_int64 *) out) = ((sion_int64 *)__thread_sync_pointer)[i];
          }
        }
      }
      break;
    }
    free(__thread_sync_pointer);
  }
  #pragma omp barrier
}

int split_communicator(_test_communicators * communicators, int bluegene, int bluegene_np, int numfiles, int read_task_offset, int verbose);

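/* main(): parses and distributes the benchmark options, derives the work and
 * local communicators, then lets every OpenMP thread of every MPI task run
 * the selected write/read test (sion OMPI or OMP variant) on its own buffer. */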
int main(int argc, char **argv)
{
  int rank, size, rc;
  char *localbuffer;
  time_t t;
  sion_int64 commwork_size64 = 1;

  /* communicators */
  _test_communicators communicators;

  /* options */
  _test_options options;

  MPI_Init(&argc, &argv);
  MPI_Comm_size(MPI_COMM_WORLD, &size);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  /* */ DPRINTFTS(rank, "after MPI_Init");

  /* srand ( time(NULL)*rank ); */

  /* printf("starting partest %02d of %02d\n", rank,size); */

  init_options(&options);

  if (rank == 0) {
#ifdef _SION_AIX
    rc = parse_options_std(argc, argv, &options);
    if(rc==0) {
      usage(argv[0]);
    }
#else
    rc = parse_options_long(argc, argv, &options);
    if(rc==0) {
      usage_long(argv[0]);
    }
#endif
  }
  MPI_Bcast(&rc, 1, MPI_INT, 0, MPI_COMM_WORLD);
  if(rc==0) {
    MPI_Abort(MPI_COMM_WORLD, 1);
  }

  distribute_options_mpi(&options);


  /* adjust communicators */
  communicators.all = MPI_COMM_WORLD;
  MPI_Comm_size(MPI_COMM_WORLD, &communicators.all_size);
  MPI_Comm_rank(MPI_COMM_WORLD, &communicators.all_rank);
  split_communicator(&communicators, options.bluegene, options.bluegene_np, options.numfiles, options.read_task_offset, options.verbose);

  /* determine global and local size of data to be written and read */
  sion_int64 num_threads = (sion_int64) omp_get_max_threads();
  MPI_Allreduce(&num_threads, &commwork_size64, 1, SION_MPI_INT64, MPI_SUM, communicators.work);

  if (options.globalsize > 0) {
    options.totalsize = (sion_int64) options.globalsize / commwork_size64;
  }
  else {
    options.globalsize = options.totalsize * commwork_size64;
  }

  if((options.totalsize>options.bufsize) || (options.read_task_offset>0) || (options.do_write==0)) {
    options.suppress_checksum=1;
  }

  if(options.fsblksize<0) options.fsblksize=-1;

  if ( (communicators.work_size>0) && (communicators.work_rank==0) ) {
    time(&t);
    fprintf(stderr, "------------------------------------------------------------------------------------------\n");
    fprintf(stderr, "SION parallel file I/O benchmark 'ompi_partest': start at %s", ctime(&t));
    fprintf(stderr, "partest: number of MPI tasks that will use the file: running on %d tasks\n", size);
    fprintf(stderr, "------------------------------------------------------------------------------------------\n");
#ifndef CHECKSUM
    fprintf(stderr, "partest parameter: CHECKSUM DISABLED!\n\n");
#else
    if(options.suppress_checksum) {
      fprintf(stderr, "partest parameter: CHECKSUM not possible, DISABLED!\n\n");
    }
#endif
    fprintf(stderr, "partest parameter: (-f) datafile = %s\n", options.filename);
    fprintf(stderr, "partest parameter: (-n) number of files = %d\n", options.numfiles);
    fprintf(stderr, "partest parameter: (-F) random factor = %13.4f\n", options.factor);
    fprintf(stderr, "partest parameter: (-X) remove files after test = %d\n", options.unlink_files);
    fprintf(stderr, "partest parameter: (-b/-B) local buffer size / sion task = %15lld bytes %10.3f MB\n", options.bufsize, options.bufsize / (1.0 MB));
    fprintf(stderr, "partest parameter: (-g/-G) global total data size = %15lld bytes %10.3f GB\n", options.globalsize, options.globalsize / (1024.0 MB));
    fprintf(stderr, "partest parameter: (-s/-S) total data size / sion task = %15lld bytes %10.3f MB\n", options.totalsize, options.totalsize / (1.0 MB));
    fprintf(stderr, "partest parameter: (-r/-R) sion chunk size = %15lld bytes %10.3f MB\n", options.chunksize, options.chunksize / (1.0 MB));
    fprintf(stderr, "partest parameter: (-Q) fs block size = %15d bytes %10.3f MB\n", options.fsblksize, options.fsblksize / (1.0 MB));
    if (options.type == 0)
      fprintf(stderr, "partest parameter: (-T) test type = %d (sion OMPI, collective read)\n", options.type);
    if (options.type == 1)
      fprintf(stderr, "partest parameter: (-T) test type = %d (sion OMPI, independent read)\n", options.type);
    if (options.type == 2)
      fprintf(stderr, "partest parameter: (-T) test type = %d (sion OMP, collective read)\n", options.type);
    if (options.type == 3)
      fprintf(stderr, "partest parameter: (-T) test type = %d (sion OMP, independent read)\n", options.type);
    fprintf(stderr, "partest parameter: (-j) serialize_blocknum = %d\n", options.serialize_blocknum);
    fprintf(stderr, "partest parameter: (-Z) read task offset = %d\n", options.read_task_offset);
    fprintf(stderr, "partest parameter: (-o) start offset bytes = %d\n", options.startoffset);
    fprintf(stderr, "partest parameter: (-v) verbose = %d\n", options.verbose);
    fprintf(stderr, "partest parameter: (-d) debug = %d\n", options.debug);
    fprintf(stderr, "partest parameter: (-D) Debug = %d\n", options.Debug);
    fprintf(stderr, "partest parameter: (-M) collective write = %d\n", options.collectivewrite);
    fprintf(stderr, "partest parameter: (-m) collective read = %d\n", options.collectiveread);


    fprintf(stderr, "partest parameter: (-P) Blue Gene, I/O nodes = %d\n", options.bluegene);
    fprintf(stderr, "partest parameter: () Blue Gene: tasks/IO-node = %d\n", options.bluegene_np);
    fprintf(stderr, "partest parameter: (-w) MPI-IO, IBM, Large Block IO = %d\n", options.mpiio_lb);
    fprintf(stderr, "partest parameter: (-W) MPI-IO, IBM, IO bufsize = %d KB\n", options.mpiio_bs);
    fprintf(stderr, "partest parameter: (-x) MPI-IO, IBM, sparse access = %d\n", options.mpiio_sa);
    fprintf(stderr, "partest parameter: ( ) OpenMP number of threads = %d\n", omp_get_max_threads());
    fprintf(stderr, "partest parameter: ( ) commwork_size64 = %lld\n", commwork_size64);
    fprintf(stderr, "partest parameter: ( ) suppress_checksum = %d\n", options.suppress_checksum);
    fprintf(stderr, "partest parameter: ( ) do_write = %d\n", options.do_write);
    fprintf(stderr, "partest parameter: ( ) do_read = %d\n", options.do_read);
    fprintf(stderr, "partest parameter: ( ) use_posix = %d\n", options.use_posix);

  }

  barrier_after_start(MPI_COMM_WORLD);

  if ( (communicators.work_size>0) && (communicators.work_rank==0) ) {
    fprintf(stderr, "partest parameter: ( ) comm(all) = %d of %d\n", communicators.all_rank, communicators.all_size);
    fprintf(stderr, "partest parameter: ( ) comm(work) = %d of %d\n", communicators.work_rank, communicators.work_size);
    fprintf(stderr, "partest parameter: ( ) comm(local) = %d of %d\n", communicators.local_rank, communicators.local_size);
    fprintf(stderr, "------------------------------------------------------------------------------------------\n");
  }
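  /* Each OpenMP thread works on its own copy of the options and communicators
   * and on its own local buffer; the selected test routine is then executed
   * by all threads of all MPI tasks in parallel. */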
  #pragma omp parallel private(localbuffer,t)
  {
    barrier_after_start(MPI_COMM_WORLD);
    char l_filename[MAXCHARLEN];
    strcpy(l_filename,options.filename);

    _test_communicators local_communicators = communicators;
    _test_options local_options = options;
    /* */ DPRINTFTS(rank, "after pstart");
    /* Init the local buffer that will be written to the file */
    localbuffer = (char *) malloc(options.bufsize);

    srand(time(NULL)*local_communicators.work_rank*omp_get_thread_num());
    /* for (i = 0; i < (options.bufsize / sizeof(int)); i++) */
    /*   localbuffer[i] = (char) rand() % 256; */

    /* memset (localbuffer, 'a'+rank%26, bufsize); */
    memset (localbuffer, 'a'+rank%26, local_options.bufsize);
    barrier_after_malloc(MPI_COMM_WORLD);
    /* */ DPRINTFTS(rank, "after malloc");

    /* random factor handling */

    if(local_options.factor>0.0) {
      if((local_options.collectivewrite) || (local_options.collectiveread)) {
        if(local_options.bufsize<local_options.totalsize*(1+local_options.factor)) {
          #pragma omp master
          {
            fprintf(stderr, "partest: ERROR: possible deadlock if collective read/write and the random factor are used while the buffer is too small, aborting\n");
            MPI_Abort(MPI_COMM_WORLD,0);
          }
          #pragma omp barrier
          exit(0);
        }
      }

      local_options.totalsize += ((sion_int64) (local_options.factor * (sion_int64) local_options.totalsize * (sion_int64) rand() / (sion_int64) RAND_MAX));
      local_options.chunksize += ((sion_int64) (local_options.factor * (sion_int64) local_options.totalsize * (sion_int64) rand() / (sion_int64) RAND_MAX));
      fprintf(stderr, "partest parameter: ( ) new totalsize[%4d,t%4d] = %lld\n", local_communicators.work_rank,omp_get_thread_num(),local_options.totalsize);
    }


    /* */ DPRINTFTS(rank, "before scall2");
    #pragma omp master
    {
      if ( (local_communicators.work_size>0) && (local_communicators.work_rank==0) ) {
        fprintf(stderr, "partest parameter: ( ) new totalsize = %lld\n", local_options.totalsize);
      }
    }
    #pragma omp barrier
    barrier_after_malloc(MPI_COMM_WORLD);
    if (local_options.type == 0) {
      local_options.collectiveopenforread = 1;
      test_paropen_multi_ompi(l_filename, localbuffer, &local_communicators, &local_options);

    }else if(local_options.type == 1) {
      local_options.collectiveopenforread = 0;
      test_paropen_multi_ompi(l_filename, localbuffer, &local_communicators, &local_options);
    }else if(local_options.type == 2) {
      local_options.collectiveopenforread = 1;
      test_paropen_omp(l_filename, localbuffer, &local_communicators, &local_options);
    }else if(local_options.type == 3) {
      local_options.collectiveopenforread = 0;
      test_paropen_omp(l_filename, localbuffer, &local_communicators, &local_options);
    }

    /* */ DPRINTFTS(rank, "before MPI_Finalize");
    barrier_after_malloc(MPI_COMM_WORLD);
    #pragma omp master
    {
      if ( (local_communicators.work_size>0) && (local_communicators.work_rank==0) ) {
        time(&t);
        fprintf(stderr, "SION parallel file I/O benchmark 'ompi_partest': end at %s\n", ctime(&t));
      }
    }
    #pragma omp barrier
  }

  MPI_Finalize();
  return (0);
}

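/* split_communicator(): derives the communicators used by the benchmark from
 * the global communicator: 'work' contains all tasks that actually write and
 * read, 'local' groups the tasks that share one physical file, and 'workread'
 * is a shifted copy of 'work' so data can be read back by a different task
 * than the one that wrote it. On Blue Gene systems tasks are additionally
 * grouped by I/O node. */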
int split_communicator(_test_communicators * communicators, int bluegene, int bluegene_np, int numfiles, int read_task_offset, int verbose)
{
  int proc_per_file;

  communicators->work_size = communicators->work_rank = -2;
  communicators->local_size = communicators->local_rank = -2;


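  /* Platform-specific grouping: on Blue Gene systems the MPI tasks are
   * ordered by the I/O node they are attached to, so files can be assigned
   * per I/O node. */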
#ifdef _SION_BGP
  if (bluegene) {               /* order MPI-tasks by I/O-node */
    _BGP_Personality_t personality;
    MPI_Comm commSame, commDiff, commTemp;
    int sizeSame, sizeDiff;
    int rankSame, rankDiff;
    char location[128];
    unsigned procid, x, y, z, t;
    char cbuffer[MAXCHARLEN];

    /* get location information */
    Kernel_GetPersonality(&personality, sizeof(personality));
    BGP_Personality_getLocationString(&personality, location);
    procid = Kernel_PhysicalProcessorID();
    MPIX_rank2torus(communicators->all_rank, &x, &y, &z, &t);

    /* task of communicator working with different I/O-nodes */
    MPIX_Pset_diff_comm_create(&commDiff);
    MPI_Comm_size(commDiff, &sizeDiff);
    MPI_Comm_rank(commDiff, &rankDiff);
    communicators->ionode_number = rankDiff;

    /* communicator consists of all task working with the same I/O-node */
    MPIX_Pset_same_comm_create(&commSame);
    MPI_Comm_size(commSame, &sizeSame);
    MPI_Comm_rank(commSame, &rankSame);

    /* if -p not specified all proc will write! */
    if (bluegene_np == 0) {
      bluegene_np = sizeSame;
    }

    /* Get a communicator with all writing tasks => new global communicator */
    MPI_Comm_split(communicators->all, (rankSame < bluegene_np), communicators->all_rank, &communicators->work);
    MPI_Comm_size(communicators->work, &communicators->work_size);
    MPI_Comm_rank(communicators->work, &communicators->work_rank);
    if (rankSame >= bluegene_np) {
      /* not working task */
      communicators->work_size = communicators->work_rank = -1;
      communicators->local_size = communicators->local_rank = -1;
    }

    /* If only one file will be used => dont split further */
    /* if numfile > 1 sion will generate correct local communicator */
    if (numfiles >= 1) {
      communicators->local = communicators->work;
    }
    else if(numfiles<0) {
      if(numfiles==-1) {
        /* Split the common communicator for each IO node to get a local comm with only the writing tasks for this IO Node */
        MPI_Comm_split(commSame, (rankSame < bluegene_np), communicators->all_rank, &communicators->local);
      } else {
        /* local communicator contains only one task per IO-node */
        /* bluegene_np has to be 512 */
        communicators->local=commDiff;
      }
    }
    MPI_Comm_size(communicators->local, &communicators->local_size);
    MPI_Comm_rank(communicators->local, &communicators->local_rank);

    /* determine filenumber */
    if (numfiles < 1) {
      /* one file per I/O-node */
      if(numfiles==-1) communicators->file_number = rankDiff;
      else communicators->file_number = rankSame;
    }
    else {
      communicators->file_number = -1;
    }

    /* print log message about location, ... */
    sprintf(cbuffer, "");
    if (rankSame < bluegene_np) {
      if (verbose) {
        sprintf(cbuffer, "BGP[%05d] diff_comm: %4d of %4d same_comm: %5d of %5d file_comm: %5d of %5d %s phys_xyzt(%d,%d,%d,%d)\n",
                communicators->all_rank, rankDiff + 1, sizeDiff, rankSame + 1, sizeSame, communicators->local_rank + 1, communicators->local_size,
                location, x, y, z, t);
      }
    }
    collective_print_gather(cbuffer, communicators->work);

  }
#endif

#ifdef _SION_BGL
  if (bluegene) {               /* order MPI-tasks by I/O-node */
    MPI_Comm commSame, commDiff, commTemp;
    int sizeSame, sizeDiff;
    int rankSame, rankDiff;
    char location[BGLPERSONALITY_MAX_LOCATION];
    unsigned procid, x, y, z, t;
    BGLPersonality personality;
    char cbuffer[MAXCHARLEN];

    /* get location information */
    rts_get_personality(&personality, sizeof(personality));
    BGLPersonality_getLocationString(&personality, location);
    x = BGLPersonality_xCoord(&personality);
    y = BGLPersonality_yCoord(&personality);
    z = BGLPersonality_zCoord(&personality);
    t = rts_get_processor_id();

    /* task of communicator working with different I/O-nodes */
    PMI_Pset_diff_comm_create(&commDiff);
    MPI_Comm_size(commDiff, &sizeDiff);
    MPI_Comm_rank(commDiff, &rankDiff);
    communicators->ionode_number = rankDiff;

    /* communicator consists of all task working with the same I/O-node */
    PMI_Pset_same_comm_create(&commSame);
    MPI_Comm_size(commSame, &sizeSame);
    MPI_Comm_rank(commSame, &rankSame);

    /* if -p not specified all proc will write! */
    if (bluegene_np == 0) {
      bluegene_np = sizeSame;
    }

    /* Get a communicator with all writing tasks => new global communicator */
    MPI_Comm_split(communicators->all, (rankSame < bluegene_np), communicators->all_rank, &communicators->work);
    MPI_Comm_size(communicators->work, &communicators->work_size);
    MPI_Comm_rank(communicators->work, &communicators->work_rank);


    /* If only one file will be used => dont split further */
    if (numfiles == 1) {
      communicators->local = communicators->work;
    }
    else {
      /* Split the common communicator for each IO node to get a local comm with only the writing tasks for this IO Node */
      MPI_Comm_split(commSame, (rankSame < bluegene_np), communicators->all_rank, &communicators->local);
    }
    MPI_Comm_size(communicators->local, &communicators->local_size);
    MPI_Comm_rank(communicators->local, &communicators->local_rank);

    /* determine filenumber */
    if (numfiles < 1) {
      /* one file per I/O-node */
      communicators->file_number = rankDiff;
    }
    else {
      communicators->file_number = -1;
    }

    /* print log message about location, ... */
    sprintf(cbuffer, "");
    if (rankSame < bluegene_np) {
      if (verbose) {
        sprintf(cbuffer, "BGL[%05d] diff_comm: %4d of %4d same_comm: %5d of %5d file_comm: %5d of %5d %s phys_xyzt(%d,%d,%d,%d)\n",
                communicators->all_rank, rankDiff + 1, sizeDiff, rankSame + 1, sizeSame,
                communicators->local_rank + 1, communicators->local_size, location, x, y, z, t);
      }
    }
    collective_print_gather(cbuffer, communicators->work);

  }
#endif

#ifdef _SION_AIX
  /* no communicator adjustment */
#endif

  /* initial set of communicators */
  if (communicators->work_size == -2) {
    /* all task do work */
    communicators->work = communicators->all;
    MPI_Comm_size(communicators->work, &communicators->work_size);
    MPI_Comm_rank(communicators->work, &communicators->work_rank);
  }
  /* local communicators */
  if (communicators->local_size == -2) {
    if (numfiles == 1) {
      communicators->local = communicators->work;
    }
    /* set a default distribution on files, will be computed again by sion_open */
    if (numfiles < 1) {
      numfiles = communicators->work_size / 2;
      if (numfiles == 0)
        numfiles = 1;
    }
    proc_per_file = communicators->work_size / numfiles;

    /* remaining tasks are write/read to/from the last file */
    if (communicators->work_rank >= (numfiles * proc_per_file)) {
      communicators->file_number = numfiles - 1;
    }
    else {
      communicators->file_number = communicators->work_rank / proc_per_file;
    }

    MPI_Comm_split(communicators->work, communicators->file_number, communicators->all_rank, &communicators->local);

    MPI_Comm_size(communicators->local, &communicators->local_size);
    MPI_Comm_rank(communicators->local, &communicators->local_rank);

    communicators->ionode_number = communicators->file_number;

  }

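  /* verbose location report: each task formats one line with its host name
   * and communicator ranks, which collective_print_gather() prints on rank 0 */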
#ifdef _SION_LINUX
  if (verbose) {
    char location[256];
    gethostname(location, 256);
    char cbuffer[MAXCHARLEN];
    sprintf(cbuffer, "LINUX[%03d] diff_comm: %4d of %4d same_comm: %4d of %4d file_comm: %4d of %4d %s\n",
            communicators->all_rank, communicators->all_rank, communicators->all_size,
            communicators->work_rank, communicators->work_size, communicators->local_rank, communicators->local_size, location);
    collective_print_gather(cbuffer, communicators->all);
  }

#endif

  /* _XT TODO */
#ifdef _SION_AIX
  if (verbose) {
    char location[256];
    gethostname(location, 256);
    int sizeSame = 0, sizeDiff = 0;
    int rankSame, rankDiff;
    char cbuffer[MAXCHARLEN];
    sprintf(cbuffer, "AIX[%03d] diff_comm: %4d of %4d same_comm: %4d of %4d file_comm: %4d of %4d %s\n",
            communicators->all_rank, communicators->all_rank, communicators->all_size,
            communicators->work_rank, communicators->work_size, communicators->local_rank, communicators->local_size, location);
    collective_print_gather(cbuffer, communicators->all);
  }

#endif

#ifdef _SION_XT
  if (verbose) {
    char location[256];
    gethostname(location, 256);
    int sizeSame = 0, sizeDiff = 0;
    int rankSame, rankDiff;
    char cbuffer[MAXCHARLEN];
    sprintf(cbuffer, "XT[%03d] diff_comm: %4d of %4d same_comm: %4d of %4d file_comm: %4d of %4d %s\n",
            communicators->all_rank, communicators->all_rank, communicators->all_size,
            communicators->work_rank, communicators->work_size, communicators->local_rank, communicators->local_size, location);
    collective_print_gather(cbuffer, communicators->all);
  }
#endif

  /* shift working tasks */
  if (communicators->work_size != -1) {
    /* only if task is in communicator work */
    int newtasknr;
    newtasknr=(communicators->work_rank+read_task_offset)%communicators->work_size;
    MPI_Comm_split(communicators->work, 0, newtasknr, &communicators->workread);

    MPI_Comm_size(communicators->workread, &communicators->workread_size);
    MPI_Comm_rank(communicators->workread, &communicators->workread_rank);
    /* printf("WF: %d %d %% %d-> %d (%d %d)\n",
              communicators->work_rank,read_task_offset,
              communicators->work_size,newtasknr,
              communicators->workread_rank,communicators->workread_size); */

  } else {
    /* this task will not be used for reading */
    communicators->workread_size = communicators->workread_rank = -1;
    communicators->local_size = communicators->local_rank = -1;
  }

  return(1);
}