/*
 * Project headers, followed by the family of named barrier hooks
 * (barrier_after_start ... barrier_after_unlink).
 *
 * Only the signatures are visible in this listing: each hook takes an MPI
 * communicator and returns int.  The bodies are not shown here, so nothing
 * beyond the signature is documented.
 */
21 #include "ompi_partest.h"    22 #include "partest_opts.h"    29 #include <common/bgp_personality.h>    30 #include <common/bgp_personality_inlines.h>    55 int barrier_after_start(MPI_Comm comm)
    65 int barrier_after_malloc(MPI_Comm comm)
    76 int barrier_after_open(MPI_Comm comm)
    86 int barrier_after_write(MPI_Comm comm)
    96 int barrier_after_read(MPI_Comm comm)
   106 int barrier_after_close(MPI_Comm comm)
   116 int barrier_before_unlink(MPI_Comm comm)
   126 int barrier_after_unlink(MPI_Comm comm)
   /*
    * collective_print_gather: collect one MAXCHARLEN-byte text record per
    * OpenMP thread from every rank of comm and print the records to stderr.
    *
    * cbuffer  source record; exactly MAXCHARLEN bytes are copied from it, so it
    *          must be at least that large.
    * comm     communicator used for the gather (root is rank 0).
    *
    * The return statement and the OpenMP directives that decide which thread
    * allocates, gathers, prints and frees are not visible in this listing.
    */
   /* File-scope staging area shared by the threads: one record slot per thread. */
   136 static char * __collective_print_pointer;
   138 int collective_print_gather(
char *cbuffer, MPI_Comm comm )
   142   int       num_threads = omp_get_num_threads();
   /* NOTE(review): the malloc results below are not checked in the visible lines. */
   148           __collective_print_pointer = (
char *) malloc(MAXCHARLEN * num_threads);
   /* Each thread writes its record into the slot indexed by its thread number. */
   153      memcpy(__collective_print_pointer+(MAXCHARLEN * omp_get_thread_num()),cbuffer,MAXCHARLEN);
   159       MPI_Comm_size(comm, &size);
   160       MPI_Comm_rank(comm, &rank);
   /* The receive buffer is only allocated on rank 0, the gather root. */
   161       if(rank==0) lbuffer = (
char *) malloc(MAXCHARLEN * num_threads * size);
   164       MPI_Gather(__collective_print_pointer, MAXCHARLEN*num_threads, MPI_CHAR, lbuffer, MAXCHARLEN*num_threads, MPI_CHAR, 0, comm);
   /* Print every gathered record (size * num_threads of them) as a C string. */
   /* Presumably guarded by a rank == 0 check that is not visible here -- confirm. */
   169         for (p = 0; p < (size*num_threads); p++) {
   171           fprintf(stderr, 
"%s", lbuffer + p * MAXCHARLEN);
   175       if(rank==0) free(lbuffer);
   177       free(__collective_print_pointer);
   /*
    * reduce_omp: combine one value per OpenMP thread into *out.
    *
    * syncdata  points to the calling thread's input value.
    * out       receives the combined value.
    * op        MPI_MAX selects the maximum over all slots; the other visible
    *           branch zeroes *out and adds every slot (its condition is not
    *           visible in this listing, presumably op == MPI_SUM -- confirm).
    * dtype     selects the element type: _PARTEST_SION_INT32,
    *           _PARTEST_SION_INT64 or _PARTEST_DOUBLE.  The sion_int64 code that
    *           follows each double case without a visible label is presumably
    *           the default branch -- confirm.
    *
    * No MPI call appears in the visible lines; op is only compared against
    * MPI_MAX.  The OpenMP directives that decide which thread allocates,
    * reduces and frees the scratch array are not visible here.
    */
   /* File-scope scratch array shared by the threads: one slot per thread. */
   194 static void * __thread_sync_pointer;
   196 void reduce_omp(
void *syncdata, 
void * out, MPI_Op op, 
int dtype)
   198     int thread_num = omp_get_thread_num();
   199     int num_threads = omp_get_num_threads();
   /* Step 1: allocate num_threads slots of the element type selected by dtype. */
   /* NOTE(review): the malloc results are not checked in the visible lines. */
   206           case _PARTEST_SION_INT32:
   208                   __thread_sync_pointer = malloc(
sizeof(sion_int32)*num_threads);
   211           case _PARTEST_SION_INT64:
   213                   __thread_sync_pointer = malloc(
sizeof(sion_int64)*num_threads);
   216           case _PARTEST_DOUBLE:
   218                   __thread_sync_pointer = malloc(
sizeof(
double)*num_threads);
   223                   __thread_sync_pointer = malloc(
sizeof(sion_int64)*num_threads);
   /* Step 2: every thread stores its own value in the slot indexed by thread_num. */
   232         case _PARTEST_SION_INT32:
   234                 ((sion_int32 *)__thread_sync_pointer)[thread_num] = * (sion_int32*)syncdata;
   237         case _PARTEST_SION_INT64:
   239                 ((sion_int64 *)__thread_sync_pointer)[thread_num] = *((sion_int64*)syncdata);
   242         case _PARTEST_DOUBLE:
   244                 ((
double *)__thread_sync_pointer)[thread_num] = *(
double*)syncdata;
   249                 ((sion_int64 *)__thread_sync_pointer)[thread_num] = *(sion_int64*)syncdata;
   /* Step 3: fold all slots into *out, either as a sum or as a maximum. */
   261             case _PARTEST_SION_INT32:
   264                         *((sion_int32 *) out) = 0;
   265                         for(i=0;i<num_threads;i++){
   266                             *((sion_int32 *) out) += ((sion_int32 *)__thread_sync_pointer)[i];
   268                     }
else if(op == MPI_MAX){
   /* Maximum: start from slot 0 and compare against the remaining slots. */
   269                         *((sion_int32 *) out) = ((sion_int32 *)__thread_sync_pointer)[0];
   270                         for(i=1;i<num_threads;i++){
   271                             if(((sion_int32 *)__thread_sync_pointer)[i] > *((sion_int32 *) out)) *((sion_int32 *) out) = ((sion_int32 *)__thread_sync_pointer)[i];
   276             case _PARTEST_SION_INT64:
   279                             *((sion_int64 *) out) = 0;
   280                             for(i=0;i<num_threads;i++){
   281                                 *((sion_int64 *) out) += ((sion_int64 *)__thread_sync_pointer)[i];
   283                         }
else if(op == MPI_MAX){
   284                             *((sion_int64 *) out) = ((sion_int64 *)__thread_sync_pointer)[0];
   285                             for(i=1;i<num_threads;i++){
   286                                 if(((sion_int64 *)__thread_sync_pointer)[i] > *((sion_int64 *) out)) *((sion_int64 *) out) = ((sion_int64 *)__thread_sync_pointer)[i];
   291             case _PARTEST_DOUBLE:
   294                             *((
double *) out) = 0;
   295                             for(i=0;i<num_threads;i++){
   296                                 *((
double *) out) += ((
double *)__thread_sync_pointer)[i];
   298                         }
else if(op == MPI_MAX){
   299                             *((
double *) out) = ((
double *)__thread_sync_pointer)[0];
   300                             for(i=1;i<num_threads;i++){
   301                                 if(((
double *)__thread_sync_pointer)[i] > *((
double *) out)) *((
double *) out) = ((
double *)__thread_sync_pointer)[i];
   /* Same reduction on sion_int64 slots; no case label is visible for it here. */
   309                             *((sion_int64 *) out) = 0;
   310                             for(i=0;i<num_threads;i++){
   311                                 *((sion_int64 *) out) += ((sion_int64 *)__thread_sync_pointer)[i];
   313                         }
else if(op == MPI_MAX){
   314                             *((sion_int64 *) out) = ((sion_int64 *)__thread_sync_pointer)[0];
   315                             for(i=1;i<num_threads;i++){
   316                                 if(((sion_int64 *)__thread_sync_pointer)[i] > *((sion_int64 *) out)) *((sion_int64 *) out) = ((sion_int64 *)__thread_sync_pointer)[i];
   /* Step 4: release the scratch array. */
   322         free(__thread_sync_pointer);
   /*
    * Forward declaration of split_communicator(), which main() calls with the
    * communicator set and the bluegene, bluegene_np, numfiles,
    * read_task_offset and verbose option fields.
    */
   327 int split_communicator(
_test_communicators * communicators, 
int bluegene, 
int bluegene_np, 
int numfiles, 
int read_task_offset, 
int verbose);
   /*
    * main: driver of the 'ompi_partest' benchmark.
    *
    * Visible steps: initialise MPI, set up and distribute the options, build
    * the communicators via split_communicator(), derive the data sizes, print
    * the parameter summary to stderr on work rank 0, then run the selected
    * test_paropen_* routine inside an OpenMP parallel region.
    *
    * Several guards, declarations and closing braces are missing from this
    * listing, so the exact nesting of the statements below cannot be fully
    * determined from here.
    */
   330 int main(
int argc, 
char **argv)
   335   sion_int64 commwork_size64 = 1;
   /* Start MPI and record this process's size and rank in MPI_COMM_WORLD. */
   343   MPI_Init(&argc, &argv);
   344   MPI_Comm_size(MPI_COMM_WORLD, &size);
   345   MPI_Comm_rank(MPI_COMM_WORLD, &rank);
   346    DPRINTFTS(rank, 
"after MPI_Init");
   /* Option handling: initialise, then parse (the conditions selecting the */
   /* std or long parser are not visible in this listing). */
   352   init_options(&options);
   356     parse_options_std(argc, argv, &options);
   361     rc=parse_options_long(argc, argv, &options);
   /* rc is broadcast from rank 0; the MPI_Abort below is presumably guarded */
   /* by a check of rc that is not visible here -- confirm. */
   367   MPI_Bcast(&rc, 1, MPI_INT, 0, MPI_COMM_WORLD);
   369     MPI_Abort(MPI_COMM_WORLD, 1);
   372   distribute_options_mpi(&options);
   /* The "all" communicator is MPI_COMM_WORLD; the others come from split_communicator(). */
   376   communicators.
all = MPI_COMM_WORLD;
   377   MPI_Comm_size(MPI_COMM_WORLD, &communicators.all_size);
   378   MPI_Comm_rank(MPI_COMM_WORLD, &communicators.all_rank);
   379   split_communicator(&communicators, options.bluegene, options.bluegene_np, options.numfiles, options.read_task_offset, options.verbose);
   /* commwork_size64 = sum of omp_get_max_threads() over the work communicator. */
   382   sion_int64 num_threads = (sion_int64) omp_get_max_threads();
   383   MPI_Allreduce(&num_threads, &commwork_size64, 1, SION_MPI_INT64, MPI_SUM, communicators.
work);
   /* When a global size is given, totalsize is derived from it by dividing by */
   /* commwork_size64.  The reverse assignment below presumably sits in the */
   /* else branch, whose braces are not visible -- confirm. */
   385   if (options.globalsize > 0) {
   386     options.totalsize = (sion_int64) options.globalsize / commwork_size64;
   389     options.globalsize = options.totalsize * commwork_size64;
   /* The checksum is suppressed when the data does not fit into one buffer, */
   /* when a read task offset is used, or when nothing is written. */
   392   if((options.totalsize>options.bufsize) || (options.read_task_offset>0)  || (options.do_write==0)) {
   393     options.suppress_checksum=1;
   /* Any negative file-system block size is normalised to -1. */
   396   if(options.fsblksize<0) options.fsblksize=-1;
   /* Parameter summary, printed to stderr by rank 0 of the work communicator only. */
   398   if ( (communicators.work_size>0) && (communicators.work_rank==0) ) {
   400     fprintf(stderr, 
"------------------------------------------------------------------------------------------\n");
   401     fprintf(stderr, 
"SION parallel file I/O benchmark 'ompi_partest': start at %s", ctime(&t));
   402     fprintf(stderr, 
"partest Number of MPI tasks that will use the file tasks:       running on %d tasks\n", size);
   403     fprintf(stderr, 
"------------------------------------------------------------------------------------------\n");
   /* NOTE(review): this message looks unconditional here; it is presumably */
   /* inside a preprocessor or runtime guard that is not visible -- confirm. */
   405     fprintf(stderr, 
"partest parameter:   CHECKSUM DISABLED!\n\n");
   407     if(options.suppress_checksum) {
   408       fprintf(stderr, 
"partest parameter:   CHECKSUM not possible, DISABLED!\n\n");
   411     fprintf(stderr, 
"partest parameter:   (-f)    datafile                 = %s\n", options.filename);
   412     fprintf(stderr, 
"partest parameter:   (-n)    number of files          = %d\n", options.numfiles);
   413     fprintf(stderr, 
"partest parameter:   (-F)    random factor            = %13.4f\n", options.factor);
   414     fprintf(stderr, 
"partest parameter:   (-X)    remove files after test  = %d\n", options.unlink_files);
   /* "MB" is not defined in the visible lines; presumably a project macro that */
   /* scales the preceding literal -- confirm in the project headers. */
   415     fprintf(stderr, 
"partest parameter:   (-b/-B) local buffer size / sion task = %15lld bytes %10.3f MB\n", options.bufsize, options.bufsize / (1.0 MB));
   416     fprintf(stderr, 
"partest parameter:   (-g/-G) global total data size   = %15lld bytes %10.3f GB\n", options.globalsize, options.globalsize / (1024.0 MB));
   417     fprintf(stderr, 
"partest parameter:   (-s/-S) total data size / sion task   = %15lld bytes %10.3f MB\n", options.totalsize, options.totalsize / (1.0 MB));
   418     fprintf(stderr, 
"partest parameter:   (-r/-R) sion chunk size          = %15lld bytes %10.3f MB\n", options.chunksize, options.chunksize / (1.0 MB));
   419     fprintf(stderr, 
"partest parameter:   (-Q)    fs block size            = %15d bytes %10.3f MB\n", options.fsblksize, options.fsblksize / (1.0 MB));
   /* Human-readable description of the selected test type (0..3). */
   420     if (options.type == 0)
   421       fprintf(stderr, 
"partest parameter:   (-T)    test type                = %d (sion OMPI, collective read)\n", options.type);
   422     if (options.type == 1)
   423       fprintf(stderr, 
"partest parameter:   (-T)    test type                = %d (sion OMPI, independant read)\n", options.type);
   424     if (options.type == 2)
   425       fprintf(stderr, 
"partest parameter:   (-T)    test type                = %d (sion OMP, collective read)\n", options.type);
   426     if (options.type == 3)
   427       fprintf(stderr, 
"partest parameter:   (-T)    test type                = %d (sion OMP, independant read)\n", options.type);
   428     fprintf(stderr, 
"partest parameter:   (-j)    serialize_blocknum       = %d\n", options.serialize_blocknum);
   429     fprintf(stderr, 
"partest parameter:   (-Z)    read task offset         = %d\n", options.read_task_offset);
   430     fprintf(stderr, 
"partest parameter:   (-o)    start offset bytes       = %d\n", options.startoffset);
   431     fprintf(stderr, 
"partest parameter:   (-v)    verbose                  = %d\n", options.verbose);
   432     fprintf(stderr, 
"partest parameter:   (-d)    debug                    = %d\n", options.debug);
   433     fprintf(stderr, 
"partest parameter:   (-D)    Debug                    = %d\n", options.Debug);
   434     fprintf(stderr, 
"partest parameter:   (-M)    collective write         = %d\n", options.collectivewrite);
   435     fprintf(stderr, 
"partest parameter:   (-m)    collective read          = %d\n", options.collectiveread);
   438     fprintf(stderr, 
"partest parameter:   (-P)    Blue Gene, I/O nodes     = %d\n", options.bluegene);
   439     fprintf(stderr, 
"partest parameter:   ()      Blue Gene: tasks/IO-node = %d\n", options.bluegene_np);
   440     fprintf(stderr, 
"partest parameter:   (-w)    MPI-IO, IBM, Large Block IO = %d\n", options.mpiio_lb);
   441     fprintf(stderr, 
"partest parameter:   (-W)    MPI-IO, IBM, IO bufsize     = %d KB\n", options.mpiio_bs);
   442     fprintf(stderr, 
"partest parameter:   (-x)    MPI-IO, IBM, sparse access  = %d\n", options.mpiio_sa);
   443     fprintf(stderr, 
"partest parameter:   (  )    OpenMP number of threads    = %d\n", omp_get_max_threads());
   444     fprintf(stderr, 
"partest parameter:   (  )    commwork_size64             = %lld\n", commwork_size64);
   445     fprintf(stderr, 
"partest parameter:   (  )    suppress_checksum           = %d\n", options.suppress_checksum);
   446     fprintf(stderr, 
"partest parameter:   (  )    do_write                    = %d\n", options.do_write);
   447     fprintf(stderr, 
"partest parameter:   (  )    do_read                     = %d\n", options.do_read);
   448     fprintf(stderr, 
"partest parameter:   (  )    use_posix                   = %d\n", options.use_posix);
   452   barrier_after_start(MPI_COMM_WORLD);
   /* Communicator summary, again printed by rank 0 of the work communicator only. */
   454   if ( (communicators.work_size>0) && (communicators.work_rank==0) ) {
   455     fprintf(stderr, 
"partest parameter:   (  )    comm(all)                   = %d of %d\n", communicators.all_rank, communicators.all_size);
   456     fprintf(stderr, 
"partest parameter:   (  )    comm(work)                  = %d of %d\n", communicators.work_rank, communicators.work_size);
   457     fprintf(stderr, 
"partest parameter:   (  )    comm(local)                 = %d of %d\n", communicators.local_rank, communicators.local_size);
   458     fprintf(stderr, 
"------------------------------------------------------------------------------------------\n");
   /* OpenMP parallel region; localbuffer and t are private to each thread. */
   /* local_options / local_communicators are not declared in the visible lines; */
   /* presumably per-thread copies of options / communicators -- confirm. */
   460     #pragma omp parallel private(localbuffer,t)   462          barrier_after_start(MPI_COMM_WORLD);
   /* NOTE(review): strcpy is unbounded; this relies on options.filename */
   /* fitting into MAXCHARLEN bytes. */
   463           char l_filename[MAXCHARLEN];
   464           strcpy(l_filename,options.filename);
   468            DPRINTFTS(rank, 
"after pstart");
   /* Per-thread I/O buffer.  NOTE(review): the malloc result is not checked */
   /* in the visible lines before it is passed to memset. */
   470           localbuffer = (
char *) malloc(options.bufsize);
   /* NOTE(review): the seed is a product, so it is 0 whenever the work rank or */
   /* the thread number is 0; all such threads get the same seed.  srand/rand */
   /* are also not required to be thread-safe -- confirm this is acceptable. */
   472           srand(time(NULL)*local_communicators.work_rank*omp_get_thread_num());
   /* Fill the buffer with a rank-dependent letter ('a' + rank % 26). */
   477           memset (localbuffer, 
'a'+rank%26, local_options.bufsize);
   478           barrier_after_malloc(MPI_COMM_WORLD);
   479            DPRINTFTS(rank, 
"after malloc");
   /* Optional random enlargement of the per-task sizes (factor > 0). */
   /* With collective read or write the buffer must hold totalsize * (1 + factor); */
   /* otherwise the run is aborted. */
   483           if(local_options.factor>0.0) {
   484             if((local_options.collectivewrite) || (local_options.collectiveread)) {
   485               if(local_options.bufsize<local_options.totalsize*(1+local_options.factor)) {
   /* NOTE(review): MPI_Abort is called with error code 0 on this error path. */
   488                       fprintf(stderr, 
"partest: ERROR deadlock possible if collective read/write and random factor used, and buffer is too small aborting\n");
   489                       MPI_Abort(MPI_COMM_WORLD,0);
   /* totalsize grows by a random share of factor * totalsize; chunksize then */
   /* grows by a second random share computed from the already enlarged totalsize. */
   496             local_options.totalsize += ((sion_int64) (local_options.factor * (sion_int64) local_options.totalsize * (sion_int64) rand() / (sion_int64) RAND_MAX));
   497             local_options.chunksize += ((sion_int64) (local_options.factor * (sion_int64) local_options.totalsize * (sion_int64) rand() / (sion_int64) RAND_MAX));
   498             fprintf(stderr, 
"partest parameter:   (  )    new totalsize[%4d,t%4d]          = %lld\n", local_communicators.work_rank,omp_get_thread_num(),local_options.totalsize);
   503            DPRINTFTS(rank, 
"before scall2");
   506               if ( (local_communicators.work_size>0) && (local_communicators.work_rank==0) ) {
   507                 fprintf(stderr, 
"partest parameter:   (  )    new totalsize                = %lld\n", local_options.totalsize);
   511           barrier_after_malloc(MPI_COMM_WORLD);
   /* Test dispatch: types 0/1 run test_paropen_multi_ompi, types 2/3 run */
   /* test_paropen_omp; even types set collectiveopenforread to 1, odd types to 0. */
   512           if (local_options.type == 0) {
   513               local_options.collectiveopenforread = 1;
   514               test_paropen_multi_ompi(l_filename, localbuffer, &local_communicators, &local_options);
   516           }
else if(local_options.type == 1) {
   517               local_options.collectiveopenforread = 0;
   518               test_paropen_multi_ompi(l_filename, localbuffer, &local_communicators, &local_options);
   519           }
else if(local_options.type == 2) {
   520               local_options.collectiveopenforread = 1;
   521               test_paropen_omp(l_filename, localbuffer, &local_communicators, &local_options);
   522           }
else if(local_options.type == 3) {
   523               local_options.collectiveopenforread = 0;
   524               test_paropen_omp(l_filename, localbuffer, &local_communicators, &local_options);
   /* The MPI_Finalize call itself and the release of localbuffer are not */
   /* visible in this listing. */
   527            DPRINTFTS(rank, 
"before MPI_Finalize");
   528           barrier_after_malloc(MPI_COMM_WORLD);
   531               if ( (local_communicators.work_size>0) && (local_communicators.work_rank==0) ) {
   533                   fprintf(stderr, 
"SION parallel file I/O benchmark 'ompi_partest': end at %s\n", ctime(&t));
   /*
    * split_communicator: fill in the work, local and workread communicators
    * (plus their size/rank fields, file_number and ionode_number) of
    * *communicators, starting from communicators->all.
    *
    * The listing shows several platform variants one after another (Blue
    * Gene/P, Blue Gene/L, a generic fallback, and LINUX / AIX / XT report
    * sections).  The preprocessor or runtime conditions that select between
    * them are not visible here, and neither is the end of the function, so the
    * return value is not documented.
    *
    * bluegene_np       0 means "use every task of the same-pset communicator".
    * read_task_offset  rotation applied to the work ranks when building workread.
    * bluegene, verbose are not used in the visible lines.
    */
   544 int split_communicator(
_test_communicators * communicators, 
int bluegene, 
int bluegene_np, 
int numfiles, 
int read_task_offset, 
int verbose)
   /* -2 marks "not set yet"; the generic fallback further down tests for it. */
   548   communicators->work_size = communicators->work_rank = -2;
   549   communicators->local_size = communicators->local_rank = -2;
   /* ---- Blue Gene/P variant (Kernel_* / MPIX_* calls) ---- */
   555     _BGP_Personality_t personality;
   556     MPI_Comm  commSame, commDiff, commTemp;
   557     int       sizeSame, sizeDiff;
   558     int       rankSame, rankDiff;
   560     unsigned  procid, x, y, z, t;
   561     char      cbuffer[MAXCHARLEN];
   564     Kernel_GetPersonality(&personality, 
sizeof(personality));
   565     BGP_Personality_getLocationString(&personality, location);
   566     procid = Kernel_PhysicalProcessorID();
   567     MPIX_rank2torus(communicators->all_rank, &x, &y, &z, &t);
   /* The rank inside commDiff is recorded as the I/O node number. */
   570     MPIX_Pset_diff_comm_create(&commDiff);
   571     MPI_Comm_size(commDiff, &sizeDiff);
   572     MPI_Comm_rank(commDiff, &rankDiff);
   573     communicators->ionode_number = rankDiff;
   576     MPIX_Pset_same_comm_create(&commSame);
   577     MPI_Comm_size(commSame, &sizeSame);
   578     MPI_Comm_rank(commSame, &rankSame);
   581     if (bluegene_np == 0) {
   582       bluegene_np = sizeSame;
   /* Split "all" by whether rankSame is below bluegene_np; ranks at or above */
   /* it are then marked with -1 for work and local. */
   586     MPI_Comm_split(communicators->
all, (rankSame < bluegene_np), communicators->all_rank, &communicators->
work);
   587     MPI_Comm_size(communicators->
work, &communicators->work_size);
   588     MPI_Comm_rank(communicators->
work, &communicators->work_rank);
   589     if (rankSame >= bluegene_np) {
   591       communicators->work_size = communicators->work_rank = -1;
   592       communicators->local_size = communicators->local_rank = -1;
   /* Choice of the local communicator: work, a split of commSame, or commDiff. */
   /* Only the numfiles<0 condition is visible; the others are missing here. */
   598       communicators->
local = communicators->
work;
   600     else if(numfiles<0) {
   603     MPI_Comm_split(commSame, (rankSame < bluegene_np), communicators->all_rank, &communicators->
local);
   607     communicators->
local=commDiff;
   610     MPI_Comm_size(communicators->
local, &communicators->local_size);
   611     MPI_Comm_rank(communicators->
local, &communicators->local_rank);
   /* file_number: rankDiff when numfiles == -1, otherwise rankSame; -1 in the */
   /* remaining branch, whose condition is not visible. */
   616       if(numfiles==-1)  communicators->file_number = rankDiff;
   617       else              communicators->file_number = rankSame;
   620       communicators->file_number = -1;
   /* Start from an empty record so non-participating ranks contribute nothing. */
   /* NOTE(review): sprintf into cbuffer[MAXCHARLEN] is unbounded. */
   624     sprintf(cbuffer, 
"");
   625     if (rankSame < bluegene_np) {
   627         sprintf(cbuffer, 
"BGP[%05d] diff_comm: %4d of %4d  same_comm: %5d of %5d file_comm: %5d of %5d %s phys_xyzt(%d,%d,%d,%d)\n",
   628                 communicators->all_rank, rankDiff + 1, sizeDiff, rankSame + 1, sizeSame, communicators->local_rank + 1, communicators->local_size,
   629                 location, x, y, z, t);
   632     collective_print_gather(cbuffer, communicators->
work);
   /* ---- Blue Gene/L variant (rts_* / PMI_* calls) ---- */
   /* NOTE(review): "personality" appears with two different types below, and */
   /* "*ionode_number" is not a parameter of this function as shown.  If these */
   /* lines are live code this variant would not compile; some of them are */
   /* presumably commented out or disabled in the original -- confirm. */
   639     _BGP_Personality_t personality;
   640     MPI_Comm  commSame, commDiff, commTemp;
   641     int       sizeSame, sizeDiff;
   642     int       rankSame, rankDiff;
   643     char      location[BGLPERSONALITY_MAX_LOCATION];
   644     unsigned  procid, x, y, z, t;
   645     BGLPersonality personality;
   646     char      cbuffer[MAXCHARLEN];
   649     rts_get_personality(&personality, 
sizeof(personality));
   650     BGLPersonality_getLocationString(&personality, location);
   /* Torus coordinates and processor id; the terminator of the comma-chained */
   /* assignment that follows is not visible in this listing. */
   651     x = BGLPersonality_xCoord(&personality),
   652       y = BGLPersonality_yCoord(&personality), z = BGLPersonality_zCoord(&personality), t = rts_get_processor_id()
   655       PMI_Pset_diff_comm_create(&commDiff);
   656     MPI_Comm_size(commDiff, &sizeDiff);
   657     MPI_Comm_rank(commDiff, &rankDiff);
   658     *ionode_number = rankDiff;
   661     PMI_Pset_same_comm_create(&commSame);
   662     MPI_Comm_size(commSame, &sizeSame);
   663     MPI_Comm_rank(commSame, &rankSame);
   666     if (bluegene_np == 0) {
   667       bluegene_np = sizeSame;
   671     MPI_Comm_split(communicators->
all, (rankSame < bluegene_np), communicators->all_rank, &communicators->
work);
   672     MPI_Comm_size(communicators->
work, &communicators->work_size);
   673     MPI_Comm_rank(communicators->
work, &communicators->work_rank);
   678       communicators->
local = communicators->
work;
   682       MPI_Comm_split(commSame, (rankSame < bluegene_np), communicators->all_rank, &communicators->
local);
   684     MPI_Comm_size(communicators->
local, &communicators->local_size);
   685     MPI_Comm_rank(communicators->
local, &communicators->local_rank);
   690       communicators->file_number = rankDiff;
   693       communicators->file_number = -1;
   /* NOTE(review): sprintf into cbuffer[MAXCHARLEN] is unbounded. */
   697     sprintf(cbuffer, 
"");
   698     if (rankSame < bluegene_np) {
   700         sprintf(cbuffer, 
"BGL[%05d] diff_comm: %4d of %4d  same_comm: %5d of %5d file_comm: %5d of %5d %s phys_xyzt(%d,%d,%d,%d)\n",
   701                 communicators->all_rank, rankDiff + 1, sizeDiff, rankSame + 1, sizeSame,
   702                 communicators->local_rank + 1, communicators->local_size, location, x, y, z, t);
   705     collective_print_gather(cbuffer, communicators->
work);
   /* ---- Generic fallback: runs when no variant above has set work / local ---- */
   /* work defaults to the "all" communicator. */
   715   if (communicators->work_size == -2) {
   717     communicators->
work = communicators->
all;
   718     MPI_Comm_size(communicators->
work, &communicators->work_size);
   719     MPI_Comm_rank(communicators->
work, &communicators->work_rank);
   /* local: either the work communicator itself or a per-file split of it. */
   /* The conditions choosing between these statements are not visible. */
   722   if (communicators->local_size == -2) {
   724       communicators->
local = communicators->
work;
   728       numfiles = communicators->work_size / 2;
   /* NOTE(review): numfiles can be 0 here (e.g. work_size / 2 with one task), */
   /* and proc_per_file can be 0 when numfiles > work_size; both would divide */
   /* by zero unless guarded by lines not visible in this listing -- confirm. */
   732     proc_per_file = communicators->work_size / numfiles;
   /* Ranks left over after the even division are assigned to the last file. */
   735     if (communicators->work_rank >= (numfiles * proc_per_file)) {
   736       communicators->file_number = numfiles - 1;
   739       communicators->file_number = communicators->work_rank / proc_per_file;
   /* One local communicator per file, ordered by the rank in "all". */
   742     MPI_Comm_split(communicators->
work, communicators->file_number, communicators->all_rank, &communicators->
local);
   744     MPI_Comm_size(communicators->
local, &communicators->local_size);
   745     MPI_Comm_rank(communicators->
local, &communicators->local_rank);
   747     communicators->ionode_number = communicators->file_number;
   /* ---- Per-platform reports (LINUX, AIX, XT) ---- */
   /* Each formats one record with the all / work / local rank and size plus */
   /* the host name, and prints it through collective_print_gather on "all". */
   /* NOTE(review): the declaration of "location" is not visible, so whether */
   /* it holds 256 bytes cannot be checked here; sprintf into cbuffer is unbounded. */
   754     gethostname(location, 256);
   755     char      cbuffer[MAXCHARLEN];
   756     sprintf(cbuffer, 
"LINUX[%03d] diff_comm: %4d of %4d  same_comm: %4d of %4d file_comm: %4d of %4d %s\n",
   757             communicators->all_rank, communicators->all_rank, communicators->all_size,
   758             communicators->work_rank, communicators->work_size, communicators->local_rank, communicators->local_size, location);
   759     collective_print_gather(cbuffer, communicators->
all);
   768     gethostname(location, 256);
   769     int       sizeSame = 0, sizeDiff = 0;
   770     int       rankSame, rankDiff;
   771     char      cbuffer[MAXCHARLEN];
   772     sprintf(cbuffer, 
"AIX[%03d] diff_comm: %4d of %4d  same_comm: %4d of %4d file_comm: %4d of %4d %s\n",
   773             communicators->all_rank, communicators->all_rank, communicators->all_size,
   774             communicators->work_rank, communicators->work_size, communicators->local_rank, communicators->local_size, location);
   775     collective_print_gather(cbuffer, communicators->
all);
   783     gethostname(location, 256);
   784     int       sizeSame = 0, sizeDiff = 0;
   785     int       rankSame, rankDiff;
   786     char      cbuffer[MAXCHARLEN];
   787     sprintf(cbuffer, 
"XT[%03d] diff_comm: %4d of %4d  same_comm: %4d of %4d file_comm: %4d of %4d %s\n",
   788             communicators->all_rank, communicators->all_rank, communicators->all_size,
   789             communicators->work_rank, communicators->work_size, communicators->local_rank, communicators->local_size, location);
   790     collective_print_gather(cbuffer, communicators->
all);
   /* ---- Read communicator ---- */
   /* workread has the same members as work (single colour 0), but the key */
   /* (work_rank + read_task_offset) % work_size rotates the rank order. */
   795   if (communicators->work_size != -1) {
   798     newtasknr=(communicators->work_rank+read_task_offset)%communicators->work_size;
   799     MPI_Comm_split(communicators->
work, 0, newtasknr, &communicators->
workread);
   801     MPI_Comm_size(communicators->
workread, &communicators->workread_size);
   802     MPI_Comm_rank(communicators->
workread, &communicators->workread_rank);
   /* Presumably the else branch for ranks outside the work communicator */
   /* (braces not visible): their workread and local fields are set to -1. */
   810     communicators->workread_size = communicators->workread_rank = -1;
   811     communicators->local_size = communicators->local_rank = -1;