/****************************************************************************
**  CUBE        http://www.scalasca.org/                                   **
*****************************************************************************
**  Copyright (c) 2015-2025                                                **
**  Forschungszentrum Juelich GmbH, Juelich Supercomputing Centre          **
**                                                                         **
**  This software may be modified and distributed under the terms of       **
**  a BSD-style license.  See the COPYING file in the package base         **
**  directory for details.                                                 **
****************************************************************************/


#include <config.h>
#include "POPAuditAnalysis.h"
#include "PerformanceTest.h"


using namespace cube;
using namespace mpianalysis;



/**
 * Builds the set of performance tests used by the "Only-MPI" POP analysis.
 *
 * Every test object is allocated here with `new` and released again in the
 * destructor. Composite tests are created after the tests they are built
 * from, because they receive pointers to those component tests.
 *
 * @param _cube  Cube proxy handed to the popcalculation::PerformanceAnalysis
 *               base class and to PerformanceTest::finalizePrepsForTest().
 */
POPAuditPerformanceAnalysis::POPAuditPerformanceAnalysis( cube::CubeProxy* _cube ) : popcalculation::PerformanceAnalysis( _cube )
{
    // Stand-alone tests; each one is constructed from the `cube` member.
    // NOTE(review): `cube` is presumably set from `_cube` by the base class - confirm.
    stalled_resources = new popcalculation::POPStalledResourcesTest( cube );
    wall              = new popcalculation::POPWallTimeTest( cube );
    ipc               = new popcalculation::POPIPCTest( cube );
    no_wait_ins       = new popcalculation::POPNoWaitINSTest( cube );
    comp              = new popcalculation::POPComputationTime( cube );
    gpu_comp          = new popcalculation::POPGPUComputationTime( cube );
    // I/O: the efficiency test is composed from the POSIX and MPI I/O tests.
    posix_io          = new popcalculation::POPPosixIOTime( cube );
    mpi_io            = new popcalculation::POPMpiIOTime( cube );
    io_eff            = new popcalculation::POPIOEfficiencyTest( posix_io, mpi_io );
    // MPI hierarchy: communication efficiency is composed from serialisation and
    // transfer tests; parallel efficiency from load balance and communication efficiency.
    pop_ser_eff       = new POPSerialisationTest( cube );
    pop_transfer_eff  = new POPTransferTest( cube );
    comm_eff          = new POPCommunicationEfficiencyTest( cube, pop_ser_eff, pop_transfer_eff );
    lb_eff            = new POPImbalanceTest( cube );
    par_eff           = new POPParallelEfficiencyTest( lb_eff, comm_eff );

    // GPU hierarchy: parallel efficiency is composed from load balance and communication efficiency.
    gpu_comm_eff = new popcalculation::POPGPUCommunicationEfficiencyTest( cube );
    gpu_lb_eff   = new popcalculation::POPGPUImbalanceTest( cube );
    gpu_par_eff  = new popcalculation::POPGPUParallelEfficiencyTest( gpu_lb_eff, gpu_comm_eff );

    // Called once, after all tests above have been constructed.
    popcalculation::PerformanceTest::finalizePrepsForTest( _cube );

    // Cache the maximum reported by the IPC test.
    max_ipc = ipc->getMaximum();
}


/**
 * Releases every test object that was allocated in the constructor.
 *
 * The deletion order is kept as it was: composite tests hold pointers to
 * their component tests, and whether their destructors touch those
 * components is not visible from this file.
 */
POPAuditPerformanceAnalysis::~POPAuditPerformanceAnalysis()
{
    // GPU hierarchy
    delete gpu_comm_eff;
    delete gpu_lb_eff;
    delete gpu_par_eff;
    // MPI hierarchy (top level and its direct components)
    delete comm_eff;
    delete lb_eff;
    delete par_eff;
    // stand-alone tests
    delete stalled_resources;
    delete ipc;
    delete wall;
    delete no_wait_ins;
    delete comp;
    delete gpu_comp;
    // I/O tests
    delete posix_io;
    delete mpi_io;
    delete io_eff;
    // components of the communication efficiency test
    delete pop_ser_eff;
    delete pop_transfer_eff;
}

std::list<popcalculation::PerformanceTest*>
POPAuditPerformanceAnalysis::getAllTestsForCalculation()
{
    std::list<popcalculation::PerformanceTest*> to_return;
    to_return.push_back( wall );
    to_return.push_back( stalled_resources );
    to_return.push_back( ipc );
    to_return.push_back( no_wait_ins );
    to_return.push_back( comp );
    to_return.push_back( gpu_comp );
    to_return.push_back( io_eff );
    to_return.push_back( gpu_par_eff );
    to_return.push_back( par_eff );
    return to_return;
}

std::list<popcalculation::PerformanceTest*>
POPAuditPerformanceAnalysis::getPOPTests()
{
    std::list<popcalculation::PerformanceTest*> to_return;
    to_return.push_back( par_eff );
    to_return.push_back( lb_eff );
    to_return.push_back( comm_eff );
    to_return.push_back( pop_ser_eff );
    to_return.push_back( pop_transfer_eff );
    return to_return;
}
std::list<popcalculation::PerformanceTest*>
POPAuditPerformanceAnalysis::getGPUTests()
{
    std::list<popcalculation::PerformanceTest*> to_return;
    to_return.push_back( gpu_par_eff );
    to_return.push_back( gpu_lb_eff );
    to_return.push_back( gpu_comm_eff );
    return to_return;
}

std::list<popcalculation::PerformanceTest*>
POPAuditPerformanceAnalysis::getIOTests()
{
    std::list<popcalculation::PerformanceTest*> to_return;
    to_return.push_back( io_eff );
    to_return.push_back( posix_io );
    to_return.push_back( mpi_io );
    return to_return;
}

std::list<popcalculation::PerformanceTest*>
POPAuditPerformanceAnalysis::getAdditionalTests()
{
    std::list<popcalculation::PerformanceTest*> to_return;
    to_return.push_back( stalled_resources );
    to_return.push_back( ipc );
    to_return.push_back( no_wait_ins );
    to_return.push_back( comp );
    to_return.push_back( gpu_comp );
    return to_return;
}

std::list<popcalculation::PerformanceTest*>
POPAuditPerformanceAnalysis::getControlTests()
{
    std::list<popcalculation::PerformanceTest*> to_return;
    to_return.push_back( wall );
    return to_return;
}



// ------ overview tests ---------

/**
 * Tells whether this MPI-only analysis is applicable to the loaded profile.
 *
 * The analysis is reported as active only if the profile carries no active
 * "omp_time" metric and at least one of the checked tests is active.
 * A profile that has no "omp_time" metric at all is handled like one whose
 * metric is inactive.
 *
 * @return true if the analysis can be used, false otherwise.
 */
bool
POPAuditPerformanceAnalysis::isActive() const
{
    cube::Metric* omp_met = cube->getMetric( "omp_time" );
    // Guard against a missing metric: without the check a profile lacking
    // "omp_time" would lead to a null pointer dereference below.
    const bool no_openmp = ( omp_met == nullptr ) || omp_met->isInactive();
#ifdef HAVE_CUBELIB_DEBUG
    if ( !no_openmp )
    {
        std::cerr << "[WARNING] Profile contains OpenMP metrics. MPI POP Analysis won't deliver correct result. Please use \"hybrid_add\", \"hybrid_mult\" or \"bsc\" instead." << std::endl;
    }
#endif
    return no_openmp && ( gpu_par_eff->isActive() || par_eff->isActive() || stalled_resources->isActive() || no_wait_ins->isActive() || comp->isActive() || mpi_io->isActive() || posix_io->isActive() || io_eff->isActive() );
}

/**
 * Provides the fixed help text shown for this analysis.
 *
 * @return the complete text, assembled from adjacent string literals.
 */
std::string
POPAuditPerformanceAnalysis::getAnchorHowToMeasure()
{
    return std::string(
        // motivation
        "Attempting to optimize the performance of a parallel code can be a daunting task, and often it is difficult to know where to start. For example, we might ask if the way computational work is divided is a problem? Or perhaps the chosen communication scheme is inefficient? Or does something else impact performance? To help address this issue, POP has defined a methodology for analysis of parallel codes to provide a quantitative way of measuring relative impact of the different factors inherent in parallelization. This article introduces these metrics, explains their meaning, and provides insight into the thinking behind them.\n\n"
        // hierarchy of metrics
        "A feature of the methodology is, that it uses a hierarchy of Only-MPI Assessment, each metric reflecting a common cause of inefficiency in parallel programs. These metrics then allow a comparison of the parallel performance (e.g. over a range of thread/process counts, across different machines, or at different stages of optimization and tuning) to identify which characteristics of the code contribute to the inefficiency. \n\n"
        // data collection
        "The first step to calculating these metrics is to use a suitable tool (e.g. Score-P or Extrae) to generate trace data whilst the code is executed. The traces contain information about the state of the code at a particular time, e.g. it is in a communication routine or doing useful computation, and also contains values from processor hardware counters, e.g. number of instructions executed, number of cycles.\n\n"
        // interpretation of the values
        "The Only-MPI Assessment are then calculated as efficiencies between 0 and 1, with higher numbers being better. In general, we regard efficiencies above 0.8 as acceptable, whereas lower values indicate performance issues that need to be explored in detail. The ultimate goal then for POP is rectifying these underlying issues by the user. Please note, that Only-MPI Assessment can be computed only for inclusive callpaths, as they are less meaningful for exclusive callpaths. Furthermore, Only-MPI Assessment are not available in \"Flat view\" mode.\n\n"
        // scope of the formulation
        "The approach outlined here is applicable to various parallelism paradigms, however for simplicity the Only-MPI Assessment presented here are formulated in terms of a distributed-memory message-passing environment, e.g., MPI. For this the following values are calculated for each process from the trace data: time doing useful computation, time in communication, number of instructions & cycles during useful computation. Useful computation excludes time within the overhead of parallel paradigms (Computation time).\n\n"
        // global efficiency
        "At the top of the hierarchy is Global Efficiency (GE), which we use to judge overall quality of parallelization. Typically, inefficiencies in parallel code have two main sources:\n"
        "\t# Overhead imposed by the parallel nature of a code\n"
        "\t# Poor scaling of computation with increasing numbers of processes\n\n"
        "and to reflect this we define two sub-metrics to measure these two inefficiencies. These are the Parallel Efficiency and the Computation Efficiency, and our top-level GE metric is the product of these two sub-metrics:\n\t"
        "GE = Parallel Efficiency x Computation Efficiency" );
}
