// CPUCruncher.cpp

#include "CPUCruncher.h"
#include "GaudiKernel/ThreadLocalContext.h"
#include "HiveNumbers.h"
#include <ctime>
#include <sys/resource.h>
#include <sys/times.h>

#include <tbb/tick_count.h>
#include <thread>

// Calibration tables, shared by every CPUCruncher instance (static members).
// calibrate() fills them so that m_times_vect[i] is the time in seconds measured
// for findPrimes( m_niters_vect[i] ).
std::vector<unsigned int> CPUCruncher::m_niters_vect;
std::vector<double>       CPUCruncher::m_times_vect;
// Algorithm instance name -> number of instances constructed with that name
// (incremented in the constructor, reset in finalize()).
CPUCruncher::CHM          CPUCruncher::m_name_ncopies_map;

// NOTE(review): presumably registers the class with the framework component factory - confirm.
DECLARE_COMPONENT( CPUCruncher )

// Message helpers: the stream expression that follows is only evaluated when the
// corresponding output level is enabled. They expand to a bare `if`, so a
// following `else` would bind to the macro's `if`.
#define ON_DEBUG if ( msgLevel( MSG::DEBUG ) )
#define DEBUG_MSG ON_DEBUG debug()

#define ON_VERBOSE if ( msgLevel( MSG::VERBOSE ) )
#define VERBOSE_MSG ON_VERBOSE verbose()

//------------------------------------------------------------------------------

/// Build the algorithm: publish the shared calibration tables as properties and
/// count this instance under its name in the static instance map.
CPUCruncher::CPUCruncher( const std::string& name, // the algorithm instance name
                          ISvcLocator*       pSvc )
    : GaudiAlgorithm( name, pSvc )
{
  // The calibration tables are static, hence shared by all instances
  declareProperty( "NIterationsVect", m_niters_vect, "Number of iterations for the calibration." );
  declareProperty( "NTimesVect", m_times_vect, "Number of seconds for the calibration." );

  // Bookkeeping of the number of copies: fetch (or create) the entry for this
  // name in the concurrent hash map and bump its counter.
  CHM::accessor copiesEntry;
  m_name_ncopies_map.insert( copiesEntry, name );
  ++copiesEntry->second;
}

/// Release the data handles that were allocated with `new` by this algorithm.
CPUCruncher::~CPUCruncher()
{
  for ( auto* handle : m_inputHandles ) delete handle;

  for ( auto* handle : m_outputHandles ) delete handle;
}

/// Initialise the base class, calibrate if no timing table exists yet, and
/// create one data handle per configured input/output key.
StatusCode CPUCruncher::initialize()
{
  auto sc = GaudiAlgorithm::initialize();
  if ( !sc ) return sc;

  // The timing table is static: only calibrate when it has not been filled yet
  if ( m_times_vect.empty() ) calibrate();

  // An algorithm configured to sleep, for whatever period, is effectively I/O-bound
  if ( m_sleepFraction != 0.0f ) setIOBound( true );

  // A vector of DataObjectHandles cannot be declared up front, so the handles
  // are built here, once the input and output key properties have been read and
  // their number is known. Each handle is then registered with the framework
  // through declareProperty (declareInput/declareOutput could be called too).

  int inputIndex = 0;
  for ( const auto& key : m_inpKeys ) {
    DEBUG_MSG << "adding input key " << key << endmsg;
    auto* handle = new DataObjectHandle<DataObject>( key, Gaudi::DataHandle::Reader, this );
    m_inputHandles.push_back( handle );
    declareProperty( "dummy_in_" + std::to_string( inputIndex ), *handle );
    ++inputIndex;
  }

  int outputIndex = 0;
  for ( const auto& key : m_outKeys ) {
    DEBUG_MSG << "adding output key " << key << endmsg;
    auto* handle = new DataObjectHandle<DataObject>( key, Gaudi::DataHandle::Writer, this );
    m_outputHandles.push_back( handle );
    declareProperty( "dummy_out_" + std::to_string( outputIndex ), *handle );
    ++outputIndex;
  }

  return sc;
}

/*
Fill the static calibration tables: for a fixed list of iteration counts, measure
how long findPrimes() takes and store the result at the same index of the timing
table. The relation is a sqrt for times greater than 10^-4 seconds.
*/
void CPUCruncher::calibrate()
{
  m_niters_vect = {0,    500,  600,  700,  800,   1000,  1300,  1600,  2000,  2300,  2600,  3000,  3300,  3500, 3900,
                   4200, 5000, 6000, 8000, 10000, 12000, 15000, 17000, 20000, 25000, 30000, 35000, 40000, 60000};
  // The two most expensive points are skipped when a short calibration is requested
  if ( !m_shortCalib ) m_niters_vect.insert( m_niters_vect.end(), {100000u, 200000u} );

  m_times_vect.resize( m_niters_vect.size() );
  m_times_vect.front() = 0.;

  info() << "Starting calibration..." << endmsg;
  const unsigned int maxTrials = 30;
  for ( unsigned int point = 1; point < m_niters_vect.size(); ++point ) {
    const unsigned long niters = m_niters_vect[point];
    // Repeat the measurement (at most maxTrials times) while it is smaller than
    // the previous point, so that the timing table is monotonic
    for ( unsigned int attempt = 0; attempt < maxTrials; ++attempt ) {
      const auto tic = tbb::tick_count::now();
      findPrimes( niters );
      const auto   toc     = tbb::tick_count::now();
      const double elapsed = ( toc - tic ).seconds();
      m_times_vect[point]  = elapsed;
      DEBUG_MSG << "Calibration: # iters = " << niters << " => " << elapsed << endmsg;
      if ( !( m_times_vect[point] < m_times_vect[point - 1] ) ) break;
    }
  }
  info() << "Calibration finished!" << endmsg;
}

/**
 * Translate a requested crunching time into a number of findPrimes() iterations,
 * using the calibration tables (m_times_vect[i] seconds <-> m_niters_vect[i] iterations).
 *
 * The result is a linear interpolation between the two calibration points that
 * bracket `runtime`; beyond the last point it is a linear extrapolation from the
 * last two points.
 *
 * Degenerate inputs are handled explicitly instead of running into undefined
 * behaviour (out-of-range indexing, division by zero, conversion of a negative
 * or non-finite value to an unsigned integer):
 *  - no calibration point available: returns 0;
 *  - a single calibration point: returns its iteration count;
 *  - last two timings not strictly increasing: returns the iteration count of the upper point;
 *  - non-positive (or NaN) estimate: returns 0.
 *
 * @param runtime requested crunching time, in seconds
 * @return number of iterations to pass to findPrimes()
 */
unsigned long CPUCruncher::getNCaliIters( double runtime )
{
  // Both tables are user-settable properties, so their sizes may differ:
  // only indices valid in both of them can be used.
  const std::size_t npoints =
      m_times_vect.size() < m_niters_vect.size() ? m_times_vect.size() : m_niters_vect.size();
  if ( npoints == 0 ) return 0;
  if ( npoints == 1 ) return m_niters_vect[0];

  // Case 1 (default): we are outside the interpolation range, we take the last 2 points
  std::size_t smaller_i = npoints - 2;

  // Case 2: find the first calibration time above the requested one.
  // We know that the first entry is 0, so we start to iterate from 1
  for ( std::size_t i = 1; i < npoints; ++i ) {
    if ( m_times_vect[i] > runtime ) {
      smaller_i = i - 1;
      break;
    }
  }

  // Linear interpolation, y = m*x + q, with x = time and y = number of iterations
  const double x0 = m_times_vect[smaller_i];
  const double x1 = m_times_vect[smaller_i + 1];
  const double y0 = m_niters_vect[smaller_i];
  const double y1 = m_niters_vect[smaller_i + 1];

  // A non-monotonic tail of the table gives no usable slope (and x1 == x0 would
  // divide by zero): fall back on the upper calibration point.
  const double dx = x1 - x0;
  if ( !( dx > 0. ) ) return m_niters_vect[smaller_i + 1];

  const double m = ( y1 - y0 ) / dx;
  const double q = y0 - m * x0;

  // Negative or NaN estimates cannot be converted to an unsigned integer
  const double estimate = m * runtime + q;
  if ( !( estimate > 0. ) ) return 0;

  return static_cast<unsigned long>( estimate );
}

/**
 * CPU (and allocator) burner: test `n_iterations` consecutive integers, starting
 * at 3, for primality by trial division and collect the primes found.
 *
 * The implementation is inefficient on purpose: each candidate is divided by
 * every smaller integer, and the array of primes is re-allocated and copied
 * whenever a new prime is appended. Nothing is returned; the time spent is the
 * only purpose of the call (see calibrate(), which measures it).
 *
 * @param n_iterations number of consecutive integers to test
 */
void CPUCruncher::findPrimes( const unsigned long int n_iterations )
{
  // Flag to trigger the allocation
  bool is_prime;

  // Let's prepare the material for the allocations: the array of primes starts with the single entry 2
  unsigned int   primes_size = 1;
  unsigned long* primes      = new unsigned long[primes_size];
  primes[0]                  = 2;

  // Number under test; it is incremented before use, so the first one tested is 3
  unsigned long i = 2;

  // Loop on numbers
  for ( unsigned long int iiter = 0; iiter < n_iterations; iiter++ ) {
    // Next candidate (being unsigned, once at max it returns to 0)
    i += 1;

    // Check if it can be divided by the smaller ones; stop at the first divisor found
    is_prime = true;
    for ( unsigned long j = 2; j < i && is_prime; ++j ) {
      if ( i % j == 0 ) is_prime = false;
    } // end loop on numbers < than tested one

    if ( is_prime ) {
      // copy the array of primes (INEFFICIENT ON PURPOSE!) into a new one that is one element longer
      unsigned int   new_primes_size = 1 + primes_size;
      unsigned long* new_primes      = new unsigned long[new_primes_size];

      for ( unsigned int prime_index = 0; prime_index < primes_size; prime_index++ ) {
        new_primes[prime_index] = primes[prime_index];
      }
      // attach the last prime
      new_primes[primes_size] = i;

      // Update primes array: free the old one and adopt the new one
      delete[] primes;
      primes      = new_primes;
      primes_size = new_primes_size;
    } // end is prime

  } // end of loop on iterations

  // Fool Compiler optimisations: read back the result (4 is never stored, being
  // not a prime) so that the work above cannot be discarded as unused
  for ( unsigned int prime_index = 0; prime_index < primes_size; prime_index++ )
    if ( primes[prime_index] == 4 )
      debug() << "This does never happen, but it's necessary too fool aggressive compiler optimisations!" << endmsg;

  delete[] primes;
}

//------------------------------------------------------------------------------
/// Create and register a writer handle for every output data object currently
/// attributed to this algorithm, then mark the declaration as done so that
/// execute() does not repeat it.
void CPUCruncher::declareRuntimeRequestedOutputs()
{
  for ( const auto& key : outputDataObjs() ) {
    auto* lateHandle = new DataObjectHandle<DataObject>( key, Gaudi::DataHandle::Writer, this );
    VERBOSE_MSG << "found late-attributed output: " << lateHandle->objKey() << endmsg;
    m_outputHandles.push_back( lateHandle );
    declareProperty( "dummy_out_" + lateHandle->objKey(), *lateHandle );
  }

  initDataHandleHolder();

  // Remember that the augmentation was performed
  m_declAugmented = true;
}

//------------------------------------------------------------------------------

/**
 * Process one event.
 *
 * Steps, in order:
 *  1. on first use with m_loader set, declare the late-attributed outputs;
 *  2. draw the crunching time from a Gaussian centred on
 *     m_avg_runtime * (1 - m_sleepFraction), with m_var_runtime used as sigma;
 *  3. if the algorithm is I/O-bound, sleep for m_avg_runtime * m_sleepFraction seconds;
 *  4. read every valid input handle m_rwRepetitions times;
 *  5. crunch (findPrimes) for the number of iterations matching the crunching time;
 *  6. if m_failNEvents > 0, fail on every event whose (non-zero) number is a multiple of it;
 *  7. put a new DataObject in every valid output handle and set the filter decision.
 *
 * @return StatusCode::FAILURE for the events selected in step 6 (outputs are not
 *         written in that case), StatusCode::SUCCESS otherwise
 */
StatusCode CPUCruncher::execute() // the execution of the algorithm
{

  if ( m_loader && !m_declAugmented ) declareRuntimeRequestedOutputs();

  float crunchtime;

  if ( m_local_rndm_gen ) {
    /* This will disappear with a thread safe random number generator service.
     * Use basic Box-Muller to generate Gaussian random numbers.
     * The quality is not good for in depth study given that the generator is a
     * linear congruent.
     * Throw away basically a free number: we are in a cpu cruncher after all.
     * The seed is taken from the clock, but we could assign a seed per module to
     * ensure reproducibility.
     *
     * This is not an overkill but rather an exercise towards a thread safe
     * random number generation.
     */

    auto getGausRandom = []( double mean, double sigma ) -> double {

      unsigned int seed = std::clock();

      // Uniform number in [0, 1): advances `state` by one step of the generator
      auto getUnifRandom = []( unsigned int& state ) -> double {
        // "Quick and dirty" linear congruential generator from "Numerical Recipes":
        //   I(j+1) = ( a * I(j) + c ) mod 2^32
        // The modulus is 2^32 (not 232): the arithmetic is done in 64 bits so
        // that the reduction is explicit and a * state cannot overflow.
        constexpr unsigned long long m = 1ULL << 32;
        constexpr unsigned long long a = 1664525;
        constexpr unsigned long long c = 1013904223;
        state                          = static_cast<unsigned int>( ( a * state + c ) % m );
        return static_cast<double>( state ) / static_cast<double>( m );
      };

      // unif1 feeds the logarithm, hence it must not be 0
      double unif1, unif2;
      do {
        unif1 = getUnifRandom( seed );
        unif2 = getUnifRandom( seed );
      } while ( unif1 == 0. );

      // Box-Muller transform: standard normal number out of two uniform ones
      const double normal = std::sqrt( -2. * std::log( unif1 ) ) * std::cos( 2 * M_PI * unif2 );

      return normal * sigma + mean;
    };

    crunchtime = std::fabs( getGausRandom( m_avg_runtime * ( 1. - m_sleepFraction ), m_var_runtime ) );
    // End Of temp block
  } else {
    // Should be a member.
    HiveRndm::HiveNumbers rndmgaus( randSvc(), Rndm::Gauss( m_avg_runtime * ( 1. - m_sleepFraction ), m_var_runtime ) );
    crunchtime = std::fabs( rndmgaus() );
  }

  // Prepare to sleep (even if we won't enter the following if clause for sleeping).
  // This is needed to distribute evenly among all algorithms the overhead (around sleeping) which is harmful when
  // trying to achieve uniform distribution of algorithm timings.
  const double                        dreamtime = m_avg_runtime * m_sleepFraction;
  const std::chrono::duration<double> dreamtime_duration( dreamtime );
  tbb::tick_count                     startSleeptbb;
  tbb::tick_count                     endSleeptbb;

  // Start to measure the total time here, together with the dreaming process straight ahead
  tbb::tick_count starttbb = tbb::tick_count::now();
  // If the algorithm was set as I/O-bound, we will replace requested part of crunching with plain sleeping
  if ( isIOBound() ) {
    // in this block (and not in other places around) msgLevel is checked for the same reason as above, when
    // preparing to sleep several lines above: to reduce as much as possible the overhead around sleeping
    DEBUG_MSG << "Dreaming time will be: " << dreamtime << endmsg;

    ON_DEBUG startSleeptbb = tbb::tick_count::now();
    std::this_thread::sleep_for( dreamtime_duration );
    ON_DEBUG endSleeptbb = tbb::tick_count::now();

    // actual sleeping time can be longer due to scheduling or resource contention delays
    ON_DEBUG
    {
      const double actualDreamTime = ( endSleeptbb - startSleeptbb ).seconds();
      debug() << "Actual dreaming time was: " << actualDreamTime << "s" << endmsg;
    }
  } // end of "sleeping block"

  DEBUG_MSG << "Crunching time will be: " << crunchtime << endmsg;
  const EventContext& context = Gaudi::Hive::currentContext();
  DEBUG_MSG << "Start event " << context.evt() << " in slot " << context.slot() << " on pthreadID " << std::hex
            << pthread_self() << std::dec << endmsg;

  // Read the inputs: each valid handle is read m_rwRepetitions times
  VERBOSE_MSG << "inputs number: " << m_inputHandles.size() << endmsg;
  for ( auto& inputHandle : m_inputHandles ) {
    if ( !inputHandle->isValid() ) continue;

    VERBOSE_MSG << "get from TS: " << inputHandle->objKey() << endmsg;
    DataObject* obj = nullptr;
    for ( unsigned int i = 0; i < m_rwRepetitions; ++i ) {
      obj = inputHandle->get();
    }
    if ( obj == nullptr ) error() << "A read object was a null pointer." << endmsg;
  }

  // Burn CPU for (approximately) the requested crunching time
  const unsigned long n_iters = getNCaliIters( crunchtime );
  findPrimes( n_iters );

  // Return error on fraction of events if configured
  if ( m_failNEvents > 0 && context.evt() > 0 && ( context.evt() % m_failNEvents ) == 0 ) {
    return StatusCode::FAILURE;
  }

  // Write the outputs: one new DataObject per valid handle
  VERBOSE_MSG << "outputs number: " << m_outputHandles.size() << endmsg;
  for ( auto& outputHandle : m_outputHandles ) {
    if ( !outputHandle->isValid() ) continue;

    VERBOSE_MSG << "put to TS: " << outputHandle->objKey() << endmsg;
    outputHandle->put( new DataObject() );
  }

  tbb::tick_count endtbb = tbb::tick_count::now();

  const double actualRuntime = ( endtbb - starttbb ).seconds();

  DEBUG_MSG << "Finish event " << context.evt()
            //      << " on pthreadID " << context.m_thread_id
            << " in " << actualRuntime << " seconds" << endmsg;

  DEBUG_MSG << "Timing: ExpectedCrunchtime= " << crunchtime << " ExpectedDreamtime= " << dreamtime
            << " ActualTotalRuntime= " << actualRuntime << " Ratio= " << ( crunchtime + dreamtime ) / actualRuntime
            << " Niters= " << n_iters << endmsg;

  setFilterPassed( !m_invertCFD );

  return StatusCode::SUCCESS;
}

//------------------------------------------------------------------------------
/**
 * Print a one-line summary (name, average runtime in ms, number of clones) and
 * finalize the base class.
 *
 * The summary is printed once per algorithm name: the first instance to get here
 * reads the instance counter registered under its name and resets it to 0, so
 * that the other clones find 0 and stay silent. Reading and resetting are done
 * under a single write accessor, so two clones cannot both see a non-zero count.
 *
 * @return the status of GaudiAlgorithm::finalize()
 */
StatusCode CPUCruncher::finalize() // the finalization of the algorithm
{
  unsigned int ninstances = 0;

  {
    // The accessor is only dereferenced when the name was actually found
    CHM::accessor name_ninstances;
    if ( m_name_ncopies_map.find( name_ninstances, name() ) ) {
      ninstances              = name_ninstances->second;
      name_ninstances->second = 0;
    }
  } // the map entry is released here, before any printing

  constexpr double s2ms = 1000.;
  // do not show repetitions
  if ( ninstances != 0 ) {
    info() << "Summary: name= " << name() << "\t avg_runtime= " << m_avg_runtime * s2ms << "\t n_clones= " << ninstances
           << endmsg;
  }

  return GaudiAlgorithm::finalize();
}

//------------------------------------------------------------------------------