/*******************************************************************************
* Copyright (C) 2014 Intel Corporation
*
* This software and the related documents are Intel copyrighted  materials,  and
* your use of  them is  governed by the  express license  under which  they were
* provided to you (License).  Unless the License provides otherwise, you may not
* use, modify, copy, publish, distribute,  disclose or transmit this software or
* the related documents without Intel's prior written permission.
*
* This software and the related documents  are provided as  is,  with no express
* or implied  warranties,  other  than those  that are  expressly stated  in the
* License.
*******************************************************************************/

//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra     (dongarra@eecs.utk.edu)
// Piotr Luszczek    (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER

/*!
 @file main.cpp

 HPCG routine
 */

// Main routine of a program that calls the HPCG conjugate gradient
// solver to solve the problem, and then prints results.

#ifndef HPCG_NO_MPI
#include <mpi.h>
#endif

#include <fstream>
#include <iostream>
#include <iomanip>
#include <cstdlib>
#ifdef HPCG_DETAILED_DEBUG
using std::cin;
#endif
//using std::endl;
#include <vector>
#include <tuple>
#include <utility>
#include <map>

#include "hpcg.hpp"

#include "CheckAspectRatio.hpp"
#include "GenerateGeometry.hpp"
#include "GenerateProblem.hpp"
#include "GenerateCoarseProblem.hpp"
#include "SetupHalo.hpp"
#include "CheckProblem.hpp"
#include "ExchangeHalo.hpp"
#include "OptimizeProblem.hpp"
#include "WriteProblem.hpp"
#include "ReportResults.hpp"
#include "mytimer.hpp"
#include "ComputeSPMV_ref.hpp"
#include "ComputeMG_ref.hpp"
#include "ComputeResidual.hpp"
#include "CG.hpp"
#include "CG_ref.hpp"
#include "Geometry.hpp"
#include "SparseMatrix.hpp"
#include "Vector.hpp"
#include "CGData.hpp"
#include "TestCG.hpp"
#include "TestSymmetry.hpp"
#include "TestNorms.hpp"
#include "UsmUtil.hpp"
#include "VeryBasicProfiler.hpp"

#include <cmath>
#include <cfloat>


#if defined(HPCG_QUICK_RUN)

/*!
  Quick-run driver, compiled only when HPCG_QUICK_RUN is defined: generates the problem and its
  coarse levels, calls OptimizeProblem(), then executes a fixed number of warmup and main CG sets.

  @param[in] main_queue SYCL queue handed to every setup, CG and cleanup call made here
  @param[in] rank       Rank of this process; only rank 0 prints progress and per-set results
  @param[in] size       Number of processes, forwarded to GenerateGeometry()
  @param[in] params     Run parameters (local dimensions, process grid, thread count, runRealRef, ...)

  @return 0 on success, otherwise the non-zero code returned by CheckAspectRatio().

  @note On the success path this function calls HPCG_Finalize() and, unless HPCG_NO_MPI is defined,
        MPI_Finalize(). The early error returns skip both.
*/
int run_quick_path( sycl::queue &main_queue, int rank, int size, HPCG_Params &params) {

  if (rank == 0) {
    std::cout << "###########################################################################" << std::endl;
    std::cout << "########## Performing Quick Run for CG                         ############" << std::endl;
    std::cout << "###########################################################################" << std::endl;
  }

  local_int_t nx = static_cast<local_int_t>(params.nx);
  local_int_t ny = static_cast<local_int_t>(params.ny);
  local_int_t nz = static_cast<local_int_t>(params.nz);
  int ierr = 0;  // Used to check return codes on function calls

  ierr = CheckAspectRatio(0.125, nx, ny, nz, "local problem", rank==0);
  if (ierr)
    return ierr; // NOTE(review): returns without HPCG_Finalize()/MPI_Finalize()

  /////////////////////////
  // Problem setup Phase //
  /////////////////////////

#ifdef HPCG_DEBUG
  if (rank == 0) std::cout << "Problem Setup Phase " << std::endl;
#endif

  // Construct the geometry and linear system
  Geometry * geom = new Geometry;
  GenerateGeometry(size, rank, params.numThreads, params.pz, params.zl, params.zu, nx, ny, nz, params.npx, params.npy, params.npz, geom, main_queue);
  ierr = CheckAspectRatio(0.125, geom->npx, geom->npy, geom->npz, "process grid", rank==0);
  if (ierr)
    return ierr; // NOTE(review): geom is not released and MPI is not finalized on this path

  // CG() accumulates its timing information into this array
  std::vector< double > times(10,0.0);

  SparseMatrix A;
  InitializeSparseMatrix(A, geom);
  Vector b, x, xexact;
  GenerateProblem(A, &b, &x, &xexact, main_queue, params.runRealRef);
  SetupHalo(A, main_queue);
  int numberOfMgLevels = 4; // Number of levels including first
  SparseMatrix * curLevelMatrix = &A;
  for (int level = 1; level< numberOfMgLevels; ++level) {
      GenerateCoarseProblem(*curLevelMatrix, main_queue, params.runRealRef);
      curLevelMatrix = curLevelMatrix->Ac; // Make the just-constructed coarse grid the next level
  }

  double t7 = 0.0;
  const bool need_MKL_matrix = false;
  OptimizeProblem(&A, &b, t7, need_MKL_matrix, main_queue);
  if (rank == 0) {
      std::cout << "OptimizeProblem took " << t7 << " seconds" << std::endl;
  }

  ////////////////////////////////////////////
  // Custom Kernel Validation Testing Phase //
  ////////////////////////////////////////////

  CGData data;
  InitializeSparseCGData(A, data, main_queue, params.runRealRef);

  int niters = 0;
  double normr = 0.0;
  double normr0 = 0.0;
  double tolerance = 1e-6;

  ////////////////////////////////////////////////////////////
  // These can be set to customize your desired quick run
  int numCgSetsForWarmup        = 1;  // can select 0+ warmup sets of CG
  int maxItersPerCgSetForWarmup = 3;  // can select 0+ warmup runs per CG set
  int numCgSets                 = 1;  // normally selected to match time input
  int maxItersPerCgSet          = 50; // normally 50+ runs per cg set
  // End of quick run set configuration
  ////////////////////////////////////////////////////////////

  if (rank == 0) {
    std::cout << "Running configuration: " << std::endl;
    std::cout << "    numberCgSetsForWarmup     = " << numCgSetsForWarmup << std::endl;
    std::cout << "    maxItersPerCgSetForWarmup = " << maxItersPerCgSetForWarmup << std::endl;
    std::cout << "    numCgSets                 = " << numCgSets << std::endl;
    std::cout << "    maxItersPerCgSet          = " << maxItersPerCgSet << std::endl;

  }

  // Runs `numSets` CG solves of at most `maxIters` iterations each, starting every solve from x == 0.
  // `label` is the prefix of the per-set line that rank 0 writes to HPCG_fout.
  auto runCgSets = [&](const char * label, int numSets, int maxIters) {
    for (int i = 0; i < numSets; ++i) {
      ZeroVector(x, main_queue).wait(); // Zero out x
#ifndef HPCG_NO_MPI
      MPI_Barrier(MPI_COMM_WORLD); // line all ranks up before each solve
#endif
      ierr = CG( A, data, b, x, maxIters, tolerance, niters, normr, normr0, &times[0], true, main_queue);
      if (ierr) HPCG_fout << "Error in call to CG: " << ierr << ".\n" << std::endl;
      if (rank==0) {
          HPCG_fout << label << " [" << i << "] with " << niters <<  " iters, Scaled Residual [" << normr/normr0 << "]" << std::endl;
      }
    }
  };

  if (rank == 0) {
    std::cout << "Starting Warmup CG Runs..." << std::endl;
  }
  runCgSets("Warmup Call", numCgSetsForWarmup, maxItersPerCgSetForWarmup);

  if (rank == 0) {
    std::cout << "Starting Main CG Runs..." << std::endl;
  }
  runCgSets("Call", numCgSets, maxItersPerCgSet);

  // Clean up
  DeleteMatrix(A, main_queue); // This delete will recursively delete all coarse grid data
  DeleteVector(x, main_queue);
  DeleteVector(b, main_queue);
  DeleteVector(xexact, main_queue);
  DeleteCGData(data, main_queue);

  HPCG_Finalize();
  // Finish up
#ifndef HPCG_NO_MPI
  MPI_Finalize();
#endif
  return 0;

}

#endif // HPCG_QUICK_RUN


/*!
  Main driver program: Construct synthetic problem, run V&V tests, compute benchmark parameters, run benchmark, report results.

  @param[in]  argc Standard argument count.  Should equal 1 (no arguments passed in) or 4 (nx, ny, nz passed in)
  @param[in]  argv Standard argument array.  If argc==1, argv is unused.  If argc==4, argv[1], argv[2], argv[3] will be interpreted as nx, ny, nz, resp.

  @return Returns zero on success and a non-zero value otherwise.

*/
int main(int argc, char * argv[]) {

#ifndef HPCG_NO_MPI
  MPI_Init(&argc, &argv);
#endif

  HPCG_Params params;

  HPCG_Init(&argc, &argv, params);

  // Catch asynchronous exceptions
  auto exception_handler = [](sycl::exception_list exceptions) {
      for (std::exception_ptr const &e : exceptions) {
          try {
              std::rethrow_exception(e);
          }
          catch (sycl::exception const &e) {
              std::cout << "Caught asynchronous SYCL "
                           "exception while running HPCG benchmark:\n"
                        << e.what() << std::endl;
          }
      }
  };

  int size = params.comm_size, rank = params.comm_rank; // Number of MPI processes, My process ID

#ifdef HPCG_DEBUG
  const std::string spaces = "    ";
#else
  const std::string spaces = "";
#endif

  //
  // Setup SYCL GPU platform and distribute MPI rank to hardware
  //
  sycl::platform plat = sycl::platform{sycl::gpu_selector_v};
  auto platform_cards = plat.get_devices(sycl::info::device_type::gpu);

  // check if cards on platform can be partitioned or not (assumes all cards are same)
  auto part_prop = platform_cards[0].get_info<sycl::info::device::partition_properties>();

#ifndef HPCG_NO_MPI
  char node_name[MPI_MAX_PROCESSOR_NAME];
  int name_len;
  MPI_Get_processor_name(node_name, &name_len);
#else
  char node_name[2] = "0"; // default name for non-mpi case
#endif

  int nranks_on_node = 1;
  int rank_on_node = 0;
#ifndef HPCG_NO_MPI
  { // extract a split communicator representing this node to get rank on node and nranks on node
      MPI_Comm shmcomm;
      MPI_Comm_split_type( MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shmcomm);
      MPI_Comm_rank(shmcomm, &rank_on_node);
      MPI_Comm_size(shmcomm, &nranks_on_node);
  }
#endif

  // we make the simplifying assumption that all cards attached to a node look the same, 
  // so we can determine sizes based on a single sample of card 0
  int tiles_per_card = 1;
  if (part_prop.empty()) {
    tiles_per_card = 1;
  } else {
      const auto affinity_partition = sycl::info::partition_property::partition_by_affinity_domain;
      for (int i = 0; i < part_prop.size(); i++ ) {
        if (part_prop[i] == affinity_partition) {
            sycl::device sample_card = platform_cards[0];
            std::vector<sycl::device> sample_tiles = sample_card.create_sub_devices<affinity_partition>(
                                                            sycl::info::partition_affinity_domain::numa);
            tiles_per_card = sample_tiles.size();
            break;
        }
        else {
            tiles_per_card = 1;
        }
      }
  }
 
  const int cards_per_node = platform_cards.size();
  const int tiles_per_node = cards_per_node * tiles_per_card;

  int card_number = 0;
  int tile_number = 0;
  if (params.affinity == AffinityPerNode::compact) {
      card_number = floor_div(rank_on_node, tiles_per_card) % cards_per_node; // wrap around cards per node
      tile_number = rank_on_node % tiles_per_card; // wrap around tiles per card
  }
  else if (params.affinity == AffinityPerNode::roundRobin) {
      card_number = rank_on_node % cards_per_node; // wrap around cards per node
      tile_number = floor_div(rank_on_node, cards_per_node) % tiles_per_card; // wrap around tiles per card
  }
  else {
    throw std::runtime_error("unexpected affinity per node");
  }

  sycl::device card = platform_cards[card_number];
  sycl::device dev;
  
  try {
      int tiles_per_card_detected = 0;
      if (tiles_per_card == 1) {
          dev = card;
          tiles_per_card_detected = 1;
      }
      else {
          std::vector<sycl::device> tiles =
              card.create_sub_devices<sycl::info::partition_property::partition_by_affinity_domain>(
                  sycl::info::partition_affinity_domain::numa);

          tiles_per_card_detected = tiles.size();
          if (tiles_per_card_detected != tiles_per_card) {
            throw std::runtime_error("Unexpected number of tiles on this card.");
          }
          dev = tiles[tile_number];
      }
      
      //printf("rank %d, #ranks %d, tiles_per_card %d, cards_per_node %d, tiles_per_node %d, ranks_per_node %d, rank_on_node %d, card_number %d, tile_number %d\n",
      //        rank, size, tiles_per_card, cards_per_node, tiles_per_node, nranks_on_node, rank_on_node, card_number, tile_number); fflush(0);

      // print out serially
#ifndef HPCG_NO_MPI
      for (int p = 0; p < size; p++) {
          MPI_Barrier( MPI_COMM_WORLD);
          if ( (rank == p) && ( p==0 || p==size-1) ) {
#endif
              std::cout << "[" << rank << " / " << size << "] node " << node_name << ", card " << card_number << " / " << cards_per_node
                        << ", tile " << tile_number << " / " << tiles_per_card_detected << std::endl;
#ifndef HPCG_NO_MPI
          }
      }
      MPI_Barrier( MPI_COMM_WORLD);
#endif
  }
  catch (sycl::exception const& e) {
      if (e.code() == sycl::errc::feature_not_supported) {
          // running on hardware unsupported, or ZE_AFFINITY_MASK specifies a particular tile
          if (size > 1) {
              throw std::runtime_error("Can't allocate tiles to MPI ranks.");
          }
          dev = card;
          std::cout << "[" << rank << " / " << size << "] using GPU device" << std::endl;
      }
      else {
          throw;
      }
  }
  sycl::queue main_queue(dev, exception_handler);

#ifdef HPCG_TEST_NO_HALO_EXCHANGE
#ifndef HPCG_NO_MPI
  MPI_Barrier(MPI_COMM_WORLD);
#endif
  if (rank == 0) {
    std::cout << "#################################################################################" << std::endl
              << "########## Warning: the environment flag HPCG_TEST_NO_HALO_EXCHANGE is defined" << std::endl
              << "########## from the Makefile which turns off the Halo Exchange in all " << std::endl
              << "########## kernels. This is a debugging flag and leads to incorrectness in most" << std::endl
              << "########## kernels, so should not be used." << std::endl
              << "#################################################################################" << std::endl;
  }
#endif


#if defined(HPCG_QUICK_RUN)
  return run_quick_path(main_queue, rank, size, params);
#endif

  // Check if QuickPath option is enabled.
  // If the running time is set to zero, we minimize all paths through the program
  bool quickPath = (params.runningTime==0);

#ifdef HPCG_DETAILED_DEBUG
  if (size < 100 && rank==0) HPCG_fout << "Process "<<rank<<" of "<<size<<" is alive with " << params.numThreads << " threads." <<std::endl;

  if (rank==0) {
    char c;
    std::cout << "Press key to continue"<< std::endl;
    std::cin.get(c);
  }
#ifndef HPCG_NO_MPI
  MPI_Barrier(MPI_COMM_WORLD);
#endif
#endif // HPCG_DETAILED_DEBUG

  local_int_t nx,ny,nz;
  nx = (local_int_t)params.nx;
  ny = (local_int_t)params.ny;
  nz = (local_int_t)params.nz;
  int ierr = 0;  // Used to check return codes on function calls

  ierr = CheckAspectRatio(0.125, nx, ny, nz, "local problem", rank==0);
  if (ierr)
    return ierr;

  /////////////////////////
  // Problem setup Phase //
  /////////////////////////

#ifdef HPCG_DEBUG
  double t1 = mytimer();
  if (rank == 0) std::cout << "Problem Setup Phase ..." << std::endl;
#endif

  // Construct the geometry and linear system
  Geometry * geom = new Geometry;
  GenerateGeometry(size, rank, params.numThreads, params.pz, params.zl, params.zu, nx, ny, nz, params.npx, params.npy, params.npz, geom, main_queue);
  ierr = CheckAspectRatio(0.125, geom->npx, geom->npy, geom->npz, "process grid", rank==0);
  if (ierr)
    return ierr;

  // Use this array for collecting timing information
  std::vector< double > times(10,0.0);
  double setup_time = mytimer();

  SparseMatrix A;
  InitializeSparseMatrix(A, geom);

  Vector b, x, xexact;
  Vector b_host, x_host, xexact_host;
  GenerateProblem(A, &b, &x, &xexact, main_queue, params.runRealRef);
  SetupHalo(A, main_queue);

  int numberOfMgLevels = 4; // Number of levels including first
  SparseMatrix * curLevelMatrix = &A;
  for (int level = 1; level< numberOfMgLevels; ++level) {
      GenerateCoarseProblem(*curLevelMatrix, main_queue, params.runRealRef);
      curLevelMatrix = curLevelMatrix->Ac; // Make the just-constructed coarse grid the next level
  }

  setup_time = mytimer() - setup_time; // Capture total time of setup
  times[9] = setup_time; // Save it for reporting

  curLevelMatrix = &A;
  Vector * curb = &b;
  Vector * curx = &x;
  Vector * curxexact = &xexact;
  for (int level = 0; level< numberOfMgLevels; ++level) {
     CheckProblem(*curLevelMatrix, curb, curx, curxexact, main_queue).wait();
     curLevelMatrix = curLevelMatrix->Ac; // Make the nextcoarse grid the next level
     curb = 0; // No vectors after the top level
     curx = 0;
     curxexact = 0;
  }

#ifndef HPCG_LOCAL_LONG_LONG
  curLevelMatrix = &A;
  for (int level = 0; level< numberOfMgLevels; ++level)
  {
      sycl::free(curLevelMatrix->mtxG, main_queue);
      curLevelMatrix->mtxG = nullptr;
      curLevelMatrix = curLevelMatrix->Ac;
  }
#endif

  CGData data;
  InitializeSparseCGData(A, data, main_queue, params.runRealRef);


  ////////////////////////////////////
  // Reference SpMV+MG Timing Phase //
  ////////////////////////////////////

  // Call Reference SpMV and MG. Compute Optimization time as ratio of times in these routines

  local_int_t nrow = A.localNumberOfRows;
  local_int_t ncol = A.localNumberOfColumns;

  double refTolerance = 0.0;
  bool runReferenceCodeOnHost = params.runRealRef == 1; // Need to run reference?

  if (params.runRealRef == 2) {
      //                       geom->size,  nx,          ny,          nz
      using key_t = std::tuple<local_int_t, local_int_t, local_int_t, local_int_t>;

      // Table of precomputed refTolerances
      const std::map<key_t, double> refTolMap = {
          // {{geom->size, nx, ny, nz}, refTolerance},
          {{1, 128, 128, 128},  3.94530519509819e-07},
          {{1, 256, 256, 256},  0.00082199928694637},
          {{1, 320, 320, 320},  0.0024436051131212},
          {{1, 400, 400, 400},  0.00391325809415193},
          {{1, 512, 512, 256},  0.00332344878861052},
          {{1, 416, 416, 416},  0.00397611361783948},
          {{1, 424, 424, 424},  0.00400531021070102},
          {{1, 472, 448, 376},  0.00408008807954134},

          {{2, 320, 320, 320},  0.00373795511317884},
          {{2, 512, 512, 256},  0.00370083422126461},

          {{4, 320, 320, 320},  0.00502955321041538},
          {{4, 512, 512, 256},  0.00407582849623867},

          {{8, 512, 512, 256},  0.00477607052547384},

          {{12, 512, 512, 256}, 0.00485905892445569},

          {{24, 512, 512, 256}, 0.00489143658293543},
      };

      auto itr = refTolMap.find(std::make_tuple(geom->size, params.nx, params.ny, params.nz));

      // If config not found, run reference
      if (itr == refTolMap.end())
          runReferenceCodeOnHost = true;
      else
          refTolerance = itr->second;
  }

  if (runReferenceCodeOnHost) {
    // make copy of SparseMatrix data from device to host for running reference codes
    SparseMatrix * curLevelMatrix = &A;
    for (int level = 0; level < numberOfMgLevels; ++level) {
      AllocateAndFillReferenceData(*curLevelMatrix, main_queue);
      curLevelMatrix = curLevelMatrix->Ac;
    }

    // initialize host arrays for CG_ref
    InitializeVector(b_host, nrow);
    InitializeVector(x_host, ncol);
    InitializeVector(xexact_host, nrow);

    // copy local data from device to host as well (not USM host memory, so there is registration cost, but enables later steps to use in MPI)
    main_queue.memcpy(b_host.values, b.values, nrow*sizeof(double)).wait();
    main_queue.memcpy(x_host.values, x.values, nrow*sizeof(double)).wait();
    main_queue.memcpy(xexact_host.values, xexact.values, nrow*sizeof(double)).wait();

  }

  int numberOfCalls = 10;
  if (quickPath) numberOfCalls = 1; //QuickPath means we do on one call of each block of repetitive code
  double t_begin = mytimer();
  if (runReferenceCodeOnHost) {
      Vector x_overlap_h, b_computed_h;
      InitializeVector(x_overlap_h, ncol); // Overlapped copy of x vector
      InitializeVector(b_computed_h, nrow); // Computed RHS vector
      // Record execution time of reference SpMV and MG kernels for reporting times
      // First load vector with random values
      FillRandomVector(x_overlap_h);

      for (int i = 0; i < numberOfCalls; ++i) {
          ierr = ComputeSPMV_ref(A, x_overlap_h, b_computed_h); // b_computed = A*x_overlap
          if (ierr) HPCG_fout << "Error in call to SpMV: " << ierr << ".\n" << std::endl;
          ierr = ComputeMG_ref(A, b_computed_h, x_overlap_h); // b_computed = Minv*y_overlap
          if (ierr) HPCG_fout << "Error in call to MG: " << ierr << ".\n" << std::endl;
      }
      DeleteVector(x_overlap_h);
      DeleteVector(b_computed_h);
  }
  times[8] = (mytimer() - t_begin)/((double) numberOfCalls);  // Total time divided by number of calls.
#ifdef HPCG_DEBUG
  if (rank==0) HPCG_fout << "Total SpMV+MG timing phase execution time in main (sec) = " << mytimer() - t1 << std::endl;
#endif

  ///////////////////////////////
  // Reference CG Timing Phase //
  ///////////////////////////////

#ifdef HPCG_DEBUG
  t1 = mytimer();
  if (rank == 0) std::cout << "Reference CG Timing Phase ..." << std::endl; 
#endif
  int global_failure = 0; // assume all is well: no failures

  int niters = 0;
  int totalNiters_ref = 0;
  double normr = 0.0;
  double normr0 = 0.0;
  int refMaxIters = 50;
  numberOfCalls = 1; // Only need to run the residual reduction analysis once

  std::vector< double > ref_times(9,0.0);
  double tolerance = 0.0; // Set tolerance to zero to make all runs do maxIters iterations
  int err_count = 0;

  if (params.runRealRef == 0) {
      if (rank == 0) {
          std::cout << spaces << "Not officially usable HPCG result, as convergence properties compared to reference code is not measured. For usable results, set --run-real-ref=1 as runtime option" << std::endl;
      }
  }

  if (refTolerance > 0.0) {
      if (rank == 0) HPCG_fout << "Skipping CG_ref() but using tabulated value for reference tolerance" << std::endl;
      if (rank == 0) std::cout << spaces << "Skipping CG_ref() but using tabulated value for reference residual tolerance" << std::endl;
  }

  if (runReferenceCodeOnHost)
  {
#ifdef HPCG_DEBUG
      if (rank == 0) HPCG_fout << "Calling CG_ref() to get the reference tolerance" << std::endl;
      if (rank == 0) std::cout << spaces << "Calling CG_ref() " << numberOfCalls << " time(s) with " << refMaxIters << " iters to get reference residual tolerance from x0==0" << std::endl;
#endif

      // Compute the residual reduction for the natural ordering and reference kernels
      for (int i = 0; i < numberOfCalls; ++i)
      {
          ZeroVector(x_host);
          ierr = CG_ref( A, data, b_host, x_host, refMaxIters, tolerance, niters, normr, normr0, &ref_times[0], true);
          if (ierr) ++err_count; // count the number of errors in CG
          totalNiters_ref += niters;
      }

  }

  if (params.runRealRef != 0) {

#ifdef HPCG_DEBUG
    if (rank == 0) HPCG_fout << "Cleaning up reference data on host and calling OptimizeProblem()" << std::endl;
    if (rank == 0) std::cout << spaces << "Cleaning up reference data on host" << std::endl;
#endif

    //
    // get rid of reference data on host now that we are done using it
    //
    SparseMatrix * curLevelMatrix = &A;
    for (int level = 0; level < numberOfMgLevels; ++level) {
        FreeReferenceData(*curLevelMatrix, main_queue);
        curLevelMatrix = curLevelMatrix->Ac;
    }

#ifdef HPCG_DEBUG
    if (rank == 0) std::cout << spaces << "Calling OptimizeProblem()" << std::endl;
#endif


    // Call user-tunable set up function.
    double t7 = 0.0;
    const bool need_MKL_matrix = false;
    OptimizeProblem(&A, &b, t7, need_MKL_matrix, main_queue);
    times[7] = t7;

#ifdef HPCG_DEBUG
    if (rank==0) HPCG_fout << "Total problem setup time in main (sec) = " << mytimer() - t1 << std::endl;
#endif
#ifdef HPCG_DETAILED_DEBUG
    if (geom->size == 1) WriteProblem(*geom, A, b_host, x_host, xexact_host);
#endif
  }
  else {

#ifdef HPCG_DEBUG
    if (rank == 0) HPCG_fout << "Calling OptimizeProblem() then CG() to get the optimized residual tolerance" << std::endl;
    if (rank == 0) std::cout << spaces << "Calling OptimizeProblem()" << std::endl;
#endif


    // Call user-tunable set up function.
    double t7 = 0.0;
    const bool need_MKL_matrix = false;
    OptimizeProblem(&A, &b, t7, need_MKL_matrix, main_queue);
    times[7] = t7;

#ifdef HPCG_DEBUG
    if (rank == 0) std::cout << spaces << "Calling CG() " << numberOfCalls << " time(s) with " << refMaxIters << " iters to get optimized residual tolerance from x0==0" << std::endl;
#endif

    // Compute the residual reduction for the Optimized CG from x==0
    for (int i = 0; i < numberOfCalls; ++i)
    {
        ZeroVector(x, main_queue).wait();
        ierr = CG( A, data, b, x, refMaxIters, tolerance, niters, normr, normr0, &ref_times[0], true, main_queue);
        if (ierr) ++err_count; // count the number of errors in CG
        totalNiters_ref += niters;
    }

#ifdef HPCG_DEBUG
    if (rank==0) HPCG_fout << "Total problem setup time in main (sec) = " << mytimer() - t1 << std::endl;
#endif
#ifdef HPCG_DETAILED_DEBUG
    if (geom->size == 1) WriteProblem(*geom, A, b, x, xexact);
#endif
  }

  if (rank == 0 && err_count) HPCG_fout << err_count << " error(s) in call(s) to reference CG." << std::endl;
  if (refTolerance <= 0.0) {
      refTolerance = normr / normr0;
  }

  if (rank == 0) HPCG_fout << "After that normr = " << std::setprecision(15) << normr << ", normr0 = " << normr0 << std::endl;
  if (params.runRealRef) {
      if (rank == 0) HPCG_fout << "After that refTolerance = " << std::setprecision(15) << refTolerance << std::endl;
  }
  else {
      if (rank == 0) HPCG_fout << "After that (not officially usable) refTolerance = " << std::setprecision(15) << refTolerance << std::endl;
  }
  if (rank == 0) HPCG_fout << "After that totalNiters_ref = " <<  totalNiters_ref << std::endl;

  //////////////////////////////
  // Validation Testing Phase //
  //////////////////////////////

#ifdef HPCG_DEBUG
  t1 = mytimer();
  std::cout << "Starting Validation Testing Phase ..." << std::endl;
#endif
  TestCGData testcg_data;
  testcg_data.count_pass = testcg_data.count_fail = 0;
  TestCG(A, data, b, x, testcg_data, main_queue);

  TestSymmetryData testsymmetry_data;
  TestSymmetry(A, b, xexact, testsymmetry_data, main_queue);

  // Verification check
  if (rank == 0) {
      if (testcg_data.count_fail==0)
          std::cout << spaces << "CG Verification success" << std::endl;
      else
          std::cout << spaces << "CG verification failed count_fail: " << testcg_data.count_fail << std::endl;

      if (testsymmetry_data.count_fail==0)
          std::cout << spaces << "Symmetry Verification success" << std::endl;
      else
          std::cout << spaces << "Symmetry verification failed count_fail: " << testsymmetry_data.count_fail << std::endl;
  }

#ifdef HPCG_DEBUG
  if (rank==0) HPCG_fout << "Total validation (TestCG and TestSymmetry) execution time in main (sec) = " << mytimer() - t1 << std::endl;
#endif

#ifndef HPCG_LOCAL_LONG_LONG
  curLevelMatrix = &A;
  // done with matrixDiagonal after TestCG(), so delete it
  for (int level = 0; level< numberOfMgLevels; ++level)
  {
      sycl::free(curLevelMatrix->matrixDiagonal, main_queue);
      curLevelMatrix->matrixDiagonal = nullptr;
      curLevelMatrix = curLevelMatrix->Ac;
  }
#endif

#ifdef HPCG_DEBUG
  if (rank == 0) {
      std::cout << "Starting Optimized CG Setup Phase ..." << std::endl;
      std::cout << spaces << "Goal: determine number of iters per optimized CG Set to match previously computed residual tolerance from x0==0 " << std::endl;
  }
  t1 = mytimer();
#endif

  //////////////////////////////
  // Optimized CG Setup Phase //
  //////////////////////////////

  niters = 0;
  normr = 0.0;
  normr0 = 0.0;
  err_count = 0;
  int tolerance_failures = 0;

  int optMaxIters = 10*refMaxIters;
  int optNiters = refMaxIters;
  double opt_worst_time = 0.0;

  std::vector<double> opt_times(9,0.0);

  // Compute the residual reduction and residual count for the user ordering and optimized kernels.
  for (int i=0; i< numberOfCalls; ++i) {
    ZeroVector(x, main_queue).wait(); // start x at all zeros
    double last_cummulative_time = opt_times[0];
    ierr = CG( A, data, b, x, optMaxIters, refTolerance, niters, normr, normr0, &opt_times[0], true, main_queue);
    if (ierr) ++err_count; // count the number of errors in CG
//    if (normr / normr0 >= refTolerance) ++tolerance_failures; // the number of failures to reduce residual
    // Convergence check accepts an error of no more than 6 significant digits of relTolerance
    double ff = normr/normr0-refTolerance*(1.0+1e-6);
    if (ff >= DBL_EPSILON) ++tolerance_failures; // the number of failures to reduce residual

    // pick the largest number of iterations to guarantee convergence
    if (niters > optNiters) optNiters = niters;

    double current_time = opt_times[0] - last_cummulative_time;
    if (current_time > opt_worst_time) opt_worst_time = current_time;
  }

#ifndef HPCG_NO_MPI
// Get the absolute worst time across all MPI ranks (time in CG can be different)
  double local_opt_worst_time = opt_worst_time;
  MPI_Allreduce(&local_opt_worst_time, &opt_worst_time, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
#endif

  if (rank == 0 && err_count) HPCG_fout << err_count << " error(s) in call(s) to optimized CG." << std::endl;
  if (tolerance_failures) {
    global_failure = 1;
    if (rank == 0)
      HPCG_fout << "Failed to reduce the residual " << tolerance_failures << " times." << std::endl;
  }

#ifdef HPCG_DEBUG
  if (rank == 0) {
      std::cout << spaces << "Number of iters per reference CG set: " << 50 << std::endl;
      std::cout << spaces << "Number of iters per optimized CG set: " << optNiters << std::endl;
  }
#endif

  ///////////////////////////////
  // Optimized CG Timing Phase //
  ///////////////////////////////

  // Here we finally run the benchmark phase
  // The variable total_runtime is the target benchmark execution time in seconds

  double total_runtime = params.runningTime;
  int numberOfCgSets = int(total_runtime / opt_worst_time) + 1; // Run at least once, account for rounding

#ifdef HPCG_DEBUG
  if (rank==0) {
      HPCG_fout << "Projected running time: " << total_runtime << " seconds" << std::endl;
      HPCG_fout << "Number of CG sets: " << numberOfCgSets << std::endl;

      std::cout << "Starting Optimized CG Timing Phase ..." << std::endl;
      std::cout << spaces << "Projected running time: " << total_runtime << " seconds" << std::endl;
      std::cout << spaces << "Number of CG sets     : " << numberOfCgSets << std::endl;
  }
#endif

#ifdef BASIC_PROFILING
  struct optData *optData = (struct optData *)A.optimizationData;
  // write setup etc profiling out, main profile is for the timed section
  optData->profiler->Write("hpcg_setup.json");
  optData->profiler->Clear();
#endif
  /* This is the timed run for a specified amount of time. */

  optMaxIters = optNiters;
  double optTolerance = 0.0;  // Force optMaxIters iterations
  TestNormsData testnorms_data;
  testnorms_data.samples = numberOfCgSets;
  testnorms_data.values = new double[numberOfCgSets];

  for (int i=0; i < numberOfCgSets; ++i) {
    ZeroVector(x, main_queue).wait(); // Zero out x
#ifndef HPCG_NO_MPI
    MPI_Barrier(MPI_COMM_WORLD);
#endif
#ifdef BASIC_PROFILING
    if (i==1) {
        setenv("PTI_ENABLE_COLLECTION", "1", 1);
    }
#endif
    ierr = CG( A, data, b, x, optMaxIters, optTolerance, niters, normr, normr0, &times[0], true, main_queue);
#ifdef BASIC_PROFILING
    if (i==1) {
        unsetenv("PTI_ENABLE_COLLECTION");
    }
#endif
    if (ierr) HPCG_fout << "Error in call to CG: " << ierr << ".\n" << std::endl;
    if (rank==0) HPCG_fout << "Call [" << i << "] Scaled Residual [" << normr/normr0 << "]" << std::endl;
    testnorms_data.values[i] = normr/normr0; // Record scaled residual from this run
  }

  // Compute difference between known exact solution and computed solution
  // All processors are needed here.
#ifdef HPCG_DEBUG
  double residual = 0;
  ComputeResidual(A.localNumberOfRows, x, xexact, residual, ierr, main_queue, {}).wait();
  if (ierr) HPCG_fout << "Error in call to compute_residual: " << ierr << ".\n" << std::endl;
  if (rank==0) HPCG_fout << "Difference between computed and exact  = " << residual << ".\n" << std::endl;
#endif

  // Test Norm Results
  ierr = TestNorms(testnorms_data);

  ////////////////////
  // Report Results //
  ////////////////////
  // Report results to YAML file
  ReportResults(A, numberOfMgLevels, numberOfCgSets, refMaxIters, optMaxIters, &times[0], testcg_data, testsymmetry_data, testnorms_data, global_failure, quickPath, params);
#ifdef BASIC_PROFILING
  optData->profiler->Write("hpcg.json");
#endif

  // Clean up
  DeleteMatrix(A, main_queue); // This delete will recursively delete all coarse grid data
  DeleteCGData(data, main_queue);

  DeleteVector(x, main_queue);
  DeleteVector(b, main_queue);
  DeleteVector(xexact, main_queue);

  DeleteVector(x_host);
  DeleteVector(b_host);
  DeleteVector(xexact_host);

  delete [] testnorms_data.values;

  HPCG_Finalize();
  // Finish up
#ifndef HPCG_NO_MPI
  MPI_Finalize();
#endif
  return 0;

}
