/*******************************************************************************
 * Copyright 2012 Intel Corporation.
 *
 *
 * This software and the related documents are Intel copyrighted materials, and your use of them is governed by
 * the express license under which they were provided to you ('License'). Unless the License provides otherwise,
 * you may not use, modify, copy, publish, distribute, disclose or transmit this software or the related
 * documents without Intel's prior written permission.
 * This software and the related documents are provided as is, with no express or implied warranties, other than
 * those that are expressly stated in the License.
 *******************************************************************************/

/* Intel(R) Integrated Performance Primitives (Intel(R) IPP) */

#include <math.h>
#include <memory>

#include "vm_thread.h"

#include "base.h"
#include "base_image.h"
#include "base_ipp.h"
#include "base_renderer.h"

#include "ipp/ippcore.h"
#include "ipp/ipps.h"
#include "ipp/ippi.h"

#ifdef USE_TBB
  // #define __TBB_NO_IMPLICIT_LINKAGE 1
  #define TBB_PREVIEW_MEMORY_POOL 1
  #include "tbb/task_arena.h"
  #include "tbb/parallel_for.h"
  #include "tbb/blocked_range2d.h"
  #include "tbb/memory_pool.h"
  #define TBB_PREVIEW_GLOBAL_CONTROL 1
  #include "tbb/global_control.h"

using namespace tbb;
#endif

#define CMD_METHOD_TBB 0
#define CMD_METHOD_VM  1

// Prints a banner followed by version information for the Intel IPP libraries
// used by this example. The actual query/formatting is done by the
// PRINT_LIB_VERSION macro (defined outside this file), invoked once with an
// empty suffix and once each with the "s" and "i" suffixes.
// NOTE(review): presumably the suffix selects the core / signal / image
// library (matching the ippcore.h, ipps.h and ippi.h includes) - confirm
// against the macro definition.
static void printVersion()
{
    // Scratch pointer handed to every PRINT_LIB_VERSION invocation below.
    const IppLibraryVersion *pVersion;
    printf("\nIntel(R) IPP:\n");
    PRINT_LIB_VERSION(, pVersion)
    PRINT_LIB_VERSION(s, pVersion)
    PRINT_LIB_VERSION(i, pVersion)
}

// Writes the command-line synopsis to stdout, followed by the description of
// every option in pOptions (formatted by cmd::OptUsage).
//   pOptions - option table that was passed to the command-line parser
//   argv     - program arguments; used only to derive the program name
static void printHelp(const cmd::OptDef pOptions[], char *argv[])
{
    // Synopsis line first ...
    printf("\nUsage: %s [-i] InputFile [[-o] OutputFile] [Options]\n", GetProgName(argv));

    // ... then the option list under its heading.
    fputs("Options:\n", stdout);
    cmd::OptUsage(pOptions);
}

// Sequential "harmonization" filter for 8-bit images with 1, 3 or 4 channels,
// built from Intel IPP primitives. For every processed block the pipeline is:
//   1. tmp = box filter of src (mask m_filterBoxMask, border handling as requested)
//   2. tmp = tmp * m_iMulVal1, IPP integer scale factor 12
//   3. dst = src - tmp
//   4. dst = dst * m_iMulVal2, IPP integer scale factor 8
//   5. dst samples below m_iThresLTVal are set to m_iThresLTVal and samples
//      above m_iThresGTVal are set to m_iThresGTVal
// The class also serves as the base for the threaded variants, which call
// HarmonizeBlock() on sub-rectangles of the image.
class Harmonize
{
public:
    Harmonize()
    {
        // 0 is interpreted as "pick automatically" by the threaded subclasses.
        m_iThreads = 0;

        // 7x7 box mask with the anchor in its centre.
        m_filterBoxMask.width = 7;
        m_filterBoxMask.height = 7;
        m_filterBoxAnchor.x = 3;
        m_filterBoxAnchor.y = 3;

        // Per-channel constants; all channels share the same value by default.
        m_iMulVal1[0] = m_iMulVal1[1] = m_iMulVal1[2] = m_iMulVal1[3] = 255;
        m_iMulVal2[0] = m_iMulVal2[1] = m_iMulVal2[2] = m_iMulVal2[3] = 205;
        m_iThresLTVal[0] = m_iThresLTVal[1] = m_iThresLTVal[2] = m_iThresLTVal[3] = 6;
        m_iThresGTVal[0] = m_iThresGTVal[1] = m_iThresGTVal[2] = m_iThresGTVal[3] = 250;
    }

    virtual ~Harmonize() { Close(); }

    // Releases resources acquired by Init(). The base class owns nothing that
    // needs explicit release; subclasses override this.
    virtual void Close() {}

    // Prepares the object for images laid out like pSrcImage: allocates the
    // intermediate image and remembers the source as the template used to
    // detect a later change of input.
    // Returns STS_ERR_NULL_PTR for missing arguments / missing source data,
    // STS_ERR_INVALID_PARAMS for an unsupported channel count, or the status
    // reported by Image::Alloc() if the intermediate image cannot be allocated.
    virtual Status Init(Image *pSrcImage, Image *pDstImage)
    {
        if (!pSrcImage || !pSrcImage->ptr() || !pDstImage)
            return STS_ERR_NULL_PTR;

        if (pSrcImage->m_samples != 1 && pSrcImage->m_samples != 3 && pSrcImage->m_samples != 4)
            return STS_ERR_INVALID_PARAMS;

        m_tmpImage = *pSrcImage;
        Status status = m_tmpImage.Alloc();
        CHECK_STATUS_PRINT_RS(status, "Image::Alloc()", GetBaseStatusString(status));

        // Only remember the template once the object is fully usable, so a
        // failed Init() is retried on the next call.
        m_templ = *pSrcImage;

        return STS_OK;
    }

    // Runs the harmonization pipeline on the rectangle roi of the image.
    //   pSrcImage  - source image
    //   pDstImage  - destination image (receives the result for roi)
    //   roi        - rectangle to process; a zero width/height means "use the
    //                full destination width/height"
    //   border     - IPP border type used by the box filter
    //   pExtBuffer - optional caller-owned work buffer for the box filter;
    //                when 0 a temporary buffer is allocated and released here
    // Returns STS_OK on success or an error status otherwise.
    Status HarmonizeBlock(Image *pSrcImage, Image *pDstImage, Rect roi, IppiBorderType border, unsigned char *pExtBuffer = 0)
    {
        Status status;
        IppStatus ippSts;
        IppiSize dstRoiSize = {(int)roi.width, (int)roi.height};

        unsigned char *pBuffer = pExtBuffer;
        int iBufferSize = 0;

        if (!pSrcImage || !pDstImage)
            return STS_ERR_NULL_PTR;

        // A zero size means the full size of the destination
        if (!dstRoiSize.width)
            dstRoiSize.width = (int)pDstImage->m_size.width;
        if (!dstRoiSize.height)
            dstRoiSize.height = (int)pDstImage->m_size.height;

        // (Re)initialize when the input no longer matches the remembered template
        if (m_templ != *pSrcImage) {
            status = Init(pSrcImage, pDstImage);
            CHECK_STATUS_PRINT_RS(status, "Harmonize::Init()", GetBaseStatusString(status));
        }

        if (!pBuffer) {
            ippSts = ippiFilterBoxBorderGetBufferSize(dstRoiSize, m_filterBoxMask, ipp8u, pSrcImage->m_samples, &iBufferSize);
            CHECK_STATUS_PRINT_AC(ippSts, "ippiFilterBoxBorderGetBufferSize()", ippGetStatusString(ippSts), return STS_ERR_FAILED);

            pBuffer = ippsMalloc_8u(iBufferSize);
            if (!pBuffer) {
                PRINT_MESSAGE("Cannot allocate memory for FilterBox buffer");
                return STS_ERR_ALLOC;
            }
        }

        // The pipeline lives in a helper so that the temporary buffer is
        // released on every exit path, including IPP failures.
        status = HarmonizeBlockImpl(pSrcImage, pDstImage, roi, dstRoiSize, border, pBuffer);

        if (!pExtBuffer)
            ippsFree(pBuffer);

        return status;
    }

    // Harmonizes the whole image in a single HarmonizeBlock() call using a
    // replicated border.
    virtual Status HarmonizeImage(Image *pSrcImage, Image *pDstImage)
    {
        if (!pSrcImage || !pDstImage)
            return STS_ERR_NULL_PTR;

        Rect roi(pDstImage->m_size.width, pDstImage->m_size.height);

        return HarmonizeBlock(pSrcImage, pDstImage, roi, ippBorderRepl);
    }

private:
    // Executes the filter pipeline for one block. pBuffer must be a valid work
    // buffer for the box filter; it is never allocated or released here.
    // Returns STS_ERR_FAILED if any IPP call fails and STS_ERR_INVALID_PARAMS
    // for a channel count other than 1, 3 or 4.
    Status HarmonizeBlockImpl(Image *pSrcImage, Image *pDstImage, const Rect &roi, IppiSize dstRoiSize, IppiBorderType border,
                              unsigned char *pBuffer)
    {
        IppStatus ippSts;
        unsigned char iBorderValue = 0;

        // adjust input, output and intermediate pointers to the current ROI
        unsigned char *pSrcPtr = (unsigned char *)pSrcImage->ptr((size_t)roi.y, (size_t)roi.x);
        unsigned char *pDstPtr = (unsigned char *)pDstImage->ptr((size_t)roi.y, (size_t)roi.x);
        unsigned char *pTmpPtr = (unsigned char *)m_tmpImage.ptr((size_t)roi.y, (size_t)roi.x);

        if (pSrcImage->m_samples == 1) {
            // Apply box filter
            ippSts = ippiFilterBoxBorder_8u_C1R(pSrcPtr, (int)pSrcImage->m_step, pTmpPtr, (int)m_tmpImage.m_step, dstRoiSize, m_filterBoxMask, border,
                                                &iBorderValue, pBuffer);
            CHECK_STATUS_PRINT_AC(ippSts, "ippiFilterBoxBorder_8u_C1R()", ippGetStatusString(ippSts), return STS_ERR_FAILED);

            // Multiply by a constant in-place
            ippSts = ippiMulC_8u_C1RSfs(pTmpPtr, (int)m_tmpImage.m_step, m_iMulVal1[0], pTmpPtr, (int)m_tmpImage.m_step, dstRoiSize, 12);
            CHECK_STATUS_PRINT_AC(ippSts, "ippiMulC_8u_C1RSfs()", ippGetStatusString(ippSts), return STS_ERR_FAILED);

            // Subtract the scaled blur from the source
            ippSts =
                ippiSub_8u_C1RSfs(pTmpPtr, (int)m_tmpImage.m_step, pSrcPtr, (int)pSrcImage->m_step, pDstPtr, (int)pDstImage->m_step, dstRoiSize, 0);
            CHECK_STATUS_PRINT_AC(ippSts, "ippiSub_8u_C1RSfs()", ippGetStatusString(ippSts), return STS_ERR_FAILED);

            // Multiply by a constant in-place
            ippSts = ippiMulC_8u_C1RSfs(pDstPtr, (int)pDstImage->m_step, m_iMulVal2[0], pDstPtr, (int)pDstImage->m_step, dstRoiSize, 8);
            CHECK_STATUS_PRINT_AC(ippSts, "ippiMulC_8u_C1RSfs()", ippGetStatusString(ippSts), return STS_ERR_FAILED);

            // Apply threshold (threshold and replacement value are the same)
            ippSts = ippiThreshold_LTValGTVal_8u_C1R(pDstPtr, (int)pDstImage->m_step, pDstPtr, (int)pDstImage->m_step, dstRoiSize, m_iThresLTVal[0],
                                                     m_iThresLTVal[0], m_iThresGTVal[0], m_iThresGTVal[0]);
            CHECK_STATUS_PRINT_AC(ippSts, "ippiThreshold_LTValGTVal_8u_C1R()", ippGetStatusString(ippSts), return STS_ERR_FAILED);
        } else if (pSrcImage->m_samples == 3) {
            // Apply box filter
            ippSts = ippiFilterBoxBorder_8u_C3R(pSrcPtr, (int)pSrcImage->m_step, pTmpPtr, (int)m_tmpImage.m_step, dstRoiSize, m_filterBoxMask, border,
                                                &iBorderValue, pBuffer);
            CHECK_STATUS_PRINT_AC(ippSts, "ippiFilterBoxBorder_8u_C3R()", ippGetStatusString(ippSts), return STS_ERR_FAILED);

            // Multiply by a constant in-place
            ippSts = ippiMulC_8u_C3RSfs(pTmpPtr, (int)m_tmpImage.m_step, m_iMulVal1, pTmpPtr, (int)m_tmpImage.m_step, dstRoiSize, 12);
            CHECK_STATUS_PRINT_AC(ippSts, "ippiMulC_8u_C3RSfs()", ippGetStatusString(ippSts), return STS_ERR_FAILED);

            // Subtract the scaled blur from the source
            ippSts =
                ippiSub_8u_C3RSfs(pTmpPtr, (int)m_tmpImage.m_step, pSrcPtr, (int)pSrcImage->m_step, pDstPtr, (int)pDstImage->m_step, dstRoiSize, 0);
            CHECK_STATUS_PRINT_AC(ippSts, "ippiSub_8u_C3RSfs()", ippGetStatusString(ippSts), return STS_ERR_FAILED);

            // Multiply by a constant in-place
            ippSts = ippiMulC_8u_C3RSfs(pDstPtr, (int)pDstImage->m_step, m_iMulVal2, pDstPtr, (int)pDstImage->m_step, dstRoiSize, 8);
            CHECK_STATUS_PRINT_AC(ippSts, "ippiMulC_8u_C3RSfs()", ippGetStatusString(ippSts), return STS_ERR_FAILED);

            // Apply threshold (threshold and replacement value are the same)
            ippSts = ippiThreshold_LTValGTVal_8u_C3R(pDstPtr, (int)pDstImage->m_step, pDstPtr, (int)pDstImage->m_step, dstRoiSize, m_iThresLTVal,
                                                     m_iThresLTVal, m_iThresGTVal, m_iThresGTVal);
            CHECK_STATUS_PRINT_AC(ippSts, "ippiThreshold_LTValGTVal_8u_C3R()", ippGetStatusString(ippSts), return STS_ERR_FAILED);
        } else if (pSrcImage->m_samples == 4) {
            // Apply box filter
            ippSts = ippiFilterBoxBorder_8u_C4R(pSrcPtr, (int)pSrcImage->m_step, pTmpPtr, (int)m_tmpImage.m_step, dstRoiSize, m_filterBoxMask, border,
                                                &iBorderValue, pBuffer);
            CHECK_STATUS_PRINT_AC(ippSts, "ippiFilterBoxBorder_8u_C4R()", ippGetStatusString(ippSts), return STS_ERR_FAILED);

            // Multiply by a constant in-place
            ippSts = ippiMulC_8u_C4RSfs(pTmpPtr, (int)m_tmpImage.m_step, m_iMulVal1, pTmpPtr, (int)m_tmpImage.m_step, dstRoiSize, 12);
            CHECK_STATUS_PRINT_AC(ippSts, "ippiMulC_8u_C4RSfs()", ippGetStatusString(ippSts), return STS_ERR_FAILED);

            // Subtract the scaled blur from the source
            ippSts =
                ippiSub_8u_C4RSfs(pTmpPtr, (int)m_tmpImage.m_step, pSrcPtr, (int)pSrcImage->m_step, pDstPtr, (int)pDstImage->m_step, dstRoiSize, 0);
            CHECK_STATUS_PRINT_AC(ippSts, "ippiSub_8u_C4RSfs()", ippGetStatusString(ippSts), return STS_ERR_FAILED);

            // Multiply by a constant in-place
            ippSts = ippiMulC_8u_C4RSfs(pDstPtr, (int)pDstImage->m_step, m_iMulVal2, pDstPtr, (int)pDstImage->m_step, dstRoiSize, 8);
            CHECK_STATUS_PRINT_AC(ippSts, "ippiMulC_8u_C4RSfs()", ippGetStatusString(ippSts), return STS_ERR_FAILED);

            // Apply threshold. Note this is the AC4 flavour, unlike the C4
            // calls above. NOTE(review): per IPP naming AC4 skips the alpha
            // channel - confirm this asymmetry is intended.
            ippSts = ippiThreshold_LTValGTVal_8u_AC4R(pDstPtr, (int)pDstImage->m_step, pDstPtr, (int)pDstImage->m_step, dstRoiSize, m_iThresLTVal,
                                                      m_iThresLTVal, m_iThresGTVal, m_iThresGTVal);
            CHECK_STATUS_PRINT_AC(ippSts, "ippiThreshold_LTValGTVal_8u_AC4R()", ippGetStatusString(ippSts), return STS_ERR_FAILED);
        } else {
            // Same restriction as Init(): do not report success for a layout
            // that was not processed at all.
            return STS_ERR_INVALID_PARAMS;
        }

        return STS_OK;
    }

public:
    unsigned int m_iThreads; // requested number of threads (0 - automatic); unused by the sequential path

    IppiSize m_filterBoxMask;    // box filter mask size
    IppiPoint m_filterBoxAnchor; // mask anchor; set in the constructor, not read by this class

    unsigned char m_iMulVal1[4];    // per-channel multiplier applied to the blurred image
    unsigned char m_iMulVal2[4];    // per-channel multiplier applied after the subtraction
    unsigned char m_iThresLTVal[4]; // per-channel lower threshold / replacement value
    unsigned char m_iThresGTVal[4]; // per-channel upper threshold / replacement value

protected:
    Image m_templ;    // copy of the source image descriptor the object was initialized for
    Image m_tmpImage; // intermediate image holding the blurred/scaled data
};

class HarmonizeVM : public Harmonize
{
public:
    HarmonizeVM()
    {
        m_pThreads = 0;
        m_pParams = 0;
    }

    virtual ~HarmonizeVM() { Close(); }

    virtual void Close()
    {
        unsigned int i;

        Harmonize::Close();

        if (m_pThreads) {
            for (i = 0; i < m_iThreads; i++) {
                m_pParams[i].bQuit = true;
                vm_event_set(&m_pParams[i].eStart, 1);
            }
            for (i = 0; i < m_iThreads; i++) {
                vm_thread_destroy(&m_pThreads[i]);

                if (m_pParams[i].pBuffer)
                    ippsFree(m_pParams[i].pBuffer);
            }

            delete[] m_pThreads;
            delete[] m_pParams;
            m_pThreads = 0;
            m_pParams = 0;
        }
    }

    virtual Status Init(Image *pSrcImage, Image *pDstImage)
    {
        IppStatus ippSts;
        Status status;

        status = Harmonize::Init(pSrcImage, pDstImage);
        CHECK_STATUS_PRINT_RS(status, "Harmonize::Init()", GetBaseStatusString(status));

        if (m_iThreads == 0) // automatic threads number
            m_iThreads = vm_sys_info_get_avail_cpu_num();
        if (m_iThreads == 0) // failsafe
            m_iThreads = 1;

        m_pThreads = new vm_thread[m_iThreads];
        m_pParams = new TaskParams[m_iThreads];
        if (!m_pThreads || !m_pParams)
            return STS_ERR_ALLOC;

        unsigned int iBlockSize = (unsigned int)(pSrcImage->m_size.height / m_iThreads);
        unsigned int iRemainder = (unsigned int)(pSrcImage->m_size.height - iBlockSize * m_iThreads);

        IppiSize roiSize;
        int iBufferSize = 0;

        for (unsigned int i = 0; i < m_iThreads; i++) {
            m_pParams[i].roi.x = 0;
            m_pParams[i].roi.y = i * iBlockSize;
            m_pParams[i].roi.width = pSrcImage->m_size.width;
            m_pParams[i].roi.height = (i == m_iThreads) ? iRemainder : iBlockSize;

            if (i != 0) // non-top
                m_pParams[i].border = (IppiBorderType)(m_pParams[i].border | ippBorderInMemTop);
            if (i != (m_iThreads - 1)) // non-bottom
                m_pParams[i].border = (IppiBorderType)(m_pParams[i].border | ippBorderInMemBottom);

            roiSize.width = (int)m_pParams[i].roi.width;
            roiSize.height = (int)m_pParams[i].roi.height;

            ippSts = ippiFilterBoxBorderGetBufferSize(roiSize, m_filterBoxMask, ipp8u, pSrcImage->m_samples, &iBufferSize);
            CHECK_STATUS_PRINT_AC(ippSts, "ippiFilterBoxBorderGetBufferSize()", ippGetStatusString(ippSts), return STS_ERR_FAILED);

            m_pParams[i].pBuffer = ippsMalloc_8u(iBufferSize);
            if (!m_pParams[i].pBuffer)
                return STS_ERR_ALLOC;

            m_pParams[i].pSrcData = pSrcImage;
            m_pParams[i].pDstData = pDstImage;
            m_pParams[i].pHarmonize = this;
            m_pParams[i].bQuit = false;

            vm_thread_construct(&m_pThreads[i]);
            if (vm_thread_create(&m_pThreads[i], HarmonizeVMTask, &m_pParams[i]) < VM_OK)
                return STS_ERR_INIT;
        }

        return STS_OK;
    }

    Status HarmonizeImage(Image *pSrcImage, Image *pDstImage)
    {
        unsigned int i;

        if (!pSrcImage || !pDstImage)
            return STS_ERR_NULL_PTR;

        if (m_templ != *pSrcImage) {
            Status status = Init(pSrcImage, pDstImage);
            CHECK_STATUS_PRINT_RS(status, "Harmonize::Init()", GetBaseStatusString(status));
        }

        for (i = 0; i < m_iThreads; i++)
            vm_event_set(&m_pParams[i].eStart, 1);

        for (i = 0; i < m_iThreads; i++)
            vm_event_wait(&m_pParams[i].eEnd, VM_WAIT_INFINITE);

        return STS_OK;
    }

private:
    HarmonizeVM(const HarmonizeVM &other);
    HarmonizeVM &operator=(const HarmonizeVM &other);

    struct TaskParams {
        TaskParams()
        {
            vm_event_construct(&eStart);
            vm_event_construct(&eEnd);
            vm_event_init(&eStart, 0, 0);
            vm_event_init(&eEnd, 0, 0);

            pBuffer = NULL;
            border = ippBorderRepl;

            bQuit = false;
            pHarmonize = NULL;
            pSrcData = NULL;
            pDstData = NULL;
            roi = Rect();
        }

        ~TaskParams()
        {
            vm_event_destroy(&eStart);
            vm_event_destroy(&eEnd);
        }

        unsigned char *pBuffer;
        IppiBorderType border;

        Harmonize *pHarmonize;
        Image *pSrcData;
        Image *pDstData;
        Rect roi;

        vm_event eStart;
        vm_event eEnd;
        volatile bool bQuit;

    private:
        TaskParams(const TaskParams &other);
        TaskParams &operator=(const TaskParams &other);
    };

    static unsigned int VM_THREAD_CALLCONVENTION HarmonizeVMTask(void *pParams)
    {
        Status status;
        TaskParams *pPar = (TaskParams *)pParams;

        while (!pPar->bQuit) {
            vm_event_wait(&pPar->eStart, VM_WAIT_INFINITE);
            if (pPar->bQuit)
                break;

            status = pPar->pHarmonize->HarmonizeBlock(pPar->pSrcData, pPar->pDstData, pPar->roi, pPar->border, pPar->pBuffer);
            if (status != STS_OK)
                return 1;

            vm_event_set(&pPar->eEnd, 1);
        }

        return 0;
    }

private:
    vm_thread *m_pThreads;
    TaskParams *m_pParams;
};

#ifdef USE_TBB
// Harmonization parallelized with Intel TBB: the destination image is split
// into 2D tiles with tbb::blocked_range2d and each tile is processed by
// Harmonize::HarmonizeBlock() inside tbb::parallel_for. Box-filter work
// buffers are taken from a tbb::memory_pool owned by this object.
class HarmonizeTBB : public Harmonize
{
public:
    HarmonizeTBB()
    {
        // 0 means "derive the grain size from the image size and thread count"
        m_iGrainX = 0;
        m_iGrainY = 0;
    }

    // Initializes the base class, resolves the automatic thread count and
    // grain sizes, and binds the parallel_for body to the given images.
    virtual Status Init(Image *pSrcImage, Image *pDstImage)
    {
        Status status;

        status = Harmonize::Init(pSrcImage, pDstImage);
        CHECK_STATUS_PRINT_RS(status, "Harmonize::Init()", GetBaseStatusString(status));

        if (m_iThreads == 0) // automatic threads number
        {
  #if TBB_INTERFACE_VERSION >= 9100
            m_iThreads = this_task_arena::max_concurrency();
  #else
            m_iThreads = task_arena::max_concurrency();
  #endif
        }
        if (m_iThreads == 0) // failsafe, m_iThreads is used as a divisor below
            m_iThreads = 1;

        if (!m_iGrainX)
            m_iGrainX = (unsigned int)(pDstImage->m_size.width + m_iThreads - 1) / m_iThreads;

        if (!m_iGrainY)
            m_iGrainY = (unsigned int)(pDstImage->m_size.height + m_iThreads - 1) / m_iThreads;

        m_task = HarmonizeTBBTask(this, pSrcImage, pDstImage);

        return STS_OK;
    }

    // Harmonizes the whole image with parallel_for over 2D tiles. Failures
    // inside a tile are thrown as Status values by the task and converted
    // back into a return code here.
    Status HarmonizeImage(Image *pSrcImage, Image *pDstImage)
    {
        if (!pSrcImage || !pDstImage)
            return STS_ERR_NULL_PTR;

        if (m_templ != *pSrcImage) {
            Status status = Init(pSrcImage, pDstImage);
            CHECK_STATUS_PRINT_RS(status, "Harmonize::Init()", GetBaseStatusString(status));
        }

        blocked_range2d<unsigned int, unsigned int> tbbRange(0, (unsigned int)pDstImage->m_size.height, m_iGrainY, 0,
                                                             (unsigned int)pDstImage->m_size.width, m_iGrainX);

        try {
            // Limits TBB parallelism to m_iThreads for the duration of this scope
            global_control set_num_threads(global_control::max_allowed_parallelism, m_iThreads);
            parallel_for(tbbRange, m_task, m_part_auto);
        } catch (Status status) {
            return status;
        }

        return STS_OK;
    }

private:
    // parallel_for body: processes one tile of the image.
    class HarmonizeTBBTask
    {
    public:
        HarmonizeTBBTask() : m_pHarmonize(nullptr), m_pSrcData(nullptr), m_pDstData(nullptr) {}

        HarmonizeTBBTask(HarmonizeTBB *const pHarmonize, Image *const pSrcData, Image *const pDstData)
            : m_pHarmonize(pHarmonize), m_pSrcData(pSrcData), m_pDstData(pDstData)
        {
        }

        // Processes the tile r. Throws a Status value on failure.
        void operator()(blocked_range2d<unsigned int, unsigned int> &r) const
        {
            IppStatus ippSts;
            Status status;
            Rect roi;
            IppiSize dstRoiSize = {0};
            IppiBorderType border = ippBorderRepl;

            unsigned char *pBuffer = 0;
            int iBufferSize = 0;

            roi.x = r.cols().begin();
            roi.y = r.rows().begin();
            roi.width = r.cols().end() - r.cols().begin();
            roi.height = r.rows().end() - r.rows().begin();

            dstRoiSize.width = (int)roi.width;
            dstRoiSize.height = (int)roi.height;

            ippSts = ippiFilterBoxBorderGetBufferSize(dstRoiSize, m_pHarmonize->m_filterBoxMask, ipp8u, m_pSrcData->m_samples, &iBufferSize);
            CHECK_STATUS_PRINT_AC(ippSts, "ippiFilterBoxBorderGetBufferSize()", ippGetStatusString(ippSts), throw(STS_ERR_FAILED));

            pBuffer = (unsigned char *)m_pHarmonize->m_memPool.malloc(iBufferSize);
            if (!pBuffer) {
                PRINT_MESSAGE("Cannot allocate memory for FilterBox buffer");
                throw(STS_ERR_ALLOC);
            }

            // Tile edges that are not image edges have real pixels next to them
            if (roi.x != 0) // non-left
                border = (IppiBorderType)(border | ippBorderInMemLeft);
            if (roi.y != 0) // non-top
                border = (IppiBorderType)(border | ippBorderInMemTop);
            if (r.cols().end() != m_pSrcData->m_size.width) // non-right
                border = (IppiBorderType)(border | ippBorderInMemRight);
            if (r.rows().end() != m_pSrcData->m_size.height) // non-bottom
                border = (IppiBorderType)(border | ippBorderInMemBottom);

            status = m_pHarmonize->HarmonizeBlock(m_pSrcData, m_pDstData, roi, border, pBuffer);

            // Return the buffer to the pool before reporting the result so
            // that a failed tile does not keep its buffer allocated.
            m_pHarmonize->m_memPool.free(pBuffer);

            if (status != STS_OK)
                throw(status);
        }

        HarmonizeTBB *m_pHarmonize;
        Image *m_pSrcData;
        Image *m_pDstData;
    };

public:
    unsigned int m_iGrainX; // tile width passed as the column grain size (0 - automatic)
    unsigned int m_iGrainY; // tile height passed as the row grain size (0 - automatic)

private:
    HarmonizeTBBTask m_task;
    auto_partitioner m_part_auto;
    memory_pool<scalable_allocator<unsigned char>> m_memPool; // source of per-tile work buffers
};
#endif

// Example entry point: parses the command line, reads the input image, runs
// the selected harmonization implementation (sequential, native threads or
// TBB) repeatedly while measuring time, prints the timing, optionally writes
// the result to a file and optionally shows it in a window.
// Returns 0 on success, 1 on invalid parameters and a negative Status value
// when processing fails.
int main(int argc, char *argv[])
{
    /*
    // Variables initialization
    */
    Status status = STS_OK;
    DString sInputFile = CheckTestDirs(BMP_GRAYSCALE_FILE);
    DString sOutputFile;
    char *sIppCpu = 0;
    unsigned int iThreads = 0;
    bool bNoWindow = false;
    bool bPrintHelp = false;

    Image srcData;
    Image dstData;
#ifdef USE_TBB
    const char *sMethod = "tbb";
#else
    const char *sMethod = "native";
#endif
    int iMethod;
    Harmonize *pHarmonize = 0;

    // General timing
    vm_tick tickStart = 0;
    vm_tick tickAcc = 0;
    vm_tick tickFreq = vm_time_get_frequency() / 1000; // ticks per millisecond
    double fTime = 0;
    unsigned int iTimeLimit = 0;
    unsigned int iLoops = 0;
    unsigned int iLoopsLimit = 0;

    /*
    // Cmd parsing
    */
    const cmd::OptDef cmdOpts[] = {{'i', "", 1, cmd::KT_DSTRING, cmd::KF_OPTIONAL, &sInputFile, "input file name"},
                                   {'o', "", 1, cmd::KT_DSTRING, cmd::KF_OPTIONAL, &sOutputFile, "output file name"},
#ifdef USE_TBB
                                   {'m', "", 1, cmd::KT_STRING, 0, &sMethod, "threading method: tbb (default), native"},
#endif
                                   {'t', "", 1, cmd::KT_INTEGER, 0, &iThreads, "number of threads (0 - auto, 0 by default)"},
#if defined(ENABLE_RENDERING)
                                   {'s', "", 1, cmd::KT_BOOL, 0, &bNoWindow, "suppress window output"},
#endif
                                   {'w', "", 1, cmd::KT_POSITIVE, 0, &iTimeLimit, "minimum test time in milliseconds"},
                                   {'l', "", 1, cmd::KT_POSITIVE, 0, &iLoopsLimit, "number of loops (overrides test time)"},
                                   {'T', "", 1, cmd::KT_STRING, 0, &sIppCpu, "target Intel(R) IPP optimization (" IPP_OPT_LIST ")"},
                                   {'h', "", 1, cmd::KT_BOOL, 0, &bPrintHelp, "print help and exit"},
                                   {0}};

    if (cmd::OptParse(argc, argv, cmdOpts) || iThreads > 90) // 90 is some large number preventing from too many threads
    {
        printHelp(cmdOpts, argv);
        PRINT_MESSAGE("invalid input parameters");
        return 1;
    }

    InitPreferredCpu(sIppCpu);

    printVersion();

    // Check default image availability: if the default image is requested but
    // missing, fall back to printing the help. Only ever raise the flag here,
    // so an explicit -h is not cancelled when the default image exists.
    if (!strcmp(sInputFile.c_str(), BMP_GRAYSCALE_FILE)) {
        if (-1 == vm_file_access(sInputFile.c_str(), 0))
            bPrintHelp = true;
    }

    if (bPrintHelp) {
        printHelp(cmdOpts, argv);
        return 0;
    }

    if (!sInputFile.Size()) {
        printHelp(cmdOpts, argv);
        PRINT_MESSAGE("Cannot open input file");
        return 1;
    }

    if (!vm_string_stricmp(sMethod, "tbb"))
        iMethod = CMD_METHOD_TBB;
    else if (!vm_string_stricmp(sMethod, "native"))
        iMethod = CMD_METHOD_VM;
    else {
        printHelp(cmdOpts, argv);
        PRINT_MESSAGE("Invalid threading method");
        return 1;
    }

    // Single-pass loop: "break" is used to jump to the common cleanup below.
    for (;;) {
        // Read from file
        printf("\nInput file: %s\n", sInputFile.c_str());
        status = srcData.Read(sInputFile.c_str());
        CHECK_STATUS_PRINT_BR(status, "Image::Read()", GetBaseStatusString(status));
        printf("Input info: %dx%d %s\n", (int)srcData.m_size.width, (int)srcData.m_size.height, colorFormatName[srcData.m_color]);

        // Prepare destination buffer
        dstData = srcData;
        status = dstData.Alloc();
        CHECK_STATUS_PRINT_BR(status, "Image::Alloc()", GetBaseStatusString(status));

        printf("\nOutput file: %s\n", (sOutputFile.Size()) ? sOutputFile.c_str() : "-");
        printf("Output info: %dx%d %s\n", (int)dstData.m_size.width, (int)dstData.m_size.height, colorFormatName[dstData.m_color]);

        // Select the implementation: exactly one thread means the sequential class
        if (iThreads == 1) {
            printf("\nSequential harmonization\n");
            pHarmonize = new Harmonize;
            if (!pHarmonize) {
                PRINT_MESSAGE("Failed to allocate sequential harmonization class");
                return 1;
            }
        } else {
            if (iMethod == CMD_METHOD_VM) {
                printf("\nNative threaded harmonization\n");
                pHarmonize = new HarmonizeVM;
            }
#ifdef USE_TBB
            else if (iMethod == CMD_METHOD_TBB) {
                printf("\nIntel(R) TBB harmonization\n");
                try {
                    pHarmonize = new HarmonizeTBB;
                    if (!pHarmonize) {
                        PRINT_MESSAGE("Failed to allocate Intel TBB harmonization class");
                        return 1;
                    }
                } catch (const std::exception &stdException) {
                    PRINT_MESSAGE(stdException.what());
                    return 1;
                }
            }
#endif
            else {
                PRINT_MESSAGE("Invalid threading method");
                return 1;
            }

            pHarmonize->m_iThreads = iThreads;
        }

        // pre-init, so that one-time setup is not part of the measured time
        status = pHarmonize->Init(&srcData, &dstData);
        CHECK_STATUS_PRINT_BR(status, "Harmonize::Init()", GetBaseStatusString(status));

        if (iThreads != 1) {
            // Report the thread count actually chosen by the implementation
            iThreads = pHarmonize->m_iThreads;
            printf("Threads: %u\n", iThreads);
        }

        // Measurement loop: runs until the loop limit is reached or, when no
        // loop limit is given, until the accumulated time reaches the time limit.
        for (iLoops = 1, tickAcc = 0;; iLoops++) {
            tickStart = vm_time_get_tick();
            status = pHarmonize->HarmonizeImage(&srcData, &dstData);
            tickAcc += (vm_time_get_tick() - tickStart);

            CHECK_STATUS_PRINT_BR(status, "Harmonize::HarmonizeImage()", GetBaseStatusString(status));

            fTime = (double)tickAcc / tickFreq;
            if (iLoopsLimit) {
                if (iLoops >= iLoopsLimit)
                    break;
            } else {
                if (fTime >= iTimeLimit)
                    break;
            }
        }
        if (status < 0)
            break;

        /*
        // Results output
        */
        printf("\nLoops:      %u\n", iLoops);
        printf("Time total: %0.3fms\n", fTime);
        printf("Loop avg:   %0.3fms\n", fTime / iLoops);

        if (sOutputFile.Size()) {
            status = dstData.Write(sOutputFile.c_str());
            CHECK_STATUS_PRINT_BR(status, "Image::Write()", GetBaseStatusString(status));
        }

        // Rendering
        if (!bNoWindow) {
            WindowDraw draw("Intel(R) IPP Threaded example");
            if (draw.IsInitialized()) {
                printf("\nPress Space to cycle through stages:\n");
                printf("1 - result image\n");
                printf("2 - original image\n");
                printf("\nClose window to exit.\n");

                int iIndex = 0; // 0 - result image, 1 - original image
                bool bRedraw = true;
                while (!draw.IsClosed()) {
                    vm_time_sleep(10);
                    if (draw.CheckKey() == KK_SPACE) {
                        iIndex = (iIndex + 1) % 2;
                        bRedraw = true;
                    }
                    if (draw.IsInvalidated())
                        bRedraw = true;

                    if (bRedraw) {
                        if (iIndex == 0)
                            draw.DrawImage(&dstData);
                        else if (iIndex == 1)
                            draw.DrawImage(&srcData);
                        bRedraw = false;
                    }
                }
            }
        }

        break;
    }

    if (pHarmonize)
        delete pHarmonize;
    if (status < 0)
        return status;
    return 0;
}
