/*
 *     Copyright (c) 2017-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto.  Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 *
 */

#include "nvhpc_common.h"

/* pi */
#define PI_F 3.1415926535897932384626434f
#define PI_D 3.1415926535897932384626434

/* pi/180 */
#define DEG_TO_RADF 0.0174532925199432957692f
#define DEG_TO_RADD 0.0174532925199432957692
/* 180/pi */
#define RAD_TO_DEGF 57.2957795130823208769f
#define RAD_TO_DEGD 57.2957795130823208769

#define CNVRTDEGF(degrees) ((degrees)*DEG_TO_RADF)
#define CNVRTRADF(radians) ((radians)*RAD_TO_DEGF)

#define CNVRTDEGD(degrees) ((degrees)*DEG_TO_RADD)
#define CNVRTRADD(radians) ((radians)*RAD_TO_DEGD)

#define ERR_PROTYPE const char *__filename, \
                    unsigned int __lineno,  \
                    const char *__function, \
                    int __abort
#define ERR_ARGS __filename, __lineno, __function, __abort
#define ERR_ARGS_MSG __filename, __lineno, __function
#define MAX_ERR_LEN 1024
#define MAX_FILENAME_LEN 256
#define MAX_FUNCTION_LEN 256
#define MAX_ERR_EXPR_LEN 256
#define ERR_MSG_ALLOCATED "FATAL ERROR: ARRAY IS NOT ALLOCATED"
#define ERR_MSG_ALLOCATED_LHS "FATAL ERROR: ARRAY (LEFT HAND SIDE) IS NOT ALLOCATED"
#define ERR_MSG_CONFORMABLE "FATAL ERROR: ARRAY IS NOT CONFORMABLE"
#define ERR_MSG_CONTIGUOUS "FATAL ERROR: NON-CONTIGUOUS ARGUMENT"
#define ERR_MSG_STRIDE1 "FATAL ERROR: NON-STRIDE-ONE ARGUMENT"
#define ERR_MSG_AUTO_ALLOC "FATAL ERROR: DEVICE FORTRAN AUTO ALLOCATION FAILED (USE NVCOMPILER_ACC_CUDA_HEAPSIZE)"
#define ERR_MSG_OUT_OF_RANGE "FATAL ERROR: VECTOR OUT OF RANGE"

#if !defined(PGI_COMPILE_BITCODE)
#undef __DEVICE
#define __DEVICE __ATTRIBUTES
#include "nvhpc_cuda_cpp_builtins.h"
#include "nvhpc_cuda_idx_routines.h"
#else
__DEVICE int __pgi_blockidx(int);
__DEVICE int __pgi_numblocks(int);
__DEVICE int __pgi_numthreads(int);
__DEVICE void __pgi_syncwarp(void);
__DEVICE int __pgi_threadidx(int);
__DEVICE void __pgi_print_string(const signed char *);
__DEVICE void __pgi_print_stringn(const char *);

#include "nvhpc_declarations.h"
extern __device__ unsigned long long int atomicCAS(unsigned long long int *address, unsigned long long int compare, unsigned long long int val);
extern __device__ double __pgi_atomicAddd(void *address, double val);
#endif /* defined(PGI_COMPILE_BITCODE) */

__ATTRIBUTES unsigned int
__pgi_strlen(const char * str, unsigned int maxlen)
{
  unsigned int len = 0;
  while (*str && len < maxlen) {
    len++;
    str++;
  }

  return len;
}

__ATTRIBUTES void
__pgi_itoa(unsigned int inum, signed char* str)
{
  /* base 10 only */
  int i = 0, start, end;
  char tmp;

  if (inum == 0)
    str[i++] = '0';

  while (inum) {
    unsigned int remain = inum % 10;
    str[i++] = remain + '0';
    inum = inum / 10;
  }

  str[i] = '\0';

  /* reverse the string */
  start = 0;
  end = i - 1;
  while (start < end) {
    tmp = *(str+start);
    *(str+start) = *(str+end);
    *(str+end) = tmp;
    start++;
    end--;
  }
}

__ATTRIBUTES_NORETURN void
__pgi_error_msg(const char *__filename,
                unsigned int __lineno,
                const char *__function,
                const char* errmsg)
{
  signed char buf[MAX_ERR_LEN];
  signed char linenobuf[16];
  unsigned int i, curr, fnamelen, funclen, linenolen, errmsglen;

  fnamelen = __pgi_strlen(__filename, MAX_FILENAME_LEN);
  funclen = __pgi_strlen(__function, MAX_FUNCTION_LEN);
  __pgi_itoa(__lineno, linenobuf);
  linenolen = __pgi_strlen((const char*)linenobuf, 16);
  errmsglen = __pgi_strlen(errmsg, MAX_ERR_EXPR_LEN);

  for (i = 0; i < fnamelen; ++i)
    buf[i] = __filename[i];

  curr = i;
  buf[curr++] = 58; /* : */

  for (i = 0; i < linenolen; ++i)
    buf[curr + i] = linenobuf[i];

  curr = curr + i;
  buf[curr++] = 32; /* SPACE */
  buf[curr++] = 45; /* - */
  buf[curr++] = 32; /* SPACE */

  for (i = 0; i < funclen; ++i)
    buf[curr + i] = __function[i];

  curr = curr + i;
  buf[curr++] = 58; /* : */
  buf[curr++] = 32; /* SPACE */

  for (i = 0; i < errmsglen; ++i)
    buf[curr + i] = errmsg[i];

  curr = curr + i;
  buf[curr++] = '\n';
  buf[curr++] = '\0';
  __pgi_print_string(buf);

  asm("trap;");
}

__ATTRIBUTES_NORETURN void
__pgi_simple_error_msg(const char* errmsg)
{
  __pgi_print_stringn(errmsg);

  asm("trap;");
}

__ATTRIBUTES void
__pgi_throw_out_of_range(ERR_PROTYPE)
{
  if (__abort)
    __pgi_error_msg(ERR_ARGS_MSG, ERR_MSG_OUT_OF_RANGE);
}

struct Ebound {
  unsigned int e0;
  int e1, e2, e3, e4, e5, e6, e7, e8, e9;
};

__ATTRIBUTES int
__pgi_boundcheck(int value, int lb, int extent, int linenum, int ssnum, int n1,
                 int n2, int n3, int ref, void *E_param)
{
  struct Ebound *E;
  E = (struct Ebound *)E_param;
#if !defined(PGI_COMPILE_BITCODE)
  if (value < lb || value - lb >= extent) {
    if (atomicAdd(&(E->e0), 1) == 0) {
      E->e1 = linenum;
      E->e2 = lb;
      E->e3 = extent + lb - 1;
      E->e4 = ssnum;
      E->e5 = value;
      E->e6 = ref;
      E->e7 = n1;
      E->e8 = n2;
      E->e9 = n3;
    }
  }
#endif /* !defined(PGI_COMPILE_BITCODE) */
  return value;
}

/* No branching sign() implementations */
__ATTRIBUTES double
__pgi_dsign(double a, double b)
{
  union {
    long long l;
    uint2  v;
  } ix, iy;
  ix.l = __double_as_longlong(a);
  iy.l = __double_as_longlong(b);
  ix.v.y = (ix.v.y & 0x7FFFFFFF) | (iy.v.y & 0x80000000);
  return __longlong_as_double(ix.l);
}

__ATTRIBUTES float
__pgi_sign(float a, float b)
{
  unsigned int ix, iy;
  ix = (unsigned int) __float_as_int(a);
  iy = (unsigned int) __float_as_int(b);
  ix =  (ix & 0x7FFFFFFF) | (iy & 0x80000000);
  return __int_as_float((int)ix);
}

__ATTRIBUTES int
__pgi_imax(int a, int b)
{
  return max(a, b);
}

__ATTRIBUTES int
__pgi_imin(int a, int b)
{
  return min(a, b);
}

__ATTRIBUTES int
__pgi_uimax(unsigned int a, unsigned int b)
{
  return max(a, b);
}

__ATTRIBUTES int
__pgi_uimin(unsigned int a, unsigned int b)
{
  return min(a, b);
}

__ATTRIBUTES int
__pgi_nint(float a)
{
  return (int)(a + ((a >= 0.0f) ? 0.5f : -0.5f));
}

__ATTRIBUTES long long
__pgi_llnint(float a)
{
  return (long long)(a + ((a >= 0.0f) ? 0.5f : -0.5f));
}

__ATTRIBUTES int
__pgi_ishft(int m, int a)
{
  return (int)((m >= 32)
                   ? a
                   : ((m <= -32) ? a : ((m >= 0) ? (a << m) : (a >> (-m)))));
}

__ATTRIBUTES long long
__pgi_kishft(long long a, int m)
{
  return (long long)(m >= 64
                         ? a
                         : (m <= -64 ? a : (m >= 0 ? (a << m) : (a >> (-m)))));
}

__ATTRIBUTES int
__pgi_ishftc(int val, int sc,
             int rc) /* value, shift count, rightmost bit count */
{
  unsigned int mask = 0xffffffff, field, tmp1, tmp2;
  int n;
  if (rc <= 0)
    return val;
  mask >>= 32 - rc;
  field = val & mask;
  if (sc >= 0) {
    for (n = sc; n >= rc; n -= rc)
      ; /* remainder without % operation */
    if (n == 0)
      return val;
    tmp1 = field << n;
    tmp2 = field >> (rc - n);
  } else {
    for (n = -sc; n >= rc; n -= rc)
      ;
    if (n == 0)
      return val;
    tmp1 = field >> n;
    tmp2 = field << (rc - n);
  }
  return (val ^ field) | ((tmp1 | tmp2) & mask);
}

/* ---- New shfl handling for CUDA 9.0 ---- */

__ATTRIBUTES int
__pgi_shfli2(int var, int srcLane)
{
  int ret, c;
  unsigned int mask = 0xffffffff;
  c = 0x1f;
  srcLane--;
  asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;"
               : "=r"(ret)
               : "r"(var), "r"(srcLane), "r"(c), "r"(mask));
  return ret;
}

__ATTRIBUTES int
__pgi_shfli3(int var, int srcLane, int width)
{
  int ret, c;
  unsigned int mask = 0xffffffff;
  c = ((32 - width) << 8) | 0x1f;
  srcLane--;
  asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;"
               : "=r"(ret)
               : "r"(var), "r"(srcLane), "r"(c), "r"(mask));
  return ret;
}

__ATTRIBUTES long long
__pgi_shflk2(long long var, int srcLane)
{
  long long ret;
  int c, lo, hi;
  unsigned int mask = 0xffffffff;
  c = 0x1f;
  srcLane--;
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
  asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;"
               : "=r"(lo)
               : "r"(lo), "r"(srcLane), "r"(c), "r"(mask));
  asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;"
               : "=r"(hi)
               : "r"(hi), "r"(srcLane), "r"(c), "r"(mask));
  asm volatile("mov.b64 %0, {%1,%2};" : "=l"(ret) : "r"(lo), "r"(hi));
  return ret;
}

__ATTRIBUTES long long
__pgi_shflk3(long long var, int srcLane, int width)
{
  long long ret;
  int c, lo, hi;
  unsigned int mask = 0xffffffff;
  c = ((32 - width) << 8) | 0x1f;
  srcLane--;
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
  asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;"
               : "=r"(lo)
               : "r"(lo), "r"(srcLane), "r"(c), "r"(mask));
  asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;"
               : "=r"(hi)
               : "r"(hi), "r"(srcLane), "r"(c), "r"(mask));
  asm volatile("mov.b64 %0, {%1,%2};" : "=l"(ret) : "r"(lo), "r"(hi));
  return ret;
}

__ATTRIBUTES void *
__pgi_shflp2(void *var, int srcLane)
{
  void *ret;
  int c, lo, hi;
  unsigned int mask = 0xffffffff;
  c = 0x1f;
  srcLane--;
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
  asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;"
               : "=r"(lo)
               : "r"(lo), "r"(srcLane), "r"(c), "r"(mask));
  asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;"
               : "=r"(hi)
               : "r"(hi), "r"(srcLane), "r"(c), "r"(mask));
  asm volatile("mov.b64 %0, {%1,%2};" : "=l"(ret) : "r"(lo), "r"(hi));
  return ret;
}

__ATTRIBUTES float
__pgi_shflf2(float var, int srcLane)
{
  float ret;
  int c;
  unsigned int mask = 0xffffffff;
  c = 0x1f;
  srcLane--;
  asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;"
               : "=f"(ret)
               : "f"(var), "r"(srcLane), "r"(c), "r"(mask));
  return ret;
}

__ATTRIBUTES float
__pgi_shflf3(float var, int srcLane, int width)
{
  float ret;
  int c;
  unsigned int mask = 0xffffffff;
  c = ((32 - width) << 8) | 0x1f;
  srcLane--;
  asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;"
               : "=f"(ret)
               : "f"(var), "r"(srcLane), "r"(c), "r"(mask));
  return ret;
}

__ATTRIBUTES double
__pgi_shfld2(double var, int srcLane)
{
  double ret;
  int c, lo, hi;
  unsigned int mask = 0xffffffff;
  c = 0x1f;
  srcLane--;
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
  asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;"
               : "=r"(lo)
               : "r"(lo), "r"(srcLane), "r"(c), "r"(mask));
  asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;"
               : "=r"(hi)
               : "r"(hi), "r"(srcLane), "r"(c), "r"(mask));
  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(ret) : "r"(lo), "r"(hi));
  return ret;
}

__ATTRIBUTES double
__pgi_shfld3(double var, int srcLane, int width)
{
  double ret;
  int c, lo, hi;
  unsigned int mask = 0xffffffff;
  c = ((32 - width) << 8) | 0x1f;
  srcLane--;
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
  asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;"
               : "=r"(lo)
               : "r"(lo), "r"(srcLane), "r"(c), "r"(mask));
  asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;"
               : "=r"(hi)
               : "r"(hi), "r"(srcLane), "r"(c), "r"(mask));
  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(ret) : "r"(lo), "r"(hi));
  return ret;
}

__ATTRIBUTES int
__pgi_shfl_upi2(int var, unsigned int delta)
{
  int ret, c;
  unsigned int mask = 0xffffffff;
  c = 0x0;
  asm volatile("shfl.sync.up.b32 %0, %1, %2, %3, %4;"
               : "=r"(ret)
               : "r"(var), "r"(delta), "r"(c), "r"(mask));
  return ret;
}

__ATTRIBUTES int
__pgi_shfl_upi3(int var, unsigned int delta, int width)
{
  int ret, c;
  unsigned int mask = 0xffffffff;
  c = ((32 - width) << 8);
  asm volatile("shfl.sync.up.b32 %0, %1, %2, %3, %4;"
               : "=r"(ret)
               : "r"(var), "r"(delta), "r"(c), "r"(mask));
  return ret;
}

__ATTRIBUTES float
__pgi_shfl_upf2(float var, unsigned int delta)
{
  float ret;
  int c;
  unsigned int mask = 0xffffffff;
  c = 0x0;
  asm volatile("shfl.sync.up.b32 %0, %1, %2, %3, %4;"
               : "=f"(ret)
               : "f"(var), "r"(delta), "r"(c), "r"(mask));
  return ret;
}

__ATTRIBUTES float
__pgi_shfl_upf3(float var, unsigned int delta, int width)
{
  float ret;
  int c;
  unsigned int mask = 0xffffffff;
  c = ((32 - width) << 8);
  asm volatile("shfl.sync.up.b32 %0, %1, %2, %3, %4;"
               : "=f"(ret)
               : "f"(var), "r"(delta), "r"(c), "r"(mask));
  return ret;
}

__ATTRIBUTES long long
__pgi_shfl_upk2(long long var, unsigned int delta)
{
  long long ret;
  int c, lo, hi;
  unsigned int mask = 0xffffffff;
  c = 0x0;
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
  asm volatile("shfl.sync.up.b32 %0, %1, %2, %3, %4;"
               : "=r"(lo)
               : "r"(lo), "r"(delta), "r"(c), "r"(mask));
  asm volatile("shfl.sync.up.b32 %0, %1, %2, %3, %4;"
               : "=r"(hi)
               : "r"(hi), "r"(delta), "r"(c), "r"(mask));
  asm volatile("mov.b64 %0, {%1,%2};" : "=l"(ret) : "r"(lo), "r"(hi));
  return ret;
}

__ATTRIBUTES long long
__pgi_shfl_upk3(long long var, unsigned int delta, int width)
{
  long long ret;
  int c, lo, hi;
  unsigned int mask = 0xffffffff;
  c = ((32 - width) << 8);
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
  asm volatile("shfl.sync.up.b32 %0, %1, %2, %3, %4;"
               : "=r"(lo)
               : "r"(lo), "r"(delta), "r"(c), "r"(mask));
  asm volatile("shfl.sync.up.b32 %0, %1, %2, %3, %4;"
               : "=r"(hi)
               : "r"(hi), "r"(delta), "r"(c), "r"(mask));
  asm volatile("mov.b64 %0, {%1,%2};" : "=l"(ret) : "r"(lo), "r"(hi));
  return ret;
}

__ATTRIBUTES double
__pgi_shfl_upd2(double var, unsigned int delta)
{
  double ret;
  int c, lo, hi;
  unsigned int mask = 0xffffffff;
  c = 0x0;
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
  asm volatile("shfl.sync.up.b32 %0, %1, %2, %3, %4;"
               : "=r"(lo)
               : "r"(lo), "r"(delta), "r"(c), "r"(mask));
  asm volatile("shfl.sync.up.b32 %0, %1, %2, %3, %4;"
               : "=r"(hi)
               : "r"(hi), "r"(delta), "r"(c), "r"(mask));
  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(ret) : "r"(lo), "r"(hi));
  return ret;
}

__ATTRIBUTES double
__pgi_shfl_upd3(double var, unsigned int delta, int width)
{
  double ret;
  int c, lo, hi;
  unsigned int mask = 0xffffffff;
  c = ((32 - width) << 8);
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
  asm volatile("shfl.sync.up.b32 %0, %1, %2, %3, %4;"
               : "=r"(lo)
               : "r"(lo), "r"(delta), "r"(c), "r"(mask));
  asm volatile("shfl.sync.up.b32 %0, %1, %2, %3, %4;"
               : "=r"(hi)
               : "r"(hi), "r"(delta), "r"(c), "r"(mask));
  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(ret) : "r"(lo), "r"(hi));
  return ret;
}

__ATTRIBUTES int
__pgi_shfl_downi2(int var, unsigned int delta)
{
  int ret;
  int c;
  unsigned int mask = 0xffffffff;
  c = 0x1f;
  asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;"
               : "=r"(ret)
               : "r"(var), "r"(delta), "r"(c), "r"(mask));
  return ret;
}

__ATTRIBUTES int
__pgi_shfl_downi3(int var, unsigned int delta, int width)
{
  int ret;
  int c;
  unsigned int mask = 0xffffffff;
  c = ((32 - width) << 8) | 0x1f;
  asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;"
               : "=r"(ret)
               : "r"(var), "r"(delta), "r"(c), "r"(mask));
  return ret;
}

__ATTRIBUTES unsigned short
__pgi_shfl_downhf2(unsigned short var, unsigned int delta)
{
  unsigned short ret;
  unsigned int tmpout;
  unsigned int tmpin;
  int c;
  unsigned int mask = 0xffffffff;

  c = 0x1f;
  tmpin = var;
  tmpin = tmpin << 16;
  tmpin = tmpin | var;

  asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;"
               : "=r"(tmpout)
               : "r"(tmpin), "r"(delta), "r"(c), "r"(mask));
  tmpout = tmpout & 0xffff;
  ret = (unsigned short) tmpout;
  return ret;
}

__ATTRIBUTES float
__pgi_shfl_downf2(float var, unsigned int delta)
{
  float ret;
  int c;
  unsigned int mask = 0xffffffff;
  c = 0x1f;
  asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;"
               : "=f"(ret)
               : "f"(var), "r"(delta), "r"(c), "r"(mask));
  return ret;
}

__ATTRIBUTES float
__pgi_shfl_downf3(float var, unsigned int delta, int width)
{
  float ret;
  int c;
  unsigned int mask = 0xffffffff;
  c = ((32 - width) << 8) | 0x1f;
  asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;"
               : "=f"(ret)
               : "f"(var), "r"(delta), "r"(c), "r"(mask));
  return ret;
}

__ATTRIBUTES long long
__pgi_shfl_downk2(long long var, unsigned int delta)
{
  long long ret;
  int c, lo, hi;
  unsigned int mask = 0xffffffff;
  c = 0x1f;
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
  asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;"
               : "=r"(lo)
               : "r"(lo), "r"(delta), "r"(c), "r"(mask));
  asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;"
               : "=r"(hi)
               : "r"(hi), "r"(delta), "r"(c), "r"(mask));
  asm volatile("mov.b64 %0, {%1,%2};" : "=l"(ret) : "r"(lo), "r"(hi));
  return ret;
}

__ATTRIBUTES long long
__pgi_shfl_downk3(long long var, unsigned int delta, int width)
{
  long long ret;
  int c, lo, hi;
  unsigned int mask = 0xffffffff;
  c = ((32 - width) << 8) | 0x1f;
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
  asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;"
               : "=r"(lo)
               : "r"(lo), "r"(delta), "r"(c), "r"(mask));
  asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;"
               : "=r"(hi)
               : "r"(hi), "r"(delta), "r"(c), "r"(mask));
  asm volatile("mov.b64 %0, {%1,%2};" : "=l"(ret) : "r"(lo), "r"(hi));
  return ret;
}

__ATTRIBUTES double
__pgi_shfl_downd2(double var, unsigned int delta)
{
  double ret;
  int c, lo, hi;
  c = 0x1f;
  unsigned int mask = 0xffffffff;
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
  asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;"
               : "=r"(lo)
               : "r"(lo), "r"(delta), "r"(c), "r"(mask));
  asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;"
               : "=r"(hi)
               : "r"(hi), "r"(delta), "r"(c), "r"(mask));
  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(ret) : "r"(lo), "r"(hi));
  return ret;
}

__ATTRIBUTES double
__pgi_shfl_downd3(double var, unsigned int delta, int width)
{
  double ret;
  int c, lo, hi;
  unsigned int mask = 0xffffffff;
  c = ((32 - width) << 8) | 0x1f;
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
  asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;"
               : "=r"(lo)
               : "r"(lo), "r"(delta), "r"(c), "r"(mask));
  asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;"
               : "=r"(hi)
               : "r"(hi), "r"(delta), "r"(c), "r"(mask));
  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(ret) : "r"(lo), "r"(hi));
  return ret;
}

__ATTRIBUTES int
__pgi_shfl_xori2(int var, unsigned int laneMask)
{
  int ret, c;
  unsigned int mask = 0xffffffff;
  c = 0x1f;
  asm volatile("shfl.sync.bfly.b32 %0, %1, %2, %3, %4;"
               : "=r"(ret)
               : "r"(var), "r"(laneMask), "r"(c), "r"(mask));
  return ret;
}

__ATTRIBUTES int
__pgi_shfl_xori3(int var, unsigned int laneMask, int width)
{
  int ret, c;
  unsigned int mask = 0xffffffff;
  c = ((32 - width) << 8) | 0x1f;
  asm volatile("shfl.sync.bfly.b32 %0, %1, %2, %3, %4;"
               : "=r"(ret)
               : "r"(var), "r"(laneMask), "r"(c), "r"(mask));
  return ret;
}

__ATTRIBUTES float
__pgi_shfl_xorf2(float var, unsigned int laneMask)
{
  float ret;
  int c;
  unsigned int mask = 0xffffffff;
  c = 0x1f;
  asm volatile("shfl.sync.bfly.b32 %0, %1, %2, %3, %4;"
               : "=f"(ret)
               : "f"(var), "r"(laneMask), "r"(c), "r"(mask));
  return ret;
}

__ATTRIBUTES float
__pgi_shfl_xorf3(float var, unsigned int laneMask, int width)
{
  float ret;
  int c;
  unsigned int mask = 0xffffffff;
  c = ((32 - width) << 8) | 0x1f;
  asm volatile("shfl.sync.bfly.b32 %0, %1, %2, %3, %4;"
               : "=f"(ret)
               : "f"(var), "r"(laneMask), "r"(c), "r"(mask));
  return ret;
}

__ATTRIBUTES long long
__pgi_shfl_xork2(long long var, unsigned int laneMask)
{
  long long ret;
  int c, lo, hi;
  unsigned int mask = 0xffffffff;
  c = 0x1f;
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
  asm volatile("shfl.sync.bfly.b32 %0, %1, %2, %3, %4;"
               : "=r"(lo)
               : "r"(lo), "r"(laneMask), "r"(c), "r"(mask));
  asm volatile("shfl.sync.bfly.b32 %0, %1, %2, %3, %4;"
               : "=r"(hi)
               : "r"(hi), "r"(laneMask), "r"(c), "r"(mask));
  asm volatile("mov.b64 %0, {%1,%2};" : "=l"(ret) : "r"(lo), "r"(hi));
  return ret;
}

__ATTRIBUTES long long
__pgi_shfl_xork3(long long var, unsigned int laneMask, int width)
{
  long long ret;
  int c, lo, hi;
  unsigned int mask = 0xffffffff;
  c = ((32 - width) << 8) | 0x1f;
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
  asm volatile("shfl.sync.bfly.b32 %0, %1, %2, %3, %4;"
               : "=r"(lo)
               : "r"(lo), "r"(laneMask), "r"(c), "r"(mask));
  asm volatile("shfl.sync.bfly.b32 %0, %1, %2, %3, %4;"
               : "=r"(hi)
               : "r"(hi), "r"(laneMask), "r"(c), "r"(mask));
  asm volatile("mov.b64 %0, {%1,%2};" : "=l"(ret) : "r"(lo), "r"(hi));
  return ret;
}

__ATTRIBUTES double
__pgi_shfl_xord2(double var, unsigned int laneMask)
{
  double ret;
  int c, lo, hi;
  unsigned int mask = 0xffffffff;
  c = 0x1f;
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
  asm volatile("shfl.sync.bfly.b32 %0, %1, %2, %3, %4;"
               : "=r"(lo)
               : "r"(lo), "r"(laneMask), "r"(c), "r"(mask));
  asm volatile("shfl.sync.bfly.b32 %0, %1, %2, %3, %4;"
               : "=r"(hi)
               : "r"(hi), "r"(laneMask), "r"(c), "r"(mask));
  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(ret) : "r"(lo), "r"(hi));
  return ret;
}

__ATTRIBUTES double
__pgi_shfl_xord3(double var, unsigned int laneMask, int width)
{
  double ret;
  int c, lo, hi;
  unsigned int mask = 0xffffffff;
  c = ((32 - width) << 8) | 0x1f;
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
  asm volatile("shfl.sync.bfly.b32 %0, %1, %2, %3, %4;"
               : "=r"(lo)
               : "r"(lo), "r"(laneMask), "r"(c), "r"(mask));
  asm volatile("shfl.sync.bfly.b32 %0, %1, %2, %3, %4;"
               : "=r"(hi)
               : "r"(hi), "r"(laneMask), "r"(c), "r"(mask));
  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(ret) : "r"(lo), "r"(hi));
  return ret;
}

__ATTRIBUTES int
__pgi_shfl_synci2(unsigned int mask, int var, int srcLane)
{
  int ret, c;
  c = 0x1f;
  srcLane--;
  asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;"
               : "=r"(ret)
               : "r"(var), "r"(srcLane), "r"(c), "r"(mask));
  return ret;
}

__ATTRIBUTES int
__pgi_shfl_synci3(unsigned int mask, int var, int srcLane, int width)
{
  int ret, c;
  c = ((32 - width) << 8) | 0x1f;
  srcLane--;
  asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;"
               : "=r"(ret)
               : "r"(var), "r"(srcLane), "r"(c), "r"(mask));
  return ret;
}

__ATTRIBUTES long long
__pgi_shfl_synck2(unsigned int mask, long long var, int srcLane)
{
  long long ret;
  int c, lo, hi;
  c = 0x1f;
  srcLane--;
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
  asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;"
               : "=r"(lo)
               : "r"(lo), "r"(srcLane), "r"(c), "r"(mask));
  asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;"
               : "=r"(hi)
               : "r"(hi), "r"(srcLane), "r"(c), "r"(mask));
  asm volatile("mov.b64 %0, {%1,%2};" : "=l"(ret) : "r"(lo), "r"(hi));
  return ret;
}

__ATTRIBUTES long long
__pgi_shfl_synck3(unsigned int mask, long long var, int srcLane, int width)
{
  long long ret;
  int c, lo, hi;
  c = ((32 - width) << 8) | 0x1f;
  srcLane--;
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
  asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;"
               : "=r"(lo)
               : "r"(lo), "r"(srcLane), "r"(c), "r"(mask));
  asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;"
               : "=r"(hi)
               : "r"(hi), "r"(srcLane), "r"(c), "r"(mask));
  asm volatile("mov.b64 %0, {%1,%2};" : "=l"(ret) : "r"(lo), "r"(hi));
  return ret;
}

__ATTRIBUTES void *
__pgi_shfl_syncp2(unsigned int mask, void *var, int srcLane)
{
  void *ret;
  int c, lo, hi;
  c = 0x1f;
  srcLane--;
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
  asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;"
               : "=r"(lo)
               : "r"(lo), "r"(srcLane), "r"(c), "r"(mask));
  asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;"
               : "=r"(hi)
               : "r"(hi), "r"(srcLane), "r"(c), "r"(mask));
  asm volatile("mov.b64 %0, {%1,%2};" : "=l"(ret) : "r"(lo), "r"(hi));
  return ret;
}

__ATTRIBUTES float
__pgi_shfl_syncf2(unsigned int mask, float var, int srcLane)
{
  float ret;
  int c;
  c = 0x1f;
  srcLane--;
  asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;"
               : "=f"(ret)
               : "f"(var), "r"(srcLane), "r"(c), "r"(mask));
  return ret;
}

__ATTRIBUTES float
__pgi_shfl_syncf3(unsigned int mask, float var, int srcLane, int width)
{
  float ret;
  int c;
  c = ((32 - width) << 8) | 0x1f;
  srcLane--;
  asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;"
               : "=f"(ret)
               : "f"(var), "r"(srcLane), "r"(c), "r"(mask));
  return ret;
}

__ATTRIBUTES double
__pgi_shfl_syncd2(unsigned int mask, double var, int srcLane)
{
  double ret;
  int c, lo, hi;
  c = 0x1f;
  srcLane--;
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
  asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;"
               : "=r"(lo)
               : "r"(lo), "r"(srcLane), "r"(c), "r"(mask));
  asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;"
               : "=r"(hi)
               : "r"(hi), "r"(srcLane), "r"(c), "r"(mask));
  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(ret) : "r"(lo), "r"(hi));
  return ret;
}

__ATTRIBUTES double
__pgi_shfl_syncd3(unsigned int mask, double var, int srcLane, int width)
{
  double ret;
  int c, lo, hi;
  c = ((32 - width) << 8) | 0x1f;
  srcLane--;
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
  asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;"
               : "=r"(lo)
               : "r"(lo), "r"(srcLane), "r"(c), "r"(mask));
  asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;"
               : "=r"(hi)
               : "r"(hi), "r"(srcLane), "r"(c), "r"(mask));
  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(ret) : "r"(lo), "r"(hi));
  return ret;
}

/* --- up --- */
__ATTRIBUTES int
__pgi_shfl_up_synci2(unsigned int mask, int var, unsigned int delta)
{
  int ret, c;
  c = 0x0;
  asm volatile("shfl.sync.up.b32 %0, %1, %2, %3, %4;"
               : "=r"(ret)
               : "r"(var), "r"(delta), "r"(c), "r"(mask));
  return ret;
}

__ATTRIBUTES int
__pgi_shfl_up_synci3(unsigned int mask, int var, unsigned int delta, int width)
{
  int ret, c;
  c = ((32 - width) << 8);
  asm volatile("shfl.sync.up.b32 %0, %1, %2, %3, %4;"
               : "=r"(ret)
               : "r"(var), "r"(delta), "r"(c), "r"(mask));
  return ret;
}

__ATTRIBUTES float
__pgi_shfl_up_syncf2(unsigned int mask, float var, unsigned int delta)
{
  float ret;
  int c;
  c = 0x0;
  asm volatile("shfl.sync.up.b32 %0, %1, %2, %3, %4;"
               : "=f"(ret)
               : "f"(var), "r"(delta), "r"(c), "r"(mask));
  return ret;
}

__ATTRIBUTES float
__pgi_shfl_up_syncf3(unsigned int mask, float var, unsigned int delta,
                     int width)
{
  float ret;
  int c;
  c = ((32 - width) << 8);
  asm volatile("shfl.sync.up.b32 %0, %1, %2, %3, %4;"
               : "=f"(ret)
               : "f"(var), "r"(delta), "r"(c), "r"(mask));
  return ret;
}

__ATTRIBUTES long long
__pgi_shfl_up_synck2(unsigned int mask, long long var, unsigned int delta)
{
  long long ret;
  int c, lo, hi;
  c = 0x0;
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
  asm volatile("shfl.sync.up.b32 %0, %1, %2, %3, %4;"
               : "=r"(lo)
               : "r"(lo), "r"(delta), "r"(c), "r"(mask));
  asm volatile("shfl.sync.up.b32 %0, %1, %2, %3, %4;"
               : "=r"(hi)
               : "r"(hi), "r"(delta), "r"(c), "r"(mask));
  asm volatile("mov.b64 %0, {%1,%2};" : "=l"(ret) : "r"(lo), "r"(hi));
  return ret;
}

__ATTRIBUTES long long
__pgi_shfl_up_synck3(unsigned int mask, long long var, unsigned int delta,
                     int width)
{
  long long ret;
  int c, lo, hi;
  c = ((32 - width) << 8);
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
  asm volatile("shfl.sync.up.b32 %0, %1, %2, %3, %4;"
               : "=r"(lo)
               : "r"(lo), "r"(delta), "r"(c), "r"(mask));
  asm volatile("shfl.sync.up.b32 %0, %1, %2, %3, %4;"
               : "=r"(hi)
               : "r"(hi), "r"(delta), "r"(c), "r"(mask));
  asm volatile("mov.b64 %0, {%1,%2};" : "=l"(ret) : "r"(lo), "r"(hi));
  return ret;
}

__ATTRIBUTES double
__pgi_shfl_up_syncd2(unsigned int mask, double var, unsigned int delta)
{
  double ret;
  int c, lo, hi;
  c = 0x0;
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
  asm volatile("shfl.sync.up.b32 %0, %1, %2, %3, %4;"
               : "=r"(lo)
               : "r"(lo), "r"(delta), "r"(c), "r"(mask));
  asm volatile("shfl.sync.up.b32 %0, %1, %2, %3, %4;"
               : "=r"(hi)
               : "r"(hi), "r"(delta), "r"(c), "r"(mask));
  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(ret) : "r"(lo), "r"(hi));
  return ret;
}

__ATTRIBUTES double
__pgi_shfl_up_syncd3(unsigned int mask, double var, unsigned int delta,
                     int width)
{
  double ret;
  int c, lo, hi;
  c = ((32 - width) << 8);
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
  asm volatile("shfl.sync.up.b32 %0, %1, %2, %3, %4;"
               : "=r"(lo)
               : "r"(lo), "r"(delta), "r"(c), "r"(mask));
  asm volatile("shfl.sync.up.b32 %0, %1, %2, %3, %4;"
               : "=r"(hi)
               : "r"(hi), "r"(delta), "r"(c), "r"(mask));
  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(ret) : "r"(lo), "r"(hi));
  return ret;
}

/* --- down --- */
__ATTRIBUTES int
__pgi_shfl_down_synci2(unsigned int mask, int var, unsigned int delta)
{
  int ret;
  int c;
  c = 0x1f;
  asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;"
               : "=r"(ret)
               : "r"(var), "r"(delta), "r"(c), "r"(mask));
  return ret;
}

__ATTRIBUTES int
__pgi_shfl_down_synci3(unsigned int mask, int var, unsigned int delta,
                       int width)
{
  int ret;
  int c;
  c = ((32 - width) << 8) | 0x1f;
  asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;"
               : "=r"(ret)
               : "r"(var), "r"(delta), "r"(c), "r"(mask));
  return ret;
}

__ATTRIBUTES float
__pgi_shfl_down_syncf2(unsigned int mask, float var, unsigned int delta)
{
  float ret;
  int c;
  c = 0x1f;
  asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;"
               : "=f"(ret)
               : "f"(var), "r"(delta), "r"(c), "r"(mask));
  return ret;
}

__ATTRIBUTES float
__pgi_shfl_down_syncf3(unsigned int mask, float var, unsigned int delta,
                       int width)
{
  float ret;
  int c;
  c = ((32 - width) << 8) | 0x1f;
  asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;"
               : "=f"(ret)
               : "f"(var), "r"(delta), "r"(c), "r"(mask));
  return ret;
}

__ATTRIBUTES long long
__pgi_shfl_down_synck2(unsigned int mask, long long var, unsigned int delta)
{
  long long ret;
  int c, lo, hi;
  c = 0x1f;
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
  asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;"
               : "=r"(lo)
               : "r"(lo), "r"(delta), "r"(c), "r"(mask));
  asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;"
               : "=r"(hi)
               : "r"(hi), "r"(delta), "r"(c), "r"(mask));
  asm volatile("mov.b64 %0, {%1,%2};" : "=l"(ret) : "r"(lo), "r"(hi));
  return ret;
}

__ATTRIBUTES long long
__pgi_shfl_down_synck3(unsigned int mask, long long var, unsigned int delta,
                       int width)
{
  long long ret;
  int c, lo, hi;
  c = ((32 - width) << 8) | 0x1f;
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
  asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;"
               : "=r"(lo)
               : "r"(lo), "r"(delta), "r"(c), "r"(mask));
  asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;"
               : "=r"(hi)
               : "r"(hi), "r"(delta), "r"(c), "r"(mask));
  asm volatile("mov.b64 %0, {%1,%2};" : "=l"(ret) : "r"(lo), "r"(hi));
  return ret;
}

__ATTRIBUTES double
__pgi_shfl_down_syncd2(unsigned int mask, double var, unsigned int delta)
{
  double ret;
  int c, lo, hi;
  c = 0x1f;
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
  asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;"
               : "=r"(lo)
               : "r"(lo), "r"(delta), "r"(c), "r"(mask));
  asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;"
               : "=r"(hi)
               : "r"(hi), "r"(delta), "r"(c), "r"(mask));
  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(ret) : "r"(lo), "r"(hi));
  return ret;
}

__ATTRIBUTES double
__pgi_shfl_down_syncd3(unsigned int mask, double var, unsigned int delta,
                       int width)
{
  double ret;
  int c, lo, hi;
  c = ((32 - width) << 8) | 0x1f;
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
  asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;"
               : "=r"(lo)
               : "r"(lo), "r"(delta), "r"(c), "r"(mask));
  asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;"
               : "=r"(hi)
               : "r"(hi), "r"(delta), "r"(c), "r"(mask));
  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(ret) : "r"(lo), "r"(hi));
  return ret;
}

/* --- xor --- */
__ATTRIBUTES int
__pgi_shfl_xor_synci2(unsigned int mask, int var, unsigned int laneMask)
{
  int ret, c;
  c = 0x1f;
  asm volatile("shfl.sync.bfly.b32 %0, %1, %2, %3, %4;"
               : "=r"(ret)
               : "r"(var), "r"(laneMask), "r"(c), "r"(mask));
  return ret;
}

__ATTRIBUTES int
__pgi_shfl_xor_synci3(unsigned int mask, int var, unsigned int laneMask,
                      int width)
{
  int ret, c;
  c = ((32 - width) << 8) | 0x1f;
  asm volatile("shfl.sync.bfly.b32 %0, %1, %2, %3, %4;"
               : "=r"(ret)
               : "r"(var), "r"(laneMask), "r"(c), "r"(mask));
  return ret;
}

__ATTRIBUTES float
__pgi_shfl_xor_syncf2(unsigned int mask, float var, unsigned int laneMask)
{
  float ret;
  int c;
  c = 0x1f;
  asm volatile("shfl.sync.bfly.b32 %0, %1, %2, %3, %4;"
               : "=f"(ret)
               : "f"(var), "r"(laneMask), "r"(c), "r"(mask));
  return ret;
}

__ATTRIBUTES float
__pgi_shfl_xor_syncf3(unsigned int mask, float var, unsigned int laneMask,
                      int width)
{
  float ret;
  int c;
  c = ((32 - width) << 8) | 0x1f;
  asm volatile("shfl.sync.bfly.b32 %0, %1, %2, %3, %4;"
               : "=f"(ret)
               : "f"(var), "r"(laneMask), "r"(c), "r"(mask));
  return ret;
}

__ATTRIBUTES long long
__pgi_shfl_xor_synck2(unsigned int mask, long long var, unsigned int laneMask)
{
  long long ret;
  int c, lo, hi;
  c = 0x1f;
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
  asm volatile("shfl.sync.bfly.b32 %0, %1, %2, %3, %4;"
               : "=r"(lo)
               : "r"(lo), "r"(laneMask), "r"(c), "r"(mask));
  asm volatile("shfl.sync.bfly.b32 %0, %1, %2, %3, %4;"
               : "=r"(hi)
               : "r"(hi), "r"(laneMask), "r"(c), "r"(mask));
  asm volatile("mov.b64 %0, {%1,%2};" : "=l"(ret) : "r"(lo), "r"(hi));
  return ret;
}

__ATTRIBUTES long long
__pgi_shfl_xor_synck3(unsigned int mask, long long var, unsigned int laneMask,
                      int width)
{
  long long ret;
  int c, lo, hi;
  c = ((32 - width) << 8) | 0x1f;
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
  asm volatile("shfl.sync.bfly.b32 %0, %1, %2, %3, %4;"
               : "=r"(lo)
               : "r"(lo), "r"(laneMask), "r"(c), "r"(mask));
  asm volatile("shfl.sync.bfly.b32 %0, %1, %2, %3, %4;"
               : "=r"(hi)
               : "r"(hi), "r"(laneMask), "r"(c), "r"(mask));
  asm volatile("mov.b64 %0, {%1,%2};" : "=l"(ret) : "r"(lo), "r"(hi));
  return ret;
}

__ATTRIBUTES double
__pgi_shfl_xor_syncd2(unsigned int mask, double var, unsigned int laneMask)
{
  double ret;
  int c, lo, hi;
  c = 0x1f;
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
  asm volatile("shfl.sync.bfly.b32 %0, %1, %2, %3, %4;"
               : "=r"(lo)
               : "r"(lo), "r"(laneMask), "r"(c), "r"(mask));
  asm volatile("shfl.sync.bfly.b32 %0, %1, %2, %3, %4;"
               : "=r"(hi)
               : "r"(hi), "r"(laneMask), "r"(c), "r"(mask));
  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(ret) : "r"(lo), "r"(hi));
  return ret;
}

__ATTRIBUTES double
__pgi_shfl_xor_syncd3(unsigned int mask, double var, unsigned int laneMask,
                      int width)
{
  double ret;
  int c, lo, hi;
  c = ((32 - width) << 8) | 0x1f;
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
  asm volatile("shfl.sync.bfly.b32 %0, %1, %2, %3, %4;"
               : "=r"(lo)
               : "r"(lo), "r"(laneMask), "r"(c), "r"(mask));
  asm volatile("shfl.sync.bfly.b32 %0, %1, %2, %3, %4;"
               : "=r"(hi)
               : "r"(hi), "r"(laneMask), "r"(c), "r"(mask));
  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(ret) : "r"(lo), "r"(hi));
  return ret;
}

#if !defined(PGI_COMPILE_BITCODE)
extern "C" __device__ __cudart_builtin__ unsigned long long CUDARTAPI
cudaCGGetIntrinsicHandle(enum cudaCGScope scope);
extern "C" __device__ __cudart_builtin__ cudaError_t CUDARTAPI
cudaCGSynchronize(unsigned long long handle, unsigned int flags);
extern "C" __device__ __cudart_builtin__ cudaError_t CUDARTAPI
cudaCGGetSize(unsigned int *numThreads, unsigned int *numGrids,
              unsigned long long handle);
extern "C" __device__ __cudart_builtin__ cudaError_t CUDARTAPI
cudaCGGetRank(unsigned int *threadRank, unsigned int *gridRank,
              unsigned long long handle);
#else /* defined(PGI_COMPILE_BITCODE) */
unsigned long long
cudaCGGetIntrinsicHandle(enum __device_builtin__ cudaCGScope);
cudaError_t cudaCGSynchronize(unsigned long long handle, unsigned int flags);
void *memset(void *, int, size_t);
#endif /* defined(PGI_COMPILE_BITCODE) */

typedef struct __pgi_PGI_Grid {
  unsigned long long handle;
  unsigned int size;
  unsigned int rank;
} __pgi_PGI_Grid;

__ATTRIBUTES void
__pgi_grid_this_grid_(signed char *gg)
{
  int __T1;
  __pgi_PGI_Grid *g = (__pgi_PGI_Grid *)gg;
  memset(g, 0, 16ULL);
  g->handle = cudaCGGetIntrinsicHandle(cudaCGScopeGrid);
#if defined(PGI_COMPILE_BITCODE)
  g->size = (((__pgi_numthreads(3) * __pgi_numblocks(3)) *
              (__pgi_numthreads(2) * __pgi_numblocks(2))) *
             (__pgi_numthreads(1) * __pgi_numblocks(1)));
  __T1 = (((__pgi_blockidx(3) * __pgi_numblocks(2)) * __pgi_numblocks(1)) +
          (__pgi_blockidx(2) * __pgi_numblocks(1))) +
         __pgi_blockidx(1);
  g->rank = __T1 * ((__pgi_numthreads(1) * __pgi_numthreads(2)) *
                    __pgi_numthreads(3)) +
            ((__pgi_threadidx(3) * __pgi_numthreads(2)) * __pgi_numthreads(1)) +
            (__pgi_threadidx(2) * __pgi_numthreads(1)) + __pgi_threadidx(1) + 1;
  /* Fortran starts at one!     -------------^ */
#else /* !defined(PGI_COMPILE_BITCODE) */
  g->size = ((((blockDim.z) * (gridDim.z)) * ((blockDim.y) * (gridDim.y))) *
             ((blockDim.x) * (gridDim.x)));
  __T1 = (((blockIdx.z * gridDim.y) * gridDim.x) + (blockIdx.y * gridDim.x)) +
         blockIdx.x;
  g->rank = __T1 * ((blockDim.x * blockDim.y) * blockDim.z) +
            ((threadIdx.z * blockDim.y) * blockDim.x) +
            (threadIdx.y * blockDim.x) + threadIdx.x + 1;
  /* Fortran starts at one!     -------------^ */
#endif /* !defined(PGI_COMPILE_BITCODE) */
  return;
}

__ATTRIBUTES void
__pgi_grid_syncthreads_grid(signed char *gg)
{
  __pgi_PGI_Grid *g = (__pgi_PGI_Grid *)gg;
  unsigned long long __T2 = g->handle;
  cudaCGSynchronize(__T2, 0U);
  return;
}

__ATTRIBUTES void
__pgi_cg_this_thread_block_(signed char *tg)
{
  __pgi_PGI_Grid *g = (__pgi_PGI_Grid *)tg;
  memset(g, 0, 16);
  /* No handle yet, from what I can tell */
#if defined(PGI_COMPILE_BITCODE)
  g->size = __pgi_numthreads(3) * __pgi_numthreads(2) * __pgi_numthreads(1);
  g->rank = ((__pgi_threadidx(3) * __pgi_numthreads(2)) * __pgi_numthreads(1)) +
            (__pgi_threadidx(2) * __pgi_numthreads(1)) + __pgi_threadidx(1) + 1;
  /* Fortran starts at one!     -------------^ */
#else /* !defined(PGI_COMPILE_BITCODE) */
  g->size = blockDim.z * blockDim.y * blockDim.x;
  g->rank = ((threadIdx.z * blockDim.y) * blockDim.x) +
            (threadIdx.y * blockDim.x) + threadIdx.x + 1;
  /* Fortran starts at one!     -------------^ */
#endif /* !defined(PGI_COMPILE_BITCODE) */
  return;
}

__ATTRIBUTES void
__pgi_cg_syncthreads_thread_block(signed char *tg)
{
  unsigned int csize;
  __pgi_PGI_Grid *g = (__pgi_PGI_Grid *)tg;
#if defined(PGI_COMPILE_BITCODE)
  csize = __pgi_numthreads(3) * __pgi_numthreads(2) * __pgi_numthreads(1);
#else /* !defined(PGI_COMPILE_BITCODE) */
  csize = blockDim.z * blockDim.y * blockDim.x;
#endif /* !defined(PGI_COMPILE_BITCODE) */
  if (g->size == csize)
    __syncthreads();
  /* That's all we support right now */
  return;
}

__ATTRIBUTES void
__pgi_cg_this_warp_(signed char *wg)
{
  __pgi_PGI_Grid *g = (__pgi_PGI_Grid *)wg;
  memset(g, 0, 16);
  /* No handle yet */
  g->size = 32;
  g->rank = (__pgi_threadidx(1) & 31) + 1;
  /* Fortran starts at one!     --------^ */
}

__ATTRIBUTES void
__pgi_cg_syncthreads_warp(signed char *wg)
{
  __pgi_PGI_Grid *g = (__pgi_PGI_Grid *)wg;
  unsigned int mask = 0xffffffff;
  if (g->size == 32)
    asm volatile("bar.warp.sync %0;" ::"r"(mask));
  /* That's all we support right now */
  return;
}

__ATTRIBUTES long long
__pgi_kishftc(long long val, int sc,
              int rc) /* value, shift count, rightmost bit count */
{
  unsigned long long mask = 0xffffffffffffffff, field, tmp1, tmp2;
  int n;
  if (rc <= 0)
    return val;
  mask >>= 64 - rc;
  field = val & mask;
  if (sc >= 0) {
    for (n = sc; n >= rc; n -= rc)
      ;
    if (n == 0)
      return val;
    tmp1 = field << n;
    tmp2 = field >> (rc - n);
  } else {
    for (n = -sc; n >= rc; n -= rc)
      ;
    if (n == 0)
      return val;
    tmp1 = field >> n;
    tmp2 = field << (rc - n);
  }
  return (val ^ field) | ((tmp1 | tmp2) & mask);
}

__ATTRIBUTES void
__pgi_sincosf(float x, void *y, void *z)
{
  sincosf(x, (float *)y, (float *)z);
}

#if !defined(PGI_COMPILE_BITCODE)
__ATTRIBUTES void
__pgi_fast_sincosf(float x, void *y, void *z)
{
  __sincosf(x, (float *)y, (float *)z);
}
#endif 

__ATTRIBUTES void
__pgi_sincos(double x, void *y, void *z)
{
  sincos(x, (double *)y, (double *)z);
}

__ATTRIBUTES void
__pgi_sincospif(float x, void *y, void *z)
{
  sincospif(x, (float *)y, (float *)z);
}

__ATTRIBUTES void
__pgi_sincospi(double x, void *y, void *z)
{
  sincospi(x, (double *)y, (double *)z);
}

__ATTRIBUTES long long
__pgi_c_devloc(signed char *x)
{
  return ((long long)x);
}

__ATTRIBUTES long long
__pgi_bktest(long long val, int bit)
{
  return (val << (63 - bit)) >> 63;
}

__ATTRIBUTES long long
__pgi_kibclr(long long val, int bit)
{
  return val & ~(1LL << bit);
}

__ATTRIBUTES long long
__pgi_kibset(long long val, int bit)
{
  return val | (1LL << bit);
}

__ATTRIBUTES long long
__pgi_kibits(long long val, int bit, int numbits)
{
  return ((val >> bit) & (((unsigned long long)-1LL) >> (64 - numbits)));
}

__ATTRIBUTES float
__pgi_rpowk(long long i, float x)
{
  long long k;
  float f;

  f = 1;
  k = i;
  if (k < 0)
    k = -k;
  for (;;) {
    if (k & 1)
      f *= x;
    k >>= 1;
    if (k == 0)
      break;
    x *= x;
  }
  if (i < 0)
    f = 1.0f / f;
  return f;
}

__ATTRIBUTES double
__pgi_dpowk(long long i, double x)
{
  long long k;
  double f;

  f = 1;
  k = i;
  if (k < 0)
    k = -k;
  for (;;) {
    if (k & 1)
      f *= x;
    k >>= 1;
    if (k == 0)
      break;
    x *= x;
  }
  if (i < 0)
    f = 1.0 / f;
  return f;
}

__ATTRIBUTES void
__pgi_cddiv(signed char *res, double real1, double imag1, double real2,
            double imag2)
{
  double x, y, r, d, r_mag, i_mag;
  r_mag = real2;
  if (r_mag < 0)
    r_mag = -r_mag;
  i_mag = imag2;
  if (i_mag < 0)
    i_mag = -i_mag;
  if (r_mag <= i_mag) {
    r = real2 / imag2;
    d = 1.0 / (imag2 * (1.0 + r * r));
    x = (real1 * r + imag1) * d;
    y = (imag1 * r - real1) * d;
  } else {
    r = imag2 / real2;
    d = 1.0 / (real2 * (1.0 + r * r));
    x = (real1 + imag1 * r) * d;
    y = (imag1 - real1 * r) * d;
  }
  ((double *)res)[0] = x;
  ((double *)res)[1] = y;
}

__ATTRIBUTES void
__pgi_cddivd(signed char *res, double real1, double imag1, double d)
{
  double x, y;
  x = real1 / d;
  y = imag1 / d;
  ((double *)res)[0] = x;
  ((double *)res)[1] = y;
}

__ATTRIBUTES void
__pgi_cdpowi(signed char *res, double real, double imag, int i)
{
  int k;
  double fr, fi, gr, gi, tr, ti;
  fr = 1.0;
  fi = 0.0;
  gr = real;
  gi = imag;
  k = i;
  if (i < 0)
    k = -i;
  while (k) {
    if (k & 1) {
      tr = fr * gr - fi * gi;
      ti = fr * gi + fi * gr;
      fr = tr;
      fi = ti;
    }
    k = (unsigned)k >> 1;
    tr = gr * gr - gi * gi;
    ti = 2.0 * gr * gi;
    gr = tr;
    gi = ti;
  }
  if (i < 0) {
    __pgi_cddiv(res, 1.0, 0.0, fr, fi);
  } else {
    ((double *)res)[0] = fr;
    ((double *)res)[1] = fi;
  }
}

__ATTRIBUTES void
__pgi_cdpowk(signed char *res, double real, double imag, long long i)
{
  long long k;
  double fr, fi, gr, gi, tr, ti;
  fr = 1.0;
  fi = 0.0;
  gr = real;
  gi = imag;
  k = i;
  if (i < 0)
    k = -i;
  while (k) {
    if (k & 1) {
      tr = fr * gr - fi * gi;
      ti = fr * gi + fi * gr;
      fr = tr;
      fi = ti;
    }
    k = (unsigned long long)k >> 1;
    tr = gr * gr - gi * gi;
    ti = 2.0 * gr * gi;
    gr = tr;
    gi = ti;
  }
  if (i < 0) {
    __pgi_cddiv(res, 1.0, 0.0, fr, fi);
  } else {
    ((double *)res)[0] = fr;
    ((double *)res)[1] = fi;
  }
}

__ATTRIBUTES void
__pgi_zcddiv(double real1, double imag1, double real2, double imag2,
             signed char *res)
{
  double x, y, r, d, r_mag, i_mag;
  r_mag = real2;
  if (r_mag < 0)
    r_mag = -r_mag;
  i_mag = imag2;
  if (i_mag < 0)
    i_mag = -i_mag;
  if (r_mag <= i_mag) {
    r = real2 / imag2;
    d = 1.0 / (imag2 * (1.0 + r * r));
    x = (real1 * r + imag1) * d;
    y = (imag1 * r - real1) * d;
  } else {
    r = imag2 / real2;
    d = 1.0 / (real2 * (1.0 + r * r));
    x = (real1 + imag1 * r) * d;
    y = (imag1 - real1 * r) * d;
  }
  ((double *)res)[0] = x;
  ((double *)res)[1] = y;
}

__ATTRIBUTES void
__pgi_zcddivd(double real1, double imag1, double d, signed char *res)
{
  double x, y;
  x = real1 / d;
  y = imag1 / d;
  ((double *)res)[0] = x;
  ((double *)res)[1] = y;
}

__ATTRIBUTES void
__pgi_zcdpowi(double real, double imag, int i, signed char *res)
{
  int k;
  double fr, fi, gr, gi, tr, ti;
  fr = 1.0;
  fi = 0.0;
  gr = real;
  gi = imag;
  k = i;
  if (i < 0)
    k = -i;
  while (k) {
    if (k & 1) {
      tr = fr * gr - fi * gi;
      ti = fr * gi + fi * gr;
      fr = tr;
      fi = ti;
    }
    k = (unsigned)k >> 1;
    tr = gr * gr - gi * gi;
    ti = 2.0 * gr * gi;
    gr = tr;
    gi = ti;
  }
  if (i < 0) {
    __pgi_cddiv(res, 1.0, 0.0, fr, fi);
  } else {
    ((double *)res)[0] = fr;
    ((double *)res)[1] = fi;
  }
}

__ATTRIBUTES void
__pgi_zcdpowk(double real, double imag, long long i, signed char *res)
{
  long long k;
  double fr, fi, gr, gi, tr, ti;
  fr = 1.0;
  fi = 0.0;
  gr = real;
  gi = imag;
  k = i;
  if (i < 0)
    k = -i;
  while (k) {
    if (k & 1) {
      tr = fr * gr - fi * gi;
      ti = fr * gi + fi * gr;
      fr = tr;
      fi = ti;
    }
    k = (unsigned long long)k >> 1;
    tr = gr * gr - gi * gi;
    ti = 2.0 * gr * gi;
    gr = tr;
    gi = ti;
  }
  if (i < 0) {
    __pgi_cddiv(res, 1.0, 0.0, fr, fi);
  } else {
    ((double *)res)[0] = fr;
    ((double *)res)[1] = fi;
  }
}

__ATTRIBUTES int
__pgi_dnint(double a)
{
  return (int)(a + ((a >= 0.0) ? 0.5 : -0.5));
}

__ATTRIBUTES long long
__pgi_lldnint(double a)
{
  return (long long)(a + ((a >= 0.0) ? 0.5 : -0.5));
}


__ATTRIBUTES void
__pgi_cdiv(signed char *res, float real1, float imag1, float real2, float imag2)
{
  float x, y, r, d, r_mag, i_mag;
  r_mag = real2;
  if (r_mag < 0)
    r_mag = -r_mag;
  i_mag = imag2;
  if (i_mag < 0)
    i_mag = -i_mag;
  if (r_mag <= i_mag) {
    r = real2 / imag2;
    d = 1.0f / (imag2 * (1.0f + r * r));
    x = (real1 * r + imag1) * d;
    y = (imag1 * r - real1) * d;
  } else {
    r = imag2 / real2;
    d = 1.0f / (real2 * (1.0f + r * r));
    x = (real1 + imag1 * r) * d;
    y = (imag1 - real1 * r) * d;
  }
  ((float *)res)[0] = x;
  ((float *)res)[1] = y;
}

__ATTRIBUTES float
__pgi_cdiv_real(float real1, float imag1, float real2, float imag2)
{
  float x;
  float r, d, r_mag, i_mag;

  r_mag = real2;
  if (r_mag < 0)
    r_mag = -r_mag;
  i_mag = imag2;
  if (i_mag < 0)
    i_mag = -i_mag;
  /* avoid overflow */
  if (r_mag <= i_mag) {
    r = real2 / imag2;
    d = 1.0f / (imag2 * (1 + r * r));
    x = (real1 * r + imag1) * d;
    /*y = (imag1 * r - real1) * d;*/
  } else {
    r = imag2 / real2;
    d = 1.0f / (real2 * (1 + r * r));
    x = (real1 + imag1 * r) * d;
    /*y = (imag1 - real1 * r) * d;*/
  }
  return x;
}

__ATTRIBUTES float
__pgi_cdiv_imag(float real1, float imag1, float real2, float imag2)
{
  float y;
  float r, d, r_mag, i_mag;

  r_mag = real2;
  if (r_mag < 0)
    r_mag = -r_mag;
  i_mag = imag2;
  if (i_mag < 0)
    i_mag = -i_mag;
  /* avoid overflow */
  if (r_mag <= i_mag) {
    r = real2 / imag2;
    d = 1.0f / (imag2 * (1 + r * r));
    /*x = (real1 * r + imag1) * d;*/
    y = (imag1 * r - real1) * d;
  } else {
    r = imag2 / real2;
    d = 1.0f / (real2 * (1 + r * r));
    /*x = (real1 + imag1 * r) * d;*/
    y = (imag1 - real1 * r) * d;
  }
  return y;
}

__ATTRIBUTES void
__pgi_cdsqrt(signed char *res, double real1, double imag1)
{
  double a, x, y;
  a = hypot(real1, imag1);
  if (a == 0.0) {
    x = 0.0;
    y = 0.0;
  } else if (real1 > 0.0) {
    x = sqrt(0.5 * (a + real1));
    y = 0.5 * (imag1 / x);
  } else {
    y = sqrt(0.5 * (a - real1));
    if (imag1 < 0.0)
      y = -y;
    x = 0.5 * (imag1 / y);
  }
  ((double *)res)[0] = x;
  ((double *)res)[1] = y;
}

__ATTRIBUTES void
__pgi_cdexp(signed char *res, double real1, double imag1)
{
  double x, y, z;
  x = exp(real1);
  sincos(imag1, &z, &y);
  y *= x;
  z *= x;
  ((double *)res)[0] = y;
  ((double *)res)[1] = z;
}

__ATTRIBUTES void
__pgi_cdlog(signed char *res, double real1, double imag1)
{
  double x, y;
  x = atan2(imag1, real1);
  y = log(hypot(real1, imag1));
  ((double *)res)[0] = y;
  ((double *)res)[1] = x;
}

__ATTRIBUTES void
__pgi_cdcos(signed char *res, double real1, double imag1)
{
  double x, y;
  x = cos(real1);
  y = sin(real1);
  x = x * cosh(imag1);
  y = -y * sinh(imag1);
  ((double *)res)[0] = x;
  ((double *)res)[1] = y;
}

__ATTRIBUTES void
__pgi_cdsin(signed char *res, double real1, double imag1)
{
  double x, y;
  x = sin(real1);
  y = cos(real1);
  x = x * cosh(imag1);
  y = y * sinh(imag1);
  ((double *)res)[0] = x;
  ((double *)res)[1] = y;
}

__ATTRIBUTES void
__pgi_cdpowcd(signed char *res, double real1, double imag1, double real2,
              double imag2)
{
  double logr, logi, x, y, z, w;
  logr = log(hypot(real1, imag1));
  logi = atan2(imag1, real1);

  x = exp(logr * real2 - logi * imag2);
  y = logr * imag2 + logi * real2;
  z = x * cos(y);
  w = x * sin(y);
  ((double *)res)[0] = z;
  ((double *)res)[1] = w;
}

__ATTRIBUTES void
__pgi_zcdsqrt(double real1, double imag1, signed char *res)
{
  double a, x, y;
  a = hypot(real1, imag1);
  if (a == 0.0) {
    x = 0.0;
    y = 0.0;
  } else if (real1 > 0.0) {
    x = sqrt(0.5 * (a + real1));
    y = 0.5 * (imag1 / x);
  } else {
    y = sqrt(0.5 * (a - real1));
    if (imag1 < 0.0)
      y = -y;
    x = 0.5 * (imag1 / y);
  }
  ((double *)res)[0] = x;
  ((double *)res)[1] = y;
}

__ATTRIBUTES void
__pgi_zcdexp(double real1, double imag1, signed char *res)
{
  double x, y, z;
  x = exp(real1);
  sincos(imag1, &z, &y);
  y *= x;
  z *= x;
  ((double *)res)[0] = y;
  ((double *)res)[1] = z;
}

__ATTRIBUTES void
__pgi_zcdlog(double real1, double imag1, signed char *res)
{
  double x, y;
  x = atan2(imag1, real1);
  y = log(hypot(real1, imag1));
  ((double *)res)[0] = y;
  ((double *)res)[1] = x;
}

__ATTRIBUTES void
__pgi_zcdcos(double real1, double imag1, signed char *res)
{
  double x, y;
  x = cos(real1);
  y = sin(real1);
  x = x * cosh(imag1);
  y = -y * sinh(imag1);
  ((double *)res)[0] = x;
  ((double *)res)[1] = y;
}

__ATTRIBUTES void
__pgi_zcdsin(double real1, double imag1, signed char *res)
{
  double x, y;
  x = sin(real1);
  y = cos(real1);
  x = x * cosh(imag1);
  y = y * sinh(imag1);
  ((double *)res)[0] = x;
  ((double *)res)[1] = y;
}

__ATTRIBUTES void
__pgi_zcdpowcd(double real1, double imag1, double real2, double imag2,
               signed char *res)
{
  double logr, logi, x, y, z, w;
  logr = log(hypot(real1, imag1));
  logi = atan2(imag1, real1);

  x = exp(logr * real2 - logi * imag2);
  y = logr * imag2 + logi * real2;
  z = x * cos(y);
  w = x * sin(y);
  ((double *)res)[0] = z;
  ((double *)res)[1] = w;
}

__ATTRIBUTES double
__pgi_dcdiv_real(double real1, double imag1, double real2, double imag2)
{
  double x;
  double r, d, r_mag, i_mag;

  r_mag = real2;
  if (r_mag < 0)
    r_mag = -r_mag;
  i_mag = imag2;
  if (i_mag < 0)
    i_mag = -i_mag;
  /* avoid overflow */
  if (r_mag <= i_mag) {
    r = real2 / imag2;
    d = 1.0 / (imag2 * (1.0 + r * r));
    x = (real1 * r + imag1) * d;
    /*y = (imag1 * r - real1) * d;*/
  } else {
    r = imag2 / real2;
    d = 1.0 / (real2 * (1.0 + r * r));
    x = (real1 + imag1 * r) * d;
    /*y = (imag1 - real1 * r) * d;*/
  }
  return x;
}

__ATTRIBUTES double
__pgi_dcdiv_imag(double real1, double imag1, double real2, double imag2)
{
  double y;
  double r, d, r_mag, i_mag;

  r_mag = real2;
  if (r_mag < 0)
    r_mag = -r_mag;
  i_mag = imag2;
  if (i_mag < 0)
    i_mag = -i_mag;
  /* avoid overflow */
  if (r_mag <= i_mag) {
    r = real2 / imag2;
    d = 1.0 / (imag2 * (1.0 + r * r));
    /*x = (real1 * r + imag1) * d;*/
    y = (imag1 * r - real1) * d;
  } else {
    r = imag2 / real2;
    d = 1.0 / (real2 * (1.0 + r * r));
    /*x = (real1 + imag1 * r) * d;*/
    y = (imag1 - real1 * r) * d;
  }
  return y;
}

__ATTRIBUTES void
__pgi_cdacos(signed char *res, double real1, double imag1)
{
  /* cacos(z) = -i * clog(z + i * csqrt(1 - z * z)) */
  double x, y;
  dcmplx2 tmp;

  /* z*z */
  x = real1 * real1 - imag1 * imag1;
  y = real1 * imag1 + imag1 * real1;
  x = 1.0 - x;
  y = -y;
  __pgi_cdsqrt((signed char *)(&tmp), x, y);

  /* i*csqrt */
  x = -tmp.i;
  y = tmp.r;

  /* z+ */
  x = real1 + x;
  y = imag1 + y;
  __pgi_cdlog((signed char *)(&tmp), x, y);
  x = tmp.i;
  y = -tmp.r;

  ((double *)res)[0] = x;
  ((double *)res)[1] = y;
}

__ATTRIBUTES void
__pgi_cdasin(signed char *res, double real1, double imag1)
{
  /* casin(z) = -i * clog((z * i) + csqrt(1 - z * z)) */
  double x, y;
  dcmplx2 tmp;

  /* z*z */
  x = real1 * real1 - imag1 * imag1;
  y = real1 * imag1 + imag1 * real1;
  x = 1.0 - x;
  y = -y;
  __pgi_cdsqrt((signed char *)(&tmp), x, y);

  /* z*i + */
  x = -imag1 + tmp.r;
  y = real1 + tmp.i;

  __pgi_cdlog((signed char *)(&tmp), x, y);
  x = tmp.i;
  y = -tmp.r;

  ((double *)res)[0] = x;
  ((double *)res)[1] = y;
}


__ATTRIBUTES void
__pgi_cdatan(signed char *res, double real1, double imag1)
{
  /* cacos(z) = i/2 * ( clog(1 - iz) - clog(1 + iz) )*/
  double x, y, x1, y1, x2, y2;
  dcmplx2 tmp;

  /* iz */
  x = -imag1;
  y = real1;

  /* clog(1-iz) */
  x1 = 1 - x;
  y1 = -y;
  __pgi_cdlog((signed char *)(&tmp), x1, y1);
  x1 = tmp.r;
  y1 = tmp.i;

  /* clog(1+iz) */
  x2 = 1 + x;
  y2 = y;
  __pgi_cdlog((signed char *)(&tmp), x2, y2);
  x2 = tmp.r;
  y2 = tmp.i;

  /* i/2 * clog(1 - iz) - clog(1 + iz) */
  x = -((y1 - y2) * 0.5);
  y = (x1 - x2) * 0.5;

  ((double *)res)[0] = x;
  ((double *)res)[1] = y;
}

__ATTRIBUTES void
__pgi_cdtan(signed char *res, double real1, double imag1)
{
  /* ctan(z) = csin(z) / ccos(z) */
  double x, y, x1, y1;
  dcmplx2 tmp;

  __pgi_cdsin((signed char *)(&tmp), real1, imag1);
  x = tmp.r;
  y = tmp.i;
  __pgi_cdcos((signed char *)(&tmp), real1, imag1);
  x1 = tmp.r;
  y1 = tmp.i;
  __pgi_cddiv((signed char *)(&tmp), x, y, x1, y1);
  x = tmp.r;
  y = tmp.i;

  ((double *)res)[0] = x;
  ((double *)res)[1] = y;
}

__ATTRIBUTES void
__pgi_cdcosh(signed char *res, double real1, double imag1)
{
  /* (exp(z)+exp(-z))/2 */
  double x, y;
  dcmplx2 tmp;

  __pgi_cdexp((signed char *)(&tmp), real1, imag1);
  x = tmp.r;
  y = tmp.i;
  __pgi_cdexp((signed char *)(&tmp), -real1, -imag1);
  x += tmp.r;
  y += tmp.i;

  x *= 0.5;
  y *= 0.5;

  ((double *)res)[0] = x;
  ((double *)res)[1] = y;
}

__ATTRIBUTES void
__pgi_cdsinh(signed char *res, double real1, double imag1)
{
  /* (exp(z)-exp(-z))/2 */
  double x, y;
  dcmplx2 tmp;

  __pgi_cdexp((signed char *)(&tmp), real1, imag1);
  x = tmp.r;
  y = tmp.i;
  __pgi_cdexp((signed char *)(&tmp), -real1, -imag1);
  x -= tmp.r;
  y -= tmp.i;

  x *= 0.5;
  y *= 0.5;

  ((double *)res)[0] = x;
  ((double *)res)[1] = y;
}

__ATTRIBUTES void
__pgi_cdtanh(signed char *res, double real1, double imag1)
{
  /* ctanh(z) = csinh(z) / ccosh(z) */
  double x, y, x1, y1;
  dcmplx2 tmp;

  __pgi_cdsinh((signed char *)(&tmp), real1, imag1);
  x = tmp.r;
  y = tmp.i;
  __pgi_cdcosh((signed char *)(&tmp), real1, imag1);
  x1 = tmp.r;
  y1 = tmp.i;
  __pgi_cddiv((signed char *)(&tmp), x, y, x1, y1);
  x = tmp.r;
  y = tmp.i;

  ((double *)res)[0] = x;
  ((double *)res)[1] = y;
}

__ATTRIBUTES double
__pgi_wrap_bessel_jn(int n, double x)
{
  if (n < 0) {
    int nn = -n;
    if ((nn/2)*2==nn)
      return jn(nn,x);
    else
      return -jn(nn,x);
  } else {
    return jn(n,x);
  }
}


__ATTRIBUTES void
__pgi_csqrt(signed char *res, float real1, float imag1)
{
  float a, x, y;
  a = hypotf(real1, imag1);
  if (a == 0.0f) {
    x = 0.0f;
    y = 0.0f;
  } else if (real1 > 0.0f) {
    x = sqrtf(0.5f * (a + real1));
    y = 0.5f * (imag1 / x);
  } else {
    y = sqrtf(0.5f * (a - real1));
    if (imag1 < 0.0f)
      y = -y;
    x = 0.5f * (imag1 / y);
  }
  ((float *)res)[0] = x;
  ((float *)res)[1] = y;
}

__ATTRIBUTES void
__pgi_cexp(signed char *res, float real1, float imag1)
{
  float x, y, z;
  x = expf(real1);
  sincosf(imag1, &z, &y);
  y *= x;
  z *= x;
  ((float *)res)[0] = y;
  ((float *)res)[1] = z;
}

__ATTRIBUTES void
__pgi_clog(signed char *res, float real1, float imag1)
{
  float x, y;
  x = atan2f(imag1, real1);
  y = logf(hypotf(real1, imag1));
  ((float *)res)[0] = y;
  ((float *)res)[1] = x;
}

__ATTRIBUTES void
__pgi_cacos(signed char *res, float real1, float imag1)
{
  /* cacos(z) = -i * clog(z + i * csqrt(1 - z * z)) */
  float x, y;
  cmplx2 tmp;

  /* z*z */
  x = real1 * real1 - imag1 * imag1;
  y = real1 * imag1 + imag1 * real1;
  x = 1.0f - x;
  y = -y;
  __pgi_csqrt((signed char *)(&tmp), x, y);

  /* i*csqrt */
  x = -tmp.i;
  y = tmp.r;

  /* z+ */
  x = real1 + x;
  y = imag1 + y;
  __pgi_clog((signed char *)(&tmp), x, y);
  x = tmp.i;
  y = -tmp.r;

  ((float *)res)[0] = x;
  ((float *)res)[1] = y;
}

__ATTRIBUTES void
__pgi_casin(signed char *res, float real1, float imag1)
{
  /* casin(z) = -i * clog((z * i) + csqrt(1 - z * z)) */
  float x, y;
  cmplx2 tmp;

  /* z*z */
  x = real1 * real1 - imag1 * imag1;
  y = real1 * imag1 + imag1 * real1;
  x = 1.0f - x;
  y = -y;
  __pgi_csqrt((signed char *)(&tmp), x, y);

  /* z*i + */
  x = -imag1 + tmp.r;
  y = real1 + tmp.i;

  __pgi_clog((signed char *)(&tmp), x, y);
  x = tmp.i;
  y = -tmp.r;

  ((float *)res)[0] = x;
  ((float *)res)[1] = y;
}

__ATTRIBUTES void
__pgi_catan(signed char *res, float real1, float imag1)
{
  /* cacos(z) = i/2 * ( clog(1 - iz) - clog(1 + iz) )*/
  float x, y, x1, y1, x2, y2;
  cmplx2 tmp;

  /* iz */
  x = -imag1;
  y = real1;

  /* clog(1-iz) */
  x1 = 1 - x;
  y1 = -y;
  __pgi_clog((signed char *)(&tmp), x1, y1);
  x1 = tmp.r;
  y1 = tmp.i;

  /* clog(1+iz) */
  x2 = 1 + x;
  y2 = y;
  __pgi_clog((signed char *)(&tmp), x2, y2);
  x2 = tmp.r;
  y2 = tmp.i;

  /* i/2 * clog(1 - iz) - clog(1 + iz) */
  x = -((y1 - y2) * 0.5f);
  y = (x1 - x2) * 0.5f;

  ((float *)res)[0] = x;
  ((float *)res)[1] = y;
}

__ATTRIBUTES void
__pgi_ccos(signed char *res, float real1, float imag1)
{
  float x, y;
  x = cosf(real1);
  y = sinf(real1);
  x = x * coshf(imag1);
  y = -y * sinhf(imag1);
  ((float *)res)[0] = x;
  ((float *)res)[1] = y;
}

__ATTRIBUTES void
__pgi_csin(signed char *res, float real1, float imag1)
{
  float x, y;
  x = sinf(real1);
  y = cosf(real1);
  x = x * coshf(imag1);
  y = y * sinhf(imag1);
  ((float *)res)[0] = x;
  ((float *)res)[1] = y;
}

__ATTRIBUTES void
__pgi_ctan(signed char *res, float real1, float imag1)
{
  /* ctan(z) = csin(z) / ccos(z) */
  float x, y, x1, y1;
  cmplx2 tmp;

  __pgi_csin((signed char *)(&tmp), real1, imag1);
  x = tmp.r;
  y = tmp.i;
  __pgi_ccos((signed char *)(&tmp), real1, imag1);
  x1 = tmp.r;
  y1 = tmp.i;
  __pgi_cdiv((signed char *)(&tmp), x, y, x1, y1);
  x = tmp.r;
  y = tmp.i;

  ((float *)res)[0] = x;
  ((float *)res)[1] = y;
}

__ATTRIBUTES void
__pgi_ccosh(signed char *res, float real1, float imag1)
{
  /* (exp(z)+exp(-z))/2 */
  float x, y;
  cmplx2 tmp;

  __pgi_cexp((signed char *)(&tmp), real1, imag1);
  x = tmp.r;
  y = tmp.i;
  __pgi_cexp((signed char *)(&tmp), -real1, -imag1);
  x += tmp.r;
  y += tmp.i;

  x *= 0.5f;
  y *= 0.5f;

  ((float *)res)[0] = x;
  ((float *)res)[1] = y;
}

__ATTRIBUTES void
__pgi_csinh(signed char *res, float real1, float imag1)
{
  /* (exp(z)-exp(-z))/2 */
  float x, y;
  cmplx2 tmp;

  __pgi_cexp((signed char *)(&tmp), real1, imag1);
  x = tmp.r;
  y = tmp.i;
  __pgi_cexp((signed char *)(&tmp), -real1, -imag1);
  x -= tmp.r;
  y -= tmp.i;

  x *= 0.5f;
  y *= 0.5f;

  ((float *)res)[0] = x;
  ((float *)res)[1] = y;
}

__ATTRIBUTES void
__pgi_ctanh(signed char *res, float real1, float imag1)
{
  /* ctanh(z) = csinh(z) / ccosh(z) */
  float x, y, x1, y1;
  cmplx2 tmp;

  __pgi_csinh((signed char *)(&tmp), real1, imag1);
  x = tmp.r;
  y = tmp.i;
  __pgi_ccosh((signed char *)(&tmp), real1, imag1);
  x1 = tmp.r;
  y1 = tmp.i;
  __pgi_cdiv((signed char *)(&tmp), x, y, x1, y1);
  x = tmp.r;
  y = tmp.i;

  ((float *)res)[0] = x;
  ((float *)res)[1] = y;
}

__ATTRIBUTES void
__pgi_cpowi(signed char *res, float real, float imag, int i)
{
  int k;
  float fr, fi, gr, gi, tr, ti;
  fr = 1.0f;
  fi = 0.0f;
  gr = real;
  gi = imag;
  k = i;
  if (i < 0)
    k = -i;
  while (k) {
    if (k & 1) {
      tr = fr * gr - fi * gi;
      ti = fr * gi + fi * gr;
      fr = tr;
      fi = ti;
    }
    k = (unsigned)k >> 1;
    tr = gr * gr - gi * gi;
    ti = 2.0f * gr * gi;
    gr = tr;
    gi = ti;
  }
  if (i < 0) {
    __pgi_cdiv(res, 1.0f, 0.0f, fr, fi);
  } else {
    ((float *)res)[0] = fr;
    ((float *)res)[1] = fi;
  }
}

__ATTRIBUTES void
__pgi_cpowk(signed char *res, float real, float imag, long long i)
{
  long long k;
  float fr, fi, gr, gi, tr, ti;
  fr = 1.0f;
  fi = 0.0f;
  gr = real;
  gi = imag;
  k = i;
  if (i < 0)
    k = -i;
  while (k) {
    if (k & 1) {
      tr = fr * gr - fi * gi;
      ti = fr * gi + fi * gr;
      fr = tr;
      fi = ti;
    }
    k = (unsigned long long)k >> 1;
    tr = gr * gr - gi * gi;
    ti = 2.0f * gr * gi;
    gr = tr;
    gi = ti;
  }
  if (i < 0) {
    __pgi_cdiv(res, 1.0f, 0.0f, fr, fi);
  } else {
    ((float *)res)[0] = fr;
    ((float *)res)[1] = fi;
  }
}

__ATTRIBUTES void
__pgi_cpowc(signed char *res, float real1, float imag1, float real2,
            float imag2)
{
  float logr, logi, x, y, z, w;
  logr = logf(hypotf(real1, imag1));
  logi = atan2f(imag1, real1);

  x = expf(logr * real2 - logi * imag2);
  y = logr * imag2 + logi * real2;
  z = x * cosf(y);
  w = x * sinf(y);
  ((float *)res)[0] = z;
  ((float *)res)[1] = w;
}

__ATTRIBUTES int
__pgi_ipowi(int i, int x)
{
  int f, j;
  if (x == 2) {
    if (i >= 0)
      return 1 << i;
    return 0;
  }
  if (i < 0) {
    if (x == 1 || x == -1) {
      if (i & 1)
        return x;
      return 1;
    }
    return 0;
  }

  j = i & 1;             // j is 1 or 0
  f = (j * x) + (1 - j); // x or 1
  while ((i >>= 1)) {
    x *= x;
    j = i & 1;              // j is 1 or 0
    f *= (j * x) + (1 - j); // *x or *1
  }
  return f;
}

__ATTRIBUTES float
__pgi_wrap_bessel_jnf(int n, float x)
{
  if (n < 0) {
    int nn = -n;
    if ((nn/2)*2==nn)
      return jnf(nn,x);
    else
      return -jnf(nn,x);
  } else {
    return jnf(n,x);
  }
}

__ATTRIBUTES float
__pgi_acosd(float x)
{
  return CNVRTRADF(acosf(x));
}

__ATTRIBUTES float
__pgi_asind(float x)
{
  return CNVRTRADF(asinf(x));
}

__ATTRIBUTES float
__pgi_atan2d(float x, float y)
{
  return CNVRTRADF(atan2f(x, y));
}

__ATTRIBUTES float
__pgi_atand(float x)
{
  return CNVRTRADF(atanf(x));
}

__ATTRIBUTES float
__pgi_cosd(float x)
{
  return cosf(CNVRTDEGF(x));
}

__ATTRIBUTES float
__pgi_sind(float x)
{
  return sinf(CNVRTDEGF(x));
}

__ATTRIBUTES float
__pgi_tand(float x)
{
  return tanf(CNVRTDEGF(x));
}

__ATTRIBUTES double
__pgi_dacosd(double x)
{
  return CNVRTRADD(acos(x));
}

__ATTRIBUTES double
__pgi_dasind(double x)
{
  return CNVRTRADD(asin(x));
}

__ATTRIBUTES double
__pgi_datan2d(double x, double y)
{
  return CNVRTRADD(atan2(x, y));
}

__ATTRIBUTES double
__pgi_datand(double x)
{
  return CNVRTRADD(atan(x));
}

__ATTRIBUTES double
__pgi_dcosd(double x)
{
  return cos(CNVRTDEGD(x));
}

__ATTRIBUTES double
__pgi_dsind(double x)
{
  return sin(CNVRTDEGD(x));
}

__ATTRIBUTES double
__pgi_dtand(double x)
{
  return tan(CNVRTDEGD(x));
}

__ATTRIBUTES int
__pgi_idim(int x, int y)
{
  x = x - y;
  if (x < 0)
    x = 0;
  return x;
}

__ATTRIBUTES long long
__pgi_kdim(long long x, long long y)
{
  x = x - y;
  if (x < 0)
    x = 0;
  return x;
}

__ATTRIBUTES float
__pgi_fdim(float x, float y)
{
  x = x - y;
  if (x < 0)
    x = 0;
  return x;
}

__ATTRIBUTES double
__pgi_ddim(double x, double y)
{
  x = x - y;
  if (x < 0)
    x = 0;
  return x;
}

__ATTRIBUTES int
__pgi_iffloorx(float x)
{
  int i = x;
  if (x < 0 && i != x)
    i = i - 1;
  return i;
}

__ATTRIBUTES int
__pgi_idfloorx(double x)
{
  int i = x;
  if (x < 0 && i != x)
    i = i - 1;
  return i;
}

__ATTRIBUTES long long
__pgi_kffloorx(float x)
{
  long long i = x;
  if (x < 0 && i != x)
    i = i - 1;
  return i;
}

__ATTRIBUTES long long
__pgi_kdfloorx(double x)
{
  long long i = x;
  if (x < 0 && i != x)
    i = i - 1;
  return i;
}

__ATTRIBUTES int
__pgi_ifceilingx(float x)
{
  int i = x;
  if (x > 0 && i != x)
    i = i + 1;
  return i;
}

__ATTRIBUTES int
__pgi_idceilingx(double x)
{
  int i = x;
  if (x > 0 && i != x)
    i = i + 1;
  return i;
}

__ATTRIBUTES long long
__pgi_kfceilingx(float x)
{
  long long i = x;
  if (x > 0 && i != x)
    i = i + 1;
  return i;
}

__ATTRIBUTES long long
__pgi_kdceilingx(double x)
{
  long long i = x;
  if (x > 0 && i != x)
    i = i + 1;
  return i;
}

__ATTRIBUTES int
__pgi_imodulox(int x, int y)
{
/* For NVBug 3766701, tried a few different formulations.  Last one is slightly best.
 *   int q = x / y;
 *   int r = x - q * y;
 *   if (r != 0 && ((x < 0 && y > 0) || (x > 0 && y < 0)))
 *     r += y;
 *   return r;
 */
/*
 *   int r = x % y;
 *   if (r == 0) return 0;
 *   if ((x ^ y) >= 0) return r;
 *     return r+y;
 */
  int r = x % y;
  if (r == 0)
    ;   
  else if ((x ^ y) < 0) /* signs differ */
    r += y;
  return r;

}

__ATTRIBUTES int
__pgi_jmodulox(short x, short y)
{
  int q = x / y;
  int r = x - q * y;
  if (r != 0 && (x ^ y) < 0) /* signs differ */
    r += y;
  return r;
}

__ATTRIBUTES long long
__pgi_kmodulox(long x, long y)
{
  long long q = x / y;
  long long r = x - q * y;
  if (r != 0 && (x ^ y) < 0) /* signs differ */
    r += y;
  return r;
}

__ATTRIBUTES float
__pgi_fmodulox(float x, float y)
{
  float r = fmodf(x, y);
  if (r != 0 && ((x < 0 && y > 0) || (x > 0 && y < 0)))
    r += y;
  return r;
}

__ATTRIBUTES double
__pgi_dmodulox(double x, double y)
{
  double r = fmod(x, y);
  if (r != 0 && ((x < 0 && y > 0) || (x > 0 && y < 0)))
    r += y;
  return r;
}

__ATTRIBUTES int
__pgi_poppari(int x)
{
  return (__popc(x) & 1);
}

__ATTRIBUTES long long
__pgi_popparul(long long x)
{
  return ((long long) (__popcll(x) & 1));
}

__ATTRIBUTES long long
__pgi_popcntul(long long x)
{
  return ((long long) __popcll(x));
}

__ATTRIBUTES int
__pgi_exponhf(unsigned short f)
{
  int hfval = f & 0xFFFF;
  if ((hfval & 0x7FFF) == 0)
    return 0;
  else
    return ((hfval >> 10) & 0x1F) - 14;
}

__ATTRIBUTES int
__pgi_exponf(float f)
{
  int i;
  i = __float_as_int(f);
  if ((i & ~0x80000000) == 0)
    return 0;
  else
    return ((i >> 23) & 0xff) - 126;
}

__ATTRIBUTES long long
__pgi_kexponf(float f)
{
  int i;
  i = __float_as_int(f);
  if ((i & ~0x80000000) == 0)
    return 0;
  else
    return ((i >> 23) & 0xff) - 126;
}

__ATTRIBUTES int
__pgi_expond(double d)
{
  long long i;
  i = __double_as_longlong(d);
  if ((i & ~0x8000000000000000LL) == 0)
    return 0;
  else
    return ((i >> 52) & 0x7ff) - 1022;
}

__ATTRIBUTES long long
__pgi_kexpond(double d)
{
  long long i;
  i = __double_as_longlong(d);
  if ((i & ~0x8000000000000000LL) == 0)
    return 0;
  else
    return ((i >> 52) & 0x7ff) - 1022;
}

__ATTRIBUTES int
__pgi_kcmp(long long a, long long b)
{
  if (a == b)
    return 0;
  if (a < b)
    return -1;
  return 1;
}

__ATTRIBUTES int
__pgi_kcmpz(long long a)
{
  if (a == 0)
    return 0;
  if (a < 0)
    return -1;
  return 1;
}

__ATTRIBUTES int
__pgi_kucmp(unsigned long long a, unsigned long long b)
{
  if (a == b)
    return 0;
  if (a < b)
    return -1;
  return 1;
}

__ATTRIBUTES int
__pgi_kucmpz(unsigned long long a)
{
  if (a == 0)
    return 0;
  return 1;
}

__ATTRIBUTES float
__pgi_fracf(float f)
{
  if (f != 0) {
    int i;
    i = __float_as_int(f);
    i &= ~0x7f800000;
    i |= 0x3f000000;
    f = __int_as_float(i);
  }
  return f;
}

__ATTRIBUTES double
__pgi_fracd(double f)
{
  if (f != 0) {
    long long i;
    i = __double_as_longlong(f);
    i &= ~0x7ff0000000000000ll;
    i |= 0x3fe0000000000000ll;
    f = __longlong_as_double(i);
  }
  return f;
}

__ATTRIBUTES float
__pgi_nearestf(float f, int sign)
{
  /* sign is nonzero meaning nearest in the positive direction */
  int i;
  if (f == 0) {
    /* smallest positive or negative number */
    i = sign ? 0x00100000 : 0x80100000;
    f = __int_as_float(i);
  } else {
    i = __float_as_int(f);
    if ((i & 0x7f800000) != 0x7f800000) {
      /* not nan or inf */
      if ((f < 0) ^ (sign != 0))
        ++i;
      else
        --i;
      f = __int_as_float(i);
    }
  }
  return f;
}

__ATTRIBUTES double
__pgi_nearestd(double f, int sign)
{
  /* sign is nonzero meaning nearest in the positive direction */
  long long i;
  if (f == 0) {
    /* smallest positive or negative number */
    i = sign ? 0x0010000000000000ll : 0x8010000000000000ll;
    f = __longlong_as_double(i);
  } else {
    i = __double_as_longlong(f);
    if ((i & 0x7ff0000000000000ll) != 0x7ff0000000000000ll) {
      /* not nan or inf */
      if ((f < 0) ^ (sign != 0))
        ++i;
      else
        --i;
      f = __longlong_as_double(i);
    }
  }
  return f;
}

__ATTRIBUTES float
__pgi_rrspacingf(float f)
{
  if (f != 0) {
    int i, j;
    i = __float_as_int(f);
    j = (i & 0x7f800000) ^ 0x7f800000;
    f *= __int_as_float(j);
    if (f < 0)
      f = -f;
    j = (22 + 127) << 23;
    f *= __int_as_float(j);
  }
  return f;
}

__ATTRIBUTES double
__pgi_rrspacingd(double f)
{
  if (f != 0) {
    long long i, j;
    i = __double_as_longlong(f);
    j = (i & 0x7ff0000000000000ll) ^ 0x7ff0000000000000ll;
    f *= __longlong_as_double(j);
    if (f < 0)
      f = -f;
    j = (long long)(51 + 1023) << 52;
    f *= __longlong_as_double(j);
  }
  return f;
}

__ATTRIBUTES float
__pgi_scalef(float f, int i)
{
  int e;
  e = i + 127;
  if (e < 0)
    e = 0;
  else if (e > 255)
    e = 255;
  e = e << 23;
  return f * __int_as_float(e);
}

__ATTRIBUTES double
__pgi_scaled(double f, int i)
{
  long long e;
  e = i + 1023;
  if (e < 0)
    e = 0;
  else if (e > 2047)
    e = 2047;
  e = e << 52;
  return f * __longlong_as_double(e);
}

__ATTRIBUTES float
__pgi_setexpf(float f, int j)
{
  int e;
  if (f != 0) {
    int i;
    i = __float_as_int(f);
    i &= ~0x7f800000;
    i |= 0x3f800000;
    e = j + 126;
    if (e < 0)
      e = 0;
    else if (e > 255)
      e = 255;
    e = e << 23;
    f = __int_as_float(e) * __int_as_float(i);
  }
  return f;
}

__ATTRIBUTES double
__pgi_setexpd(double f, int j)
{
  long long e, i;
  if (f != 0) {
    i = __double_as_longlong(f);
    i &= ~0x7ff0000000000000ll;
    i |= 0x3ff0000000000000ll;
    e = j + 1022;
    if (e < 0)
      e = 0;
    else if (e > 2047)
      e = 2047;
    e = e << 52;
    f = __longlong_as_double(e) * __longlong_as_double(i);
  }
  return f;
}

__ATTRIBUTES float
__pgi_spacingf(float f)
{
  int e, i;
  i = __float_as_int(f);
  e = ((i >> 23) & 0xff) - 23;
  if (e < 1)
    e = 1;
  i = e << 23;
  return __int_as_float(i);
}

#define F90_DESC_MAXDIMS 7

struct F90_DescDim {
  int lbound;
  int extent;
  int sstride;
  int soffset;
  int lstride;
  int ubound;
};

struct F90_DescDim_la {
  long long lbound;
  long long extent;
  long long sstride;
  long long soffset;
  long long lstride;
  long long ubound;
};

typedef struct F90_Desc {
  int tag;
  int rank;
  int kind;
  int len;
  int flags;
  int lsize;
  int gsize;
  int lbase;
  int *gbase;
  int *unused;
  struct F90_DescDim dim[F90_DESC_MAXDIMS];
} F90_Desc;

typedef struct F90_Desc_la {
  long long tag;
  long long rank;
  long long kind;
  long long len;
  long long flags;
  long long lsize;
  long long gsize;
  long long lbase;
  long long *gbase;
  long long *unused;
  struct F90_DescDim_la dim[F90_DESC_MAXDIMS];
} F90_Desc_la;

#define __DESC 35

#define BOGUSFLAG 0x100
#define __TEMPLATE 0x00010000
#define __OFF_TEMPLATE 0x00080000
#define __SECTZBASE 0x00400000
#define __BOGUSBOUNDS 0x00800000
#define __NOT_COPIED 0x01000000
#define __NOREINDEX 0x02000000
#define __SEQUENTIAL_SECTION 0x20000000

__ATTRIBUTES int
pgf90_tsection(F90_Desc *d, int dx, F90_Desc *a, int ax, int lb, int ub, int st,
               int gsize, int flags)
{
  int extent;
  extent = ub - lb + st;
  if (st != 1) {
    if (st == -1) {
      extent = -extent;
    } else {
      extent /= st;
    }
  }
  if (extent < 0)
    extent = 0;
  d->dim[dx].lbound = 1;
  d->dim[dx].ubound = extent;
  d->dim[dx].extent = extent;
  d->dim[dx].sstride = 1;
  d->dim[dx].soffset = 0;
  d->dim[dx].lstride = a->dim[ax].lstride * st;
  d->lbase -= d->dim[dx].lstride;
  if (d->dim[dx].lstride != gsize)
    d->flags &= ~(__SEQUENTIAL_SECTION);
  gsize *= extent;
  return gsize;
}

__ATTRIBUTES long long
pgf90_tsection_la(F90_Desc_la *d, int dx, F90_Desc_la *a, int ax, long long lb,
                  long long ub, long long st, long long gsize, long long flags)
{
  long long extent;
  extent = ub - lb + st;
  if (st != 1) {
    if (st == -1) {
      extent = -extent;
    } else {
      extent /= st;
    }
  }
  if (extent < 0)
    extent = 0;
  d->dim[dx].lbound = 1;
  d->dim[dx].ubound = extent;
  d->dim[dx].extent = extent;
  d->dim[dx].sstride = 1;
  d->dim[dx].soffset = 0;
  d->dim[dx].lstride = a->dim[ax].lstride * st;
  d->lbase -= d->dim[dx].lstride;
  if (d->dim[dx].lstride != gsize)
    d->flags &= ~(__SEQUENTIAL_SECTION);
  gsize *= extent;
  return gsize;
}

__ATTRIBUTES int
pgf90_asection(F90_Desc *d, int dx, F90_Desc *a, int ax, int lb, int ub, int st,
               int gsize, int flags)
{
  int extent, noreindex, myoffset;
  extent = ub - lb + st;
  noreindex = flags & __NOREINDEX;
  if (st != 1) {
    if (st == -1) {
      extent = -extent;
    } else {
      extent /= st;
    }
  }
  if (extent < 0)
    extent = 0;
  if (noreindex && st == 1) {
    d->dim[dx].lbound = 1;
    d->dim[dx].ubound = (extent == 0 ? lb - 1 : ub);
    d->dim[dx].extent = d->dim[dx].ubound;
    myoffset = 0;
  } else {
    d->dim[dx].lbound = 1;
    d->dim[dx].ubound = extent;
    d->dim[dx].extent = extent;
    myoffset = lb - st;
  }
  d->dim[dx].sstride = 1;
  d->dim[dx].soffset = 0;
  d->dim[dx].lstride = a->dim[ax].lstride * st;
  d->lbase += myoffset * d->dim[dx].lstride;
  if (d->dim[dx].lstride != gsize)
    d->flags &= ~(__SEQUENTIAL_SECTION);
  gsize *= extent;
  return gsize;
}

__ATTRIBUTES long long
pgf90_asection_la(F90_Desc_la *d, int dx, F90_Desc_la *a, int ax, long long lb,
                  long long ub, long long st, long long gsize, long long flags)
{
  long long extent, myoffset;
  int noreindex;
  extent = ub - lb + st;
  noreindex = flags & __NOREINDEX;
  if (st != 1) {
    if (st == -1) {
      extent = -extent;
    } else {
      extent /= st;
    }
  }
  if (extent < 0)
    extent = 0;
  if (noreindex && st == 1) {
    d->dim[dx].lbound = 1;
    d->dim[dx].ubound = (extent == 0 ? lb - 1 : ub);
    d->dim[dx].extent = d->dim[dx].ubound;
    myoffset = 0;
  } else {
    d->dim[dx].lbound = 1;
    d->dim[dx].ubound = extent;
    d->dim[dx].extent = extent;
    myoffset = lb - st;
  }
  d->dim[dx].sstride = 1;
  d->dim[dx].soffset = 0;
  d->dim[dx].lstride = a->dim[ax].lstride * st;
  d->lbase += myoffset * d->dim[dx].lstride;
  if (d->dim[dx].lstride != gsize)
    d->flags &= ~(__SEQUENTIAL_SECTION);
  gsize *= extent;
  return gsize;
}

__ATTRIBUTES void
pgf90_init_section(F90_Desc *d, int rank, F90_Desc *a)
{
  d->tag = __DESC;
  d->rank = rank;
  d->kind = a->kind;
  d->len = a->len;
  d->flags = a->flags;
  d->gsize = a->gsize;
  d->lsize = a->lsize;
  d->gbase = a->gbase;
  d->lbase = a->lbase;
}

__ATTRIBUTES void
pgf90_init_section_la(struct F90_Desc_la *d, int rank, struct F90_Desc_la *a)
{
  d->tag = __DESC;
  d->rank = rank;
  d->kind = a->kind;
  d->len = a->len;
  d->flags = a->flags;
  d->gsize = a->gsize;
  d->lsize = a->lsize;
  d->gbase = a->gbase;
  d->lbase = a->lbase;
}

__ATTRIBUTES void
pgf90_init_descriptor(F90_Desc *d, int rank, int kind, int len, int flags)
{
  d->tag = __DESC;
  d->rank = rank;
  d->kind = kind;
  d->len = len;
  d->flags = flags | __TEMPLATE | __SEQUENTIAL_SECTION;
  d->flags &= ~(__NOT_COPIED | __OFF_TEMPLATE);
  d->gsize = 0;
  d->lsize = 0;
  d->gbase = 0;
  d->lbase = 1;
}

__ATTRIBUTES void
pgf90_init_descriptor_la(F90_Desc_la *d, long long rank, long long kind, long long len, long long flags)
{
  d->tag = __DESC;
  d->rank = rank;
  d->kind = kind;
  d->len = len;
  d->flags = flags | __TEMPLATE | __SEQUENTIAL_SECTION;
  d->flags &= ~(__NOT_COPIED | __OFF_TEMPLATE);
  d->gsize = 0;
  d->lsize = 0;
  d->gbase = 0;
  d->lbase = 1;
}

__ATTRIBUTES void
pgf90_fort_set_single(F90_Desc *d, F90_Desc *a, int dim, int idx)
{
  int k;
  struct F90_DescDim *ad = &((a)->dim[dim]);
  k = (ad->lstride) * (idx - (ad->lbound));
  d->lbase = d->lbase + k + ((ad->lstride) * (ad->lbound));
}

__ATTRIBUTES void
pgf90_fort_set_single_la(F90_Desc_la *d, F90_Desc_la *a, int dim, long long idx)
{
  long long k;
  struct F90_DescDim_la *ad = &((a)->dim[dim]);
  k = (ad->lstride) * (idx - (ad->lbound));
  d->lbase = d->lbase + k + ((ad->lstride) * (ad->lbound));
}

__ATTRIBUTES int
pgf90_bogus_sect(F90_Desc *d, F90_Desc *a, int flags, int *lower, int *upper,
                 int *stride)
{
  int gsize, ax, dx;
  d->flags |= __BOGUSBOUNDS;
  gsize = 1;
  dx = 0;
  for (ax = 0; ax < a->rank; ++ax) {
    if (flags & (1 << ax)) {
      d->dim[dx].lbound = lower[dx];
      d->dim[dx].ubound = upper[dx];
      d->dim[dx].extent = upper[dx] - lower[dx] + 1;
      d->dim[dx].lstride = stride[dx];
      if (d->dim[dx].lstride != gsize)
        d->flags &= ~(__SEQUENTIAL_SECTION);
      gsize *= d->dim[dx].extent;
      ++dx;
    }
  }
  return gsize;
}

__ATTRIBUTES long long
pgf90_bogus_sect_la(F90_Desc_la *d, F90_Desc_la *a, long long flags, long long *lower, 
                 long long *upper, long long *stride)
{
  long long gsize;
  int ax, dx;
  d->flags |= __BOGUSBOUNDS;
  gsize = 1;
  dx = 0;
  for (ax = 0; ax < a->rank; ++ax) {
    if (flags & (1 << ax)) {
      d->dim[dx].lbound = lower[dx];
      d->dim[dx].ubound = upper[dx];
      d->dim[dx].extent = upper[dx] - lower[dx] + 1;
      d->dim[dx].lstride = stride[dx];
      if (d->dim[dx].lstride != gsize)
        d->flags &= ~(__SEQUENTIAL_SECTION);
      gsize *= d->dim[dx].extent;
      ++dx;
    }
  }
  return gsize;
}

__ATTRIBUTES void
pgf90_sect1v(void *pd, void *pa, void *prank /*ignored*/, int lw0, int up0,
             int st0, int flags)
{
  int gsize, dx, rank;
  F90_Desc *d = (F90_Desc *)pd, *a = (F90_Desc *)pa;
  rank = (flags & 0x1);
  pgf90_init_section(d, rank, a);
  gsize = 1;
  dx = 0;
  if (flags & BOGUSFLAG) {
    int lower[1], upper[1], stride[1];
    lower[0] = lw0;
    upper[0] = up0;
    stride[0] = st0;
    gsize = pgf90_bogus_sect(d, a, flags, lower, upper, stride);
  } else {
    int ax = 0;
    if (flags & __SECTZBASE) {
      d->lbase = 1;
      if (flags & (1 << ax)) {
        gsize = pgf90_tsection(d, dx, a, ax, lw0, up0, st0, gsize, flags);
        ++dx;
      }
    } else {
      if (flags & (1 << ax)) {
        gsize = pgf90_asection(d, dx, a, ax, lw0, up0, st0, gsize, flags);
        ++dx;
      }
    }
  }
  d->gsize = gsize;
  d->lsize = gsize;
}

__ATTRIBUTES void
pgf90_sect1(void *d, void *a, void *prank, void *lw0, void *up0, void *st0,
            void *flags)
{
  pgf90_sect1v(d, a, prank, *(int *)lw0, *(int *)up0, *(int *)st0,
               *(int *)flags);
}

__ATTRIBUTES void
pgf90_sect_alldim_i8(void *pd, void *pa, long long *lower, long long *upper,
                     long long *stride, long long flags, int rank, int wrank)
{
  long long gsize;
  int ax, dx;
  F90_Desc_la *d = (F90_Desc_la *)pd, *a = (F90_Desc_la *)pa;
  gsize = 1;
  pgf90_init_section_la(d, rank, a);

  if (flags & BOGUSFLAG) {
    gsize = pgf90_bogus_sect_la(d, a, flags, lower, upper, stride);
  } else {
    dx = 0;
    if (flags & __SECTZBASE) {
      d->lbase = 1;
      for (ax = 0; ax < wrank; ++ax) {
        if (flags & (1 << ax)) {
          gsize = pgf90_tsection_la(d, dx, a, ax, lower[ax], upper[ax], stride[ax],
                                    gsize, flags);
          ++dx;
        }
      }
    } else {
      for (ax = 0; ax < wrank; ++ax) {
        if (flags & (1 << ax)) {
          gsize = pgf90_asection_la(d, dx, a, ax, lower[ax], upper[ax], stride[ax],
                                    gsize, flags);
          ++dx;
        } else {
          pgf90_fort_set_single_la(d, a, ax, lower[ax]);
        }
      }
    }
  }
}

__ATTRIBUTES void
pgf90_sect1v_i8(void *d, void *a, void *prank /*ignored*/, long long lw0,
             long long up0, long long st0, long long flags)
{
  long long lower = lw0;
  long long upper = up0;
  long long stride = st0;
  int rank = (flags & 0x1);
  pgf90_sect_alldim_i8(d, a, &lower, &upper, &stride, flags, rank, 1);
}

__ATTRIBUTES void
pgf90_sect1_i8(void *d, void *a, void *prank, void *lw0, void *up0, void *st0,
            void *pflags)
{
  long long flags = *((long long *)pflags);
  int rank = (flags & 0x1);
  pgf90_sect_alldim_i8(d, a, (long long *)lw0, (long long *)up0, (long long *)st0,
                       flags, rank, 1);
}

__ATTRIBUTES void
pgf90_sect2v(void *pd, void *pa, void *prank /*ignored*/, int lw0, int up0,
             int st0, int lw1, int up1, int st1, int flags)
{
  int gsize, dx, rank;
  F90_Desc *d = (F90_Desc *)pd, *a = (F90_Desc *)pa;
  rank = (flags & 0x1) + (flags & 0x2);
  pgf90_init_section(d, rank, a);
  gsize = 1;
  dx = 0;
  if (flags & BOGUSFLAG) {
    int lower[2], upper[2], stride[2];
    lower[0] = lw0;
    upper[0] = up0;
    stride[0] = st0;
    lower[1] = lw1;
    upper[1] = up1;
    stride[1] = st1;
    gsize = pgf90_bogus_sect(d, a, flags, lower, upper, stride);
  } else {
    int ax = 0;
    if (flags & __SECTZBASE) {
      d->lbase = 1;
      if (flags & (1 << ax)) {
        gsize = pgf90_tsection(d, dx, a, ax, lw0, up0, st0, gsize, flags);
        ++dx;
      }
      ++ax;
      if (flags & (1 << ax)) {
        gsize = pgf90_tsection(d, dx, a, ax, lw1, up1, st1, gsize, flags);
        ++dx;
      }
    } else {
      if (flags & (1 << ax)) {
        gsize = pgf90_asection(d, 0, a, 0, lw0, up0, st0, gsize, flags);
        ++dx;
      }
      ++ax;
      if (flags & (1 << ax)) {
        gsize = pgf90_asection(d, dx, a, ax, lw1, up1, st1, gsize, flags);
        ++dx;
      }
    }
  }
  d->gsize = gsize;
  d->lsize = gsize;
}

__ATTRIBUTES void
pgf90_sect2(void *d, void *a, void *prank /*ignored*/, void *lw0, void *up0,
            void *st0, void *lw1, void *up1, void *st1, void *flags)
{
  pgf90_sect2v(d, a, prank, *(int *)lw0, *(int *)up0, *(int *)st0, *(int *)lw1,
               *(int *)up1, *(int *)st1, *(int *)flags);
}

__ATTRIBUTES void
pgf90_sect2v_i8(void *d, void *a, void *prank /*ignored*/, long long lw0, long long up0,
             long long st0, long long lw1, long long up1, long long st1, long long flags)
{
  long long lower[2], upper[2], stride[2];
  int rank = (flags & 0x1) + (flags & 0x2);
  lower[0] = lw0;
  upper[0] = up0;
  stride[0] = st0;
  lower[1] = lw1;
  upper[1] = up1;
  stride[1] = st1;
  pgf90_sect_alldim_i8(d, a, lower, upper, stride, flags, rank, 2);
}

__ATTRIBUTES void
pgf90_sect2_i8(void *d, void *a, void *prank /*ignored*/, void *lw0, void *up0,
            void *st0, void *lw1, void *up1, void *st1, void *pflags)
{
  long long lower[2], upper[2], stride[2];
  long long flags = *((long long *)pflags);
  int rank = (flags & 0x1) + (flags & 0x2);
  lower[0] = *(long long *)lw0;
  upper[0] = *(long long *)up0;
  stride[0] = *(long long *)st0;
  lower[1] = *(long long *)lw1;
  upper[1] = *(long long *)up1;
  stride[1] = *(long long *)st1;
  pgf90_sect_alldim_i8(d, a, lower, upper, stride, flags, rank, 2);
}

__ATTRIBUTES void
pgf90_sect3v(void *pd, void *pa, void *prank /*ignored*/, int lw0, int up0,
             int st0, int lw1, int up1, int st1, int lw2, int up2, int st2,
             int flags)
{
  int gsize, dx, rank;
  F90_Desc *d = (F90_Desc *)pd, *a = (F90_Desc *)pa;
  rank = (flags & 0x5) + ((flags >> 1) & 0x1);
  rank = (rank & 0x3) + ((rank >> 2) & 0x1);
  pgf90_init_section(d, rank, a);
  gsize = 1;
  dx = 0;
  if (flags & BOGUSFLAG) {
    int lower[3], upper[3], stride[3];
    lower[0] = lw0;
    upper[0] = up0;
    stride[0] = st0;
    lower[1] = lw1;
    upper[1] = up1;
    stride[1] = st1;
    lower[2] = lw2;
    upper[2] = up2;
    stride[2] = st2;
    gsize = pgf90_bogus_sect(d, a, flags, lower, upper, stride);
  } else {
    int ax = 0;
    if (flags & __SECTZBASE) {
      d->lbase = 1;
      if (flags & (1 << ax)) {
        gsize = pgf90_tsection(d, dx, a, ax, lw0, up0, st0, gsize, flags);
        ++dx;
      }
      ++ax;
      if (flags & (1 << ax)) {
        gsize = pgf90_tsection(d, dx, a, ax, lw1, up1, st1, gsize, flags);
        ++dx;
      }
      ++ax;
      if (flags & (1 << ax)) {
        gsize = pgf90_tsection(d, dx, a, ax, lw2, up2, st2, gsize, flags);
        ++dx;
      }
    } else {
      if (flags & (1 << ax)) {
        gsize = pgf90_asection(d, 0, a, 0, lw0, up0, st0, gsize, flags);
        ++dx;
      }
      ++ax;
      if (flags & (1 << ax)) {
        gsize = pgf90_asection(d, dx, a, ax, lw1, up1, st1, gsize, flags);
        ++dx;
      }
      ++ax;
      if (flags & (1 << ax)) {
        gsize = pgf90_asection(d, dx, a, ax, lw2, up2, st2, gsize, flags);
        ++dx;
      }
    }
  }
  d->gsize = gsize;
  d->lsize = gsize;
}

__ATTRIBUTES void
pgf90_sect3(void *d, void *a, void *prank /*ignored*/, void *lw0, void *up0,
            void *st0, void *lw1, void *up1, void *st1, void *lw2, void *up2,
            void *st2, void *flags)
{
  pgf90_sect3v(d, a, prank, *(int *)lw0, *(int *)up0, *(int *)st0, *(int *)lw1,
               *(int *)up1, *(int *)st1, *(int *)lw2, *(int *)up2, *(int *)st2,
               *(int *)flags);
}

__ATTRIBUTES void
pgf90_sect3v_i8(void *d, void *a, void *prank /*ignored*/, long long lw0, long long up0,
                long long st0, long long lw1, long long up1, long long st1, long long lw2,
                long long up2, long long st2, long long flags)
{
  long long lower[3], upper[3], stride[3];
  int rank;
  rank = (flags & 0x5) + ((flags >> 1) & 0x1);
  rank = (rank & 0x3) + ((rank >> 2) & 0x1);
  lower[0] = lw0;
  upper[0] = up0;
  stride[0] = st0;
  lower[1] = lw1;
  upper[1] = up1;
  stride[1] = st1;
  lower[2] = lw2;
  upper[2] = up2;
  stride[2] = st2;
  pgf90_sect_alldim_i8(d, a, lower, upper, stride, flags, rank, 3);
}

__ATTRIBUTES void
pgf90_sect3_i8(void *d, void *a, void *prank /*ignored*/, void *lw0, void *up0,
               void *st0, void *lw1, void *up1, void *st1, void *lw2, void *up2,
               void *st2, void *pflags)
{
  long long lower[3], upper[3], stride[3];
  long long flags = *((long long *)pflags);
  int rank;
  rank = (flags & 0x5) + ((flags >> 1) & 0x1);
  rank = (rank & 0x3) + ((rank >> 2) & 0x1);
  lower[0] = *(long long *)lw0;
  upper[0] = *(long long *)up0;
  stride[0] = *(long long *)st0;
  lower[1] = *(long long *)lw1;
  upper[1] = *(long long *)up1;
  stride[1] = *(long long *)st1;
  lower[2] = *(long long *)lw2;
  upper[2] = *(long long *)up2;
  stride[2] = *(long long *)st2;
  pgf90_sect_alldim_i8(d, a, lower, upper, stride, flags, rank, 3);
}

__ATTRIBUTES int
pgf90_templatexx(F90_Desc *d, int i, int lb, int ub, int *lbase, int gsize)
{
  int extent;
  extent = ub - lb + 1;
  if (extent < 0) {
    extent = 0;
    ub = lb - 1;
  }
  d->dim[i].lbound = lb;
  d->dim[i].ubound = ub;
  d->dim[i].extent = extent;
  d->dim[i].sstride = 1;
  d->dim[i].soffset = 0;
  d->dim[i].lstride = gsize;
  *lbase -= gsize * lb;
  gsize *= extent;
  return gsize;
}

__ATTRIBUTES void
pgf90_template1v(void *pd, int flags, int kind, int len, int l1, int u1)
{
  int gsize, lbase;
  F90_Desc *d = (F90_Desc *)pd;
  pgf90_init_descriptor(d, 1, kind, len, flags);
  gsize = lbase = 1;
  gsize = pgf90_templatexx(d, 0, l1, u1, &lbase, gsize);
  d->lbase = lbase;
  d->gsize = gsize;
  d->lsize = gsize;
}

__ATTRIBUTES void
pgf90_template1(void *d, void *flags, void *kind, void *len, void *l1, void *u1)
{
  pgf90_template1v(d, *(int *)flags, *(int *)kind, *(int *)len, *(int *)l1,
                   *(int *)u1);
}

__ATTRIBUTES int
pgf90_templatexx_la(F90_Desc_la *d, long long i, long long lb, long long ub, long long *lbase, long long gsize)
{
  int extent;
  extent = ub - lb + 1;
  if (extent < 0) {
    extent = 0;
    ub = lb - 1;
  }
  d->dim[i].lbound = lb;
  d->dim[i].ubound = ub;
  d->dim[i].extent = extent;
  d->dim[i].sstride = 1;
  d->dim[i].soffset = 0;
  d->dim[i].lstride = gsize;
  *lbase -= gsize * lb;
  gsize *= extent;
  return gsize;
}

__ATTRIBUTES void
pgf90_template1v_i8(void *pd, long long flags, long long kind, long long len, long long l1, long long u1)
{
  long long gsize, lbase;
  F90_Desc_la *d = (F90_Desc_la *)pd;
  pgf90_init_descriptor_la(d, 1, kind, len, flags);
  gsize = lbase = 1;
  gsize = pgf90_templatexx_la(d, 0, l1, u1, &lbase, gsize);
  d->lbase = lbase;
  d->gsize = gsize;
  d->lsize = gsize;
}

__ATTRIBUTES void
pgf90_template1_i8(void *d, void *flags, void *kind, void *len, void *l1, void *u1)
{
  pgf90_template1v_i8(d, *(long long *)flags, *(long long *)kind, *(long long *)len, *(long long *)l1,
                      *(long long *)u1);
}

__ATTRIBUTES void
pgf90_template2v(void *pd, int flags, int kind, int len, int l1, int u1, int l2,
                 int u2)
{
  int gsize, lbase;
  F90_Desc *d = (F90_Desc *)pd;
  pgf90_init_descriptor(d, 2, kind, len, flags);
  gsize = lbase = 1;
  gsize = pgf90_templatexx(d, 0, l1, u1, &lbase, gsize);
  gsize = pgf90_templatexx(d, 1, l2, u2, &lbase, gsize);
  d->lbase = lbase;
  d->gsize = gsize;
  d->lsize = gsize;
}

__ATTRIBUTES void
pgf90_template2(void *d, void *flags, void *kind, void *len, void *l1, void *u1,
                void *l2, void *u2)
{
  pgf90_template2v(d, *(int *)flags, *(int *)kind, *(int *)len, *(int *)l1,
                   *(int *)u1, *(int *)l2, *(int *)u2);
}

__ATTRIBUTES void
pgf90_template2v_i8(void *pd, long long flags, long long kind, long long len, long long l1, long long u1,
	                long long l2, long long u2)
{
  long long gsize, lbase;
  F90_Desc_la *d = (F90_Desc_la *)pd;
  pgf90_init_descriptor_la(d, 2, kind, len, flags);
  gsize = lbase = 1;
  gsize = pgf90_templatexx_la(d, 0, l1, u1, &lbase, gsize);
  gsize = pgf90_templatexx_la(d, 1, l2, u2, &lbase, gsize);
  d->lbase = lbase;
  d->gsize = gsize;
  d->lsize = gsize;
}

__ATTRIBUTES void
pgf90_template2_i8(void *d, void *flags, void *kind, void *len, void *l1, void *u1,
                void *l2, void *u2)
{
  pgf90_template2v_i8(d, *(long long *)flags, *(long long *)kind, *(long long *)len, *(long long *)l1,
                   *(long long *)u1, *(long long *)l2, *(long long *)u2);
}

__ATTRIBUTES void
pgf90_template3v(void *pd, int flags, int kind, int len, int l1, int u1, int l2,
                 int u2, int l3, int u3)
{
  int gsize, lbase;
  F90_Desc *d = (F90_Desc *)pd;
  pgf90_init_descriptor(d, 3, kind, len, flags);
  gsize = lbase = 1;
  gsize = pgf90_templatexx(d, 0, l1, u1, &lbase, gsize);
  gsize = pgf90_templatexx(d, 1, l2, u2, &lbase, gsize);
  gsize = pgf90_templatexx(d, 2, l3, u3, &lbase, gsize);
  d->lbase = lbase;
  d->gsize = gsize;
  d->lsize = gsize;
}

__ATTRIBUTES void
pgf90_template3(void *d, void *flags, void *kind, void *len, void *l1, void *u1,
                void *l2, void *u2, void *l3, void *u3)
{
  pgf90_template3v(d, *(int *)flags, *(int *)kind, *(int *)len, *(int *)l1,
                   *(int *)u1, *(int *)l2, *(int *)u2, *(int *)l3, *(int *)u3);
}

__ATTRIBUTES void
pgf90_template3v_i8(void *pd, long long flags, long long kind, long long len, long long l1, long long u1,
	                long long l2, long long u2, long long l3, long long u3)
{
  long long gsize, lbase;
  F90_Desc_la *d = (F90_Desc_la *)pd;
  pgf90_init_descriptor_la(d, 3, kind, len, flags);
  gsize = lbase = 1;
  gsize = pgf90_templatexx_la(d, 0, l1, u1, &lbase, gsize);
  gsize = pgf90_templatexx_la(d, 1, l2, u2, &lbase, gsize);
  gsize = pgf90_templatexx_la(d, 2, l3, u3, &lbase, gsize);
  d->lbase = lbase;
  d->gsize = gsize;
  d->lsize = gsize;
}

__ATTRIBUTES void
pgf90_template3_i8(void *d, void *flags, void *kind, void *len, void *l1, void *u1,
                void *l2, void *u2, void *l3, void *u3)
{
  pgf90_template3v_i8(d, *(long long *)flags, *(long long *)kind, *(long long *)len, *(long long *)l1,
                   *(long long *)u1, *(long long *)l2, *(long long *)u2, *(long long *)l3, *(long long *)u3);
}

__ATTRIBUTES double
__pgi_spacingd(double f)
{
  long long e, i;
  i = __double_as_longlong(f);
  e = ((i >> 52) & 0x7ffll) - 52;
  if (e < 1)
    e = 1;
  i = e << 52;
  return __longlong_as_double(i);
}

#if !defined(PGI_COMPILE_BITCODE)
__ATTRIBUTES int
__pgi_atomicAddi(void *address, int val)
{
  return atomicAdd((int *)address, val);
}

__ATTRIBUTES unsigned int
__pgi_atomicAddu(void *address, unsigned int val)
{
  return atomicAdd((unsigned int *)address, val);
}

__ATTRIBUTES float
__pgi_atomicAddf(void *address, float val)
{
  return atomicAdd((float *)address, val);
}

__ATTRIBUTES unsigned long long
__pgi_atomicAddul(void *address, unsigned long long val)
{
  return atomicAdd((unsigned long long *)address, val);
}

__ATTRIBUTES long long
__pgi_atomicAddil(void *address, long long val)
{
  return (long long)__pgi_atomicAddul(address, (unsigned long long)val);
}

__ATTRIBUTES double
__pgi_atomicAddd(void *address, double val)
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
  return atomicAdd((double *)address, val);
#else /* !(defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) */
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed,
                    __double_as_longlong(val + __longlong_as_double(assumed)));
  } while (assumed != old);
  return __longlong_as_double(old);
#endif /* !(defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) */
}

__ATTRIBUTES int
__pgi_atomicSubi(void *address, int val)
{
  return atomicSub((int *)address, val);
}

__ATTRIBUTES int
__pgi_atomicSubir(void *address, int val)
{
  int *address_as_int = (int *)address;
  int old = *address_as_int;
  int assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_int, assumed, (val - assumed));
  } while (assumed != old);
  return old;
}

__ATTRIBUTES unsigned int
__pgi_atomicSubu(void *address, unsigned int val)
{
  return atomicSub((unsigned int *)address, val);
}

__ATTRIBUTES unsigned int
__pgi_atomicSubur(void *address, unsigned int val)
{
  unsigned int *address_as_uint = (unsigned int *)address;
  unsigned int old = *address_as_uint;
  unsigned int assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_uint, assumed,
                    (unsigned int)(int)(val - assumed));
  } while (assumed != old);
  return old;
}

__ATTRIBUTES void
__pgi_atomicLoadi(void *to_address, void *from_address)
{
  int *int_to_address = (int *)to_address;
  int r;
  r = atomicAdd((int *)from_address, 0);
  *int_to_address = r;
}

__ATTRIBUTES float
__pgi_atomicSubf(void *address, float val)
{
  return atomicAdd((float *)address, -val);
}

__ATTRIBUTES float
__pgi_atomicSubfr(void *address, float val)
{
  int *address_as_int = (int *)address;
  int old = *address_as_int;
  int assumed;

  do {
    assumed = old;
    old = atomicCAS(
        address_as_int, assumed,
        __float_as_int( val - __int_as_float(assumed) ));
  } while (assumed != old);
  return __int_as_float(old);
}

__ATTRIBUTES void
__pgi_atomicLoadf(void *to_address, void *from_address)
{
  float *float_to_address = (float *)to_address;
  float r;
  r = atomicAdd((float *)from_address, 0.0f);
  *float_to_address = r;
}

__ATTRIBUTES unsigned long long
__pgi_atomicSubul(void *address, unsigned long long val)
{
  long long int nval = (long long int)val;
  nval = -nval;
  return atomicAdd((unsigned long long *)address,
                   *((unsigned long long int *)&nval));
}

__ATTRIBUTES unsigned long long
__pgi_atomicSubulr(void *address, unsigned long long val)
{
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull;
  unsigned long long int assumed;

  do {
    assumed = old;
    old = atomicCAS(
        address_as_ull, (unsigned long long int)assumed,
        (unsigned long long int)(long long int)(assumed - val));
  } while (assumed != old);
  return old;
}

__ATTRIBUTES long long
__pgi_atomicSubil(void *address, long long val)
{
  long long int nval = (long long int)val;
  nval = -nval;
  return __pgi_atomicAddil(address, nval);
}

__ATTRIBUTES long long
__pgi_atomicSubilr(void *address, long long val)
{
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  long long int old = *((long long int *)address_as_ull);
  long long int assumed;

  do {
    assumed = old;
    old = atomicCAS(
        address_as_ull, (unsigned long long int)assumed,
        (unsigned long long int)(long long int)(val - assumed));
  } while (assumed != old);
  return old;
}

__ATTRIBUTES void
__pgi_atomicLoadl(void *to_address, void *from_address)
{
  unsigned long long *ll_to_address = (unsigned long long *)to_address;
  unsigned long long r;
  r = atomicAdd((unsigned long long *)from_address, 0);
  *ll_to_address = r;
}

__ATTRIBUTES double
__pgi_atomicSubd(void *address, double val)
{
  return __pgi_atomicAddd(address, -val);
}

__ATTRIBUTES void
__pgi_atomicLoadd(void *to_address, void *from_address)
{
  double *double_to_address = (double *)to_address;
  double r;
  r = __pgi_atomicAddd(from_address, 0.0);
  *double_to_address = r;
}


__ATTRIBUTES int
__pgi_atomicExchi(void *address, int val)
{
  return atomicExch((int *)address, val);
}

__ATTRIBUTES unsigned int
__pgi_atomicExchu(void *address, unsigned int val)
{
  return atomicExch((unsigned int *)address, val);
}

__ATTRIBUTES float
__pgi_atomicExchf(void *address, float val)
{
  return atomicExch((float *)address, val);
}

__ATTRIBUTES  long long
__pgi_atomicExchil(void *address, long long val)
{
  return atomicExch((unsigned long long *)address, (unsigned long long)val);
}
__ATTRIBUTES unsigned long long
__pgi_atomicExchul(void *address, unsigned long long val)
{
  return (long long) atomicExch((unsigned long long *)address, val);
}

__ATTRIBUTES double
__pgi_atomicExchd(void *address, double val)
{
  unsigned long long int old;
  old = atomicExch((unsigned long long *)address, __double_as_longlong(val));
  return __longlong_as_double(old);
}

__ATTRIBUTES int
__pgi_atomicMini(void *address, int val)
{
  return atomicMin((int *)address, val);
}

__ATTRIBUTES unsigned int
__pgi_atomicMinu(void *address, unsigned int val)
{
  return atomicMin((unsigned int *)address, val);
}

__ATTRIBUTES float
__pgi_atomicMinf(void *address, float val)
{
  int *address_as_int = (int *)address;
  int old = *address_as_int, assumed;
  if (val < __int_as_float(old)) {
    do {
      assumed = old;
      old = atomicCAS(address_as_int, assumed, __float_as_int(val));
    } while ((assumed != old) && (val < __int_as_float(old)));
  }
  return __int_as_float(old);
}

__ATTRIBUTES unsigned long long
__pgi_atomicMinul(void *address, unsigned long long val)
{
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull, assumed;
  if (val < old) {
    do {
      assumed = old;
      old = atomicCAS(address_as_ull, assumed, val);
    } while ((assumed != old) && (val < old));
  }
  return old;
}

__ATTRIBUTES long long
__pgi_atomicMinil(void *address, long long val)
{
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  long long int old = *address_as_ull, assumed;
  if (val < old) {
    do {
      assumed = old;
      old = atomicCAS(address_as_ull, (unsigned long long)assumed,
                      (unsigned long long)val);
    } while ((assumed != old) && (val < old));
  }
  return old;
}

__ATTRIBUTES unsigned long long
__pgi_atomicMulul(void *address, unsigned long long val)
{
  unsigned long long *address_as_ull = (unsigned long long *)address;
  unsigned long long old = *address_as_ull, assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed, assumed * val);
  } while (assumed != old);
  return old;
}

__ATTRIBUTES long long
__pgi_atomicMulil(void *address, long long val)
{
  unsigned long long *address_as_ull = (unsigned long long *)address;
  unsigned long long old = *address_as_ull, assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed, (unsigned long long)((long long)assumed * val) );
  } while (assumed != old);
  return (long long)old;
}

__ATTRIBUTES unsigned long long
__pgi_atomicAndul(void *address, unsigned long long val)
{
  unsigned long long *address_as_ull = (unsigned long long *)address;
  unsigned long long old = *address_as_ull, assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed, assumed & val);
  } while (assumed != old);
  return old;
}

__ATTRIBUTES long long
__pgi_atomicAndil(void *address, long long val)
{
  unsigned long long *address_as_ull = (unsigned long long *)address;
  long long old = *(long long *)address_as_ull, assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_ull, (unsigned long long)assumed, assumed & val);
  } while (assumed != old);
  return old;
}

__ATTRIBUTES unsigned long long
__pgi_atomicOrul(void *address, unsigned long long val)
{
  unsigned long long *address_as_ull = (unsigned long long *)address;
  unsigned long long old = *address_as_ull, assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed, assumed | val);
  } while (assumed != old);
  return old;
}

__ATTRIBUTES long long
__pgi_atomicOril(void *address, long long val)
{
  unsigned long long *address_as_ull = (unsigned long long *)address;
  long long old = *(long long *)address_as_ull, assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_ull, (unsigned long long)assumed, assumed | val);
  } while (assumed != old);
  return old;
}

__ATTRIBUTES unsigned long long
__pgi_atomicXorul(void *address, unsigned long long val)
{
  unsigned long long *address_as_ull = (unsigned long long *)address;
  unsigned long long old = *address_as_ull, assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed, assumed ^ val);
  } while (assumed != old);
  return old;
}

__ATTRIBUTES long long
__pgi_atomicXoril(void *address, long long val)
{
  unsigned long long *address_as_ull = (unsigned long long *)address;
  long long old = *(long long *)address_as_ull, assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_ull, (unsigned long long)assumed, assumed ^ val);
  } while (assumed != old);
  return old;
}

__ATTRIBUTES unsigned long long
__pgi_atomicDivul(void *address, unsigned long long val)
{
  unsigned long long *address_as_ull = (unsigned long long *)address;
  unsigned long long old = *address_as_ull, assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed, assumed / val);
  } while (assumed != old);
  return old;
}
__ATTRIBUTES long long
__pgi_atomicDivil(void *address, long long val)
{
  unsigned long long *address_as_ull = (unsigned long long *)address;
  unsigned long long old = *address_as_ull, assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed, (unsigned long long)((long long)assumed / val) );
  } while (assumed != old);
  return (long long)old;
}

__ATTRIBUTES double
__pgi_atomicMind(void *address, double val)
{
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull, assumed;
  if (val < __longlong_as_double(old)) {
    do {
      assumed = old;
      old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val));
    } while ((assumed != old) && (val < __longlong_as_double(old)));
  }
  return __longlong_as_double(old);
}

__ATTRIBUTES int
__pgi_atomicMuli(void *address, int val)
{
  int *address_as_int = (int *)address;
  int old = *address_as_int, assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_int, assumed, assumed * val);
  } while (assumed != old);
  return old;
}

__ATTRIBUTES unsigned int
__pgi_atomicMulu(void *address, unsigned int val)
{
  unsigned int *address_as_int = (unsigned int *)address;
  unsigned int old = *address_as_int, assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_int, assumed, assumed * val);
  } while (assumed != old);
  return old;
}

__ATTRIBUTES float
__pgi_atomicMulf(void *address, float val)
{
  int *address_as_int = (int *)address;
  int old = *address_as_int, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_int, assumed,
                    __float_as_int(__int_as_float(assumed) * val));
  } while (assumed != old);
  return __int_as_float(old);
}

__ATTRIBUTES double
__pgi_atomicMuld(void *address, double val)
{
  unsigned long long *address_as_ull = (unsigned long long *)address;
  unsigned long long old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed,
                    __double_as_longlong(__longlong_as_double(assumed) * val));
  } while (assumed != old);
  return __longlong_as_double(old);
}

__ATTRIBUTES unsigned int
__pgi_atomicRShiftu(void *address, unsigned int val)
{
  unsigned int *address_as_int = (unsigned int *)address;
  unsigned int old = *address_as_int, assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_int, assumed, assumed >> val);
  } while (assumed != old);
  return old;
}

__ATTRIBUTES unsigned int
__pgi_atomicRShiftur(void *address, unsigned int val)
{
  unsigned int *address_as_int = (unsigned int *)address;
  unsigned int old = *address_as_int, assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_int, assumed, val >> assumed);
  } while (assumed != old);
  return old;
}

__ATTRIBUTES unsigned int
__pgi_atomicLShiftu(void *address, unsigned int val)
{
  unsigned int *address_as_int = (unsigned int *)address;
  unsigned int old = *address_as_int, assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_int, assumed, assumed << val);
  } while (assumed != old);
  return old;
}

__ATTRIBUTES unsigned int
__pgi_atomicLShiftur(void *address, unsigned int val)
{
  unsigned int *address_as_int = (unsigned int *)address;
  unsigned int old = *address_as_int, assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_int, assumed, val << assumed);
  } while (assumed != old);
  return old;
}

__ATTRIBUTES int
__pgi_atomicRShifti(void *address, int val)
{
  int *address_as_int = (int *)address;
  int old = *address_as_int, assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_int, assumed, assumed >> val);
  } while (assumed != old);
  return old;
}

__ATTRIBUTES int
__pgi_atomicRShiftir(void *address, int val)
{
  int *address_as_int = (int *)address;
  int old = *address_as_int, assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_int, assumed, val >> assumed);
  } while (assumed != old);
  return old;
}

__ATTRIBUTES int
__pgi_atomicLShifti(void *address, int val)
{
  int *address_as_int = (int *)address;
  int old = *address_as_int, assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_int, assumed, assumed << val);
  } while (assumed != old);
  return old;
}

__ATTRIBUTES int
__pgi_atomicLShiftir(void *address, int val)
{
  int *address_as_int = (int *)address;
  int old = *address_as_int, assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_int, assumed, val << assumed);
  } while (assumed != old);
  return old;
}

__ATTRIBUTES int
__pgi_atomicNoti(void *address)
{
  int *address_as_int = (int *)address;
  int old = *address_as_int, assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_int, assumed, ~assumed);
  } while (assumed != old);
  return old;
}

__ATTRIBUTES int
__pgi_atomicMaxi(void *address, int val)
{
  return atomicMax((int *)address, val);
}

__ATTRIBUTES unsigned int
__pgi_atomicMaxu(void *address, unsigned int val)
{
  return atomicMax((unsigned int *)address, val);
}

__ATTRIBUTES float
__pgi_atomicMaxf(void *address, float val)
{
  int *address_as_int = (int *)address;
  int old = *address_as_int, assumed;
  if (val > __int_as_float(old)) {
    do {
      assumed = old;
      old = atomicCAS(address_as_int, assumed, __float_as_int(val));
    } while ((assumed != old) && (val > __int_as_float(old)));
  }
  return __int_as_float(old);
}

__ATTRIBUTES unsigned long long
__pgi_atomicMaxul(void *address, unsigned long long val)
{
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull, assumed;
  if (val > old) {
    do {
      assumed = old;
      old = atomicCAS(address_as_ull, assumed, val);
    } while ((assumed != old) && (val > old));
  }
  return old;
}

__ATTRIBUTES long long
__pgi_atomicMaxil(void *address, long long val)
{
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  long long int old = *address_as_ull, assumed;
  if (val > old) {
    do {
      assumed = old;
      old = atomicCAS(address_as_ull, (unsigned long long)assumed,
                      (unsigned long long)val);
    } while ((assumed != old) && (val > old));
  }
  return old;
}

__ATTRIBUTES double
__pgi_atomicMaxd(void *address, double val)
{
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull, assumed;
  if (val > __longlong_as_double(old)) {
    do {
      assumed = old;
      old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val));
    } while ((assumed != old) && (val > __longlong_as_double(old)));
  }
  return __longlong_as_double(old);
}

__ATTRIBUTES unsigned int
__pgi_atomicIncu(void *address, unsigned int val)
{
  return atomicInc((unsigned int *)address, val);
}

__ATTRIBUTES unsigned int
__pgi_atomicDecu(void *address, unsigned int val)
{
  return atomicDec((unsigned int *)address, val);
}

__ATTRIBUTES int
__pgi_atomicCASi(void *address, int val, int val2)
{
  return atomicCAS((int *)address, val, val2);
}

__ATTRIBUTES unsigned int
__pgi_atomicCASu(void *address, unsigned int val, unsigned int val2)
{
  return atomicCAS((unsigned int *)address, val, val2);
}

__ATTRIBUTES float
__pgi_atomicCASf(void *address, float val, float val2)
{
  int ival, ires;
  /* if val is a NaN, compare will always fail */
  ival = __float_as_int(val);
  if ((ival & 0x7fffffff) <= 0x7f800000) {
    ires = atomicCAS((int *)address, ival, __float_as_int(val2));
  } else {
    ires = *((int *)address);
  }
  return __int_as_float(ires);
}

__ATTRIBUTES unsigned long long
__pgi_atomicCASul(void *address, unsigned long long val,
                  unsigned long long val2)
{
  return atomicCAS((unsigned long long *)address, val, val2);
}

__ATTRIBUTES long long
__pgi_atomicCASil(void *address, long long val, long long val2)
{
  unsigned long long r, uval, uval2;
  uval = (unsigned long long)val;
  uval2 = (unsigned long long)val2;
  r = atomicCAS((unsigned long long *)address, *(unsigned long long *)&uval,
                *(unsigned long long *)&uval2);
  return (long long)r;
}

__ATTRIBUTES double
__pgi_atomicCASd(void *address, double val, double val2)
{
  unsigned long long int ival, ires;
  /* if val is a NaN, compare will always fail */
  ival = __double_as_longlong(val);
  if ((ival & 0x7fffffffffffffff) <= 0x7ff0000000000000) {
    ires = atomicCAS((unsigned long long *)address, ival,
                     __double_as_longlong(val2));
  } else {
    ires = *((unsigned long long *)address);
  }
  return __longlong_as_double(ires);
}

__ATTRIBUTES int
__pgi_atomicCASid(void *address, int ival1, double val2)
{
  int ival2, ires;
  ival2 = (int)val2;
  ires = atomicCAS((int *)address, ival1, ival2);
  return ires;
}

__ATTRIBUTES int
__pgi_atomicAndi(void *address, int val)
{
  return atomicAnd((int *)address, val);
}

__ATTRIBUTES unsigned int
__pgi_atomicAndu(void *address, unsigned int val)
{
  return atomicAnd((unsigned int *)address, val);
}

__ATTRIBUTES int
__pgi_atomicOri(void *address, int val)
{
  return atomicOr((int *)address, val);
}

__ATTRIBUTES unsigned int
__pgi_atomicOru(void *address, unsigned int val)
{
  return atomicOr((unsigned int *)address, val);
}

__ATTRIBUTES int
__pgi_atomicXori(void *address, int val)
{
  return atomicXor((int *)address, val);
}

__ATTRIBUTES unsigned int
__pgi_atomicXoru(void *address, unsigned int val)
{
  return atomicXor((unsigned int *)address, val);
}

__ATTRIBUTES int
__pgi_atomicDivi(void *address, int val)
{
  int *address_as_int = (int *)address;
  int old = *address_as_int, assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_int, assumed, assumed / val);
  } while (assumed != old);
  return old;
}

__ATTRIBUTES unsigned int
__pgi_atomicDivu(void *address, unsigned int val)
{
  unsigned int *address_as_int = (unsigned int *)address;
  unsigned int old = *address_as_int, assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_int, assumed, assumed / val);
  } while (assumed != old);
  return old;
}

__ATTRIBUTES float
__pgi_atomicDivf(void *address, float val)
{
  float *address_as_float = (float *)address;
  float old = *address_as_float, assumed;

  do {
    assumed = old;
    old = __pgi_atomicCASf(address_as_float, assumed, assumed / val);
  } while (assumed != old);
  return old;
}

__ATTRIBUTES float
__pgi_atomicDivfr(void *address, float val)
{
  int *address_as_int = (int *)address;
  int old = *address_as_int;
  int assumed;

  do {
    assumed = old;
    old = atomicCAS(
        address_as_int, assumed,
        __float_as_int( val / __int_as_float(assumed) ));
  } while (assumed != old);
  return __int_as_float(old);
}

__ATTRIBUTES double
__pgi_atomicDivd(void *address, double val)
{
  double *address_as_double = (double *)address;
  double old = *address_as_double, assumed;

  do {
    assumed = old;
    old = __pgi_atomicCASd(address_as_double, assumed, assumed / val);
  } while (assumed != old);
  return old;
}

__ATTRIBUTES double
__pgi_atomicDivdr(void *address, double val)
{
  double *address_as_double = (double *)address;
  double old = *address_as_double, assumed;

  do {
    assumed = old;
    old = __pgi_atomicCASd(address_as_double, assumed, val / assumed );
  } while (assumed != old);
  return old;
}

__ATTRIBUTES double
__pgi_atomicSubdr(void *address, double val)
{
  double *address_as_double = (double *)address;
  double old = *address_as_double, assumed;

  do {
    assumed = old;
    old = __pgi_atomicCASd(address_as_double, assumed, val - assumed );
  } while (assumed != old);
  return old;
}


// Add Operation
__ATTRIBUTES int
__pgi_atomicAddif(void *address, float val)
{
  int *address_as_int = (int *)address;
  int old = *address_as_int;
  int assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_int, assumed, (int)((float)assumed + val));
  } while (assumed != old);
  return old;
}

__ATTRIBUTES int
__pgi_atomicAddid(void *address, double val)
{
  int *address_as_int = (int *)address;
  int old = *address_as_int;
  int assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_int, assumed, (int)((double)assumed + val));
  } while (assumed != old);
  return old;
}

__ATTRIBUTES long long int
__pgi_atomicAddkf(void *address, float val)
{
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  long long int old = *((long long int *)address_as_ull);
  long long int assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_ull, (unsigned long long int)assumed,
                    (unsigned long long int)((float)assumed + val));
  } while (assumed != old);
  return old;
}

__ATTRIBUTES long long int
__pgi_atomicAddkd(void *address, double val)
{
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  long long int old = *((long long int *)address_as_ull);
  long long int assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_ull, (unsigned long long int)assumed,
                    (unsigned long long int)((double)assumed + val));
  } while (assumed != old);
  return old;
}

__ATTRIBUTES unsigned int
__pgi_atomicAdduf(void *address, float val)
{
  unsigned int *address_as_uint = (unsigned int *)address;
  unsigned int old = *address_as_uint;
  unsigned int assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_uint, assumed,
                    (unsigned int)((float)assumed + val));
  } while (assumed != old);
  return old;
}

__ATTRIBUTES unsigned int
__pgi_atomicAddud(void *address, double val)
{
  unsigned int *address_as_uint = (unsigned int *)address;
  unsigned int old = *address_as_uint;
  unsigned int assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_uint, assumed,
                    (unsigned int)((double)assumed + val));
  } while (assumed != old);
  return old;
}

__ATTRIBUTES unsigned long long int
__pgi_atomicAddlf(void *address, float val)
{
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull;
  unsigned long long int assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_ull, (unsigned long long int)assumed,
                    (unsigned long long int)((float)assumed + val));
  } while (assumed != old);
  return old;
}

__ATTRIBUTES unsigned long long int
__pgi_atomicAddld(void *address, double val)
{
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull;
  unsigned long long int assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_ull, (unsigned long long int)assumed,
                    (unsigned long long int)((double)assumed + val));
  } while (assumed != old);
  return old;
}

__ATTRIBUTES float
__pgi_atomicAddfd(void *address, double val)
{
  int *address_as_int = (int *)address;
  int old = *address_as_int;
  int assumed;

  do {
    assumed = old;
    old = atomicCAS(
        address_as_int, assumed,
        __float_as_int((float)((double)__int_as_float(assumed) + val)));
  } while (assumed != old);
  return __int_as_float(old);
}

// Sub Operation
__ATTRIBUTES int
__pgi_atomicSubif(void *address, float val)
{
  int *address_as_int = (int *)address;
  int old = *address_as_int;
  int assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_int, assumed, (int)((float)assumed - val));
  } while (assumed != old);
  return old;
}

__ATTRIBUTES int
__pgi_atomicSubifr(void *address, float val)
{
  int *address_as_int = (int *)address;
  int old = *address_as_int;
  int assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_int, assumed, (int)(val - (float)assumed));
  } while (assumed != old);
  return old;
}

__ATTRIBUTES int
__pgi_atomicSubid(void *address, double val)
{
  int *address_as_int = (int *)address;
  int old = *address_as_int;
  int assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_int, assumed, (int)((double)assumed - val));
  } while (assumed != old);
  return old;
}

__ATTRIBUTES int
__pgi_atomicSubidr(void *address, double val)
{
  int *address_as_int = (int *)address;
  int old = *address_as_int;
  int assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_int, assumed, (int)((double)val - assumed));
  } while (assumed != old);
  return old;
}

__ATTRIBUTES long long int
__pgi_atomicSubkf(void *address, float val)
{
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  long long int old = *((long long int *)address_as_ull);
  long long int assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_ull, (unsigned long long int)assumed,
                    (unsigned long long int)((float)assumed - val));
  } while (assumed != old);
  return old;
}

__ATTRIBUTES long long int
__pgi_atomicSubkfr(void *address, float val)
{
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  long long int old = *((long long int *)address_as_ull);
  long long int assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_ull, (unsigned long long int)assumed,
                    (unsigned long long int)(val - (float)assumed));
  } while (assumed != old);
  return old;
}

__ATTRIBUTES long long int
__pgi_atomicSubkd(void *address, double val)
{
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  long long int old = *((long long int *)address_as_ull);
  long long int assumed;

  do {
    assumed = old;
    old = atomicCAS(
        address_as_ull, (unsigned long long int)assumed,
        (unsigned long long int)(long long int)((double)assumed - val));
  } while (assumed != old);
  return old;
}

__ATTRIBUTES long long int
__pgi_atomicSubkdr(void *address, double val)
{
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  long long int old = *((long long int *)address_as_ull);
  long long int assumed;

  do {
    assumed = old;
    old = atomicCAS(
        address_as_ull, (unsigned long long int)assumed,
        (unsigned long long int)(long long int)(val - (double)assumed));
  } while (assumed != old);
  return old;
}

__ATTRIBUTES unsigned int
__pgi_atomicSubuf(void *address, float val)
{
  unsigned int *address_as_uint = (unsigned int *)address;
  unsigned int old = *address_as_uint;
  unsigned int assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_uint, assumed,
                    (unsigned int)(int)((float)assumed - val));
  } while (assumed != old);
  return old;
}

__ATTRIBUTES unsigned int
__pgi_atomicSubufr(void *address, float val)
{
  unsigned int *address_as_uint = (unsigned int *)address;
  unsigned int old = *address_as_uint;
  unsigned int assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_uint, assumed,
                    (unsigned int)(int)(val - (float)assumed));
  } while (assumed != old);
  return old;
}

__ATTRIBUTES unsigned int
__pgi_atomicSubud(void *address, double val)
{
  unsigned int *address_as_uint = (unsigned int *)address;
  unsigned int old = *address_as_uint;
  unsigned int assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_uint, assumed,
                    (unsigned int)(int)((double)assumed - val));
  } while (assumed != old);
  return old;
}

__ATTRIBUTES unsigned int
__pgi_atomicSubudr(void *address, double val)
{
  unsigned int *address_as_uint = (unsigned int *)address;
  unsigned int old = *address_as_uint;
  unsigned int assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_uint, assumed,
                    (unsigned int)(int)(val - (double)assumed));
  } while (assumed != old);
  return old;
}

__ATTRIBUTES unsigned long long int
__pgi_atomicSublf(void *address, float val)
{
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull;
  unsigned long long int assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_ull, (unsigned long long int)assumed,
                    (unsigned long long int)(int)((float)assumed - val));
  } while (assumed != old);
  return old;
}

__ATTRIBUTES unsigned long long int
__pgi_atomicSublfr(void *address, float val)
{
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull;
  unsigned long long int assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_ull, (unsigned long long int)assumed,
                    (unsigned long long int)(int)(val - (float)assumed));
  } while (assumed != old);
  return old;
}

__ATTRIBUTES unsigned long long int
__pgi_atomicSubld(void *address, double val)
{
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull;
  unsigned long long int assumed;

  do {
    assumed = old;
    old = atomicCAS(
        address_as_ull, (unsigned long long int)assumed,
        (unsigned long long int)(long long int)((double)assumed - val));
  } while (assumed != old);
  return old;
}

__ATTRIBUTES unsigned long long int
__pgi_atomicSubldr(void *address, double val)
{
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull;
  unsigned long long int assumed;

  do {
    assumed = old;
    old = atomicCAS(
        address_as_ull, (unsigned long long int)assumed,
        (unsigned long long int)(long long int)(val - (double)assumed));
  } while (assumed != old);
  return old;
}

__ATTRIBUTES float
__pgi_atomicSubfd(void *address, double val)
{
  int *address_as_int = (int *)address;
  int old = *address_as_int;
  int assumed;

  do {
    assumed = old;
    old = atomicCAS(
        address_as_int, assumed,
        __float_as_int((float)((double)__int_as_float(assumed) - val)));
  } while (assumed != old);
  return __int_as_float(old);
}

__ATTRIBUTES float
__pgi_atomicSubfdr(void *address, double val)
{
  int *address_as_int = (int *)address;
  int old = *address_as_int;
  int assumed;

  do {
    assumed = old;
    old = atomicCAS(
        address_as_int, assumed,
        __float_as_int((float)(val - (double)__int_as_float(assumed))));
  } while (assumed != old);
  return __int_as_float(old);
}

// Div Operation
__ATTRIBUTES int
__pgi_atomicDivif(void *address, float val)
{
  int *address_as_int = (int *)address;
  int old = *address_as_int;
  int assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_int, assumed, (int)((float)assumed / val));
  } while (assumed != old);
  return old;
}

__ATTRIBUTES int
__pgi_atomicDivifr(void *address, float val)
{
  int *address_as_int = (int *)address;
  int old = *address_as_int;
  int assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_int, assumed, (int)(val / (float)assumed));
  } while (assumed != old);
  return old;
}

__ATTRIBUTES int
__pgi_atomicDivid(void *address, double val)
{
  int *address_as_int = (int *)address;
  int old = *address_as_int;
  int assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_int, assumed, (int)((double)assumed / val));
  } while (assumed != old);
  return old;
}

__ATTRIBUTES int
__pgi_atomicDividr(void *address, double val)
{
  int *address_as_int = (int *)address;
  int old = *address_as_int;
  int assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_int, assumed, (int)((double)val / assumed));
  } while (assumed != old);
  return old;
}

__ATTRIBUTES long long int
__pgi_atomicDivkf(void *address, float val)
{
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  long long int old = *((long long int *)address_as_ull);
  long long int assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_ull, (unsigned long long int)assumed,
                    (unsigned long long int)((float)assumed / val));
  } while (assumed != old);
  return old;
}

__ATTRIBUTES long long int
__pgi_atomicDivkfr(void *address, float val)
{
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  long long int old = *((long long int *)address_as_ull);
  long long int assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_ull, (unsigned long long int)assumed,
                    (unsigned long long int)(val / (float)assumed));
  } while (assumed != old);
  return old;
}

__ATTRIBUTES long long int
__pgi_atomicDivkd(void *address, double val)
{
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  long long int old = *((long long int *)address_as_ull);
  long long int assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_ull, (unsigned long long int)assumed,
                    (unsigned long long int)((double)assumed / val));
  } while (assumed != old);
  return old;
}

__ATTRIBUTES long long int
__pgi_atomicDivkdr(void *address, double val)
{
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  long long int old = *((long long int *)address_as_ull);
  long long int assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_ull, (unsigned long long int)assumed,
                    (unsigned long long int)(val / (double)assumed));
  } while (assumed != old);
  return old;
}

__ATTRIBUTES unsigned int
__pgi_atomicDivuf(void *address, float val)
{
  unsigned int *address_as_uint = (unsigned int *)address;
  unsigned int old = *address_as_uint;
  unsigned int assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_uint, assumed,
                    (unsigned int)((float)assumed / val));
  } while (assumed != old);
  return old;
}

__ATTRIBUTES unsigned int
__pgi_atomicDivufr(void *address, float val)
{
  unsigned int *address_as_uint = (unsigned int *)address;
  unsigned int old = *address_as_uint;
  unsigned int assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_uint, assumed,
                    (unsigned int)(val / (float)assumed));
  } while (assumed != old);
  return old;
}

__ATTRIBUTES unsigned int
__pgi_atomicDivud(void *address, double val)
{
  unsigned int *address_as_uint = (unsigned int *)address;
  unsigned int old = *address_as_uint;
  unsigned int assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_uint, assumed,
                    (unsigned int)((double)assumed / val));
  } while (assumed != old);
  return old;
}

__ATTRIBUTES unsigned int
__pgi_atomicDivudr(void *address, double val)
{
  unsigned int *address_as_uint = (unsigned int *)address;
  unsigned int old = *address_as_uint;
  unsigned int assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_uint, assumed,
                    (unsigned int)(val / (double)assumed));
  } while (assumed != old);
  return old;
}

__ATTRIBUTES unsigned long long int
__pgi_atomicDivlf(void *address, float val)
{
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull;
  unsigned long long int assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_ull, (unsigned long long int)assumed,
                    (unsigned long long int)((float)assumed / val));
  } while (assumed != old);
  return old;
}

__ATTRIBUTES unsigned long long int
__pgi_atomicDivlfr(void *address, float val)
{
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull;
  unsigned long long int assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_ull, (unsigned long long int)assumed,
                    (unsigned long long int)(val / (float)assumed));
  } while (assumed != old);
  return old;
}

__ATTRIBUTES unsigned long long int
__pgi_atomicDivld(void *address, double val)
{
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull;
  unsigned long long int assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_ull, (unsigned long long int)assumed,
                    (unsigned long long int)((double)assumed / val));
  } while (assumed != old);
  return old;
}

__ATTRIBUTES unsigned long long int
__pgi_atomicDivldr(void *address, double val)
{
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull;
  unsigned long long int assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_ull, (unsigned long long int)assumed,
                    (unsigned long long int)(val / (double)assumed));
  } while (assumed != old);
  return old;
}

__ATTRIBUTES float
__pgi_atomicDivfd(void *address, double val)
{
  int *address_as_int = (int *)address;
  int old = *address_as_int;
  int assumed;

  do {
    assumed = old;
    old = atomicCAS(
        address_as_int, assumed,
        __float_as_int((float)((double)__int_as_float(assumed) / val)));
  } while (assumed != old);
  return __int_as_float(old);
}

__ATTRIBUTES float
__pgi_atomicDivfdr(void *address, double val)
{
  int *address_as_int = (int *)address;
  int old = *address_as_int;
  int assumed;

  do {
    assumed = old;
    old = atomicCAS(
        address_as_int, assumed,
        __float_as_int((float)(val / (double)__int_as_float(assumed))));
  } while (assumed != old);
  return __int_as_float(old);
}
// Mul Operation
__ATTRIBUTES int
__pgi_atomicMulif(void *address, float val)
{
  int *address_as_int = (int *)address;
  int old = *address_as_int;
  int assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_int, assumed, (int)((float)assumed * val));
  } while (assumed != old);
  return old;
}

__ATTRIBUTES int
__pgi_atomicMulid(void *address, double val)
{
  int *address_as_int = (int *)address;
  int old = *address_as_int;
  int assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_int, assumed, (int)((double)assumed * val));
  } while (assumed != old);
  return old;
}

__ATTRIBUTES long long int
__pgi_atomicMulkf(void *address, float val)
{
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  long long int old = *((long long int *)address_as_ull);
  long long int assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_ull, (unsigned long long int)assumed,
                    (unsigned long long int)((float)assumed * val));
  } while (assumed != old);
  return old;
}

__ATTRIBUTES long long int
__pgi_atomicMulkd(void *address, double val)
{
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  long long int old = *((long long int *)address_as_ull);
  long long int assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_ull, (unsigned long long int)assumed,
                    (unsigned long long int)((double)assumed * val));
  } while (assumed != old);
  return old;
}

__ATTRIBUTES unsigned int
__pgi_atomicMuluf(void *address, float val)
{
  unsigned int *address_as_uint = (unsigned int *)address;
  unsigned int old = *address_as_uint;
  unsigned int assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_uint, assumed,
                    (unsigned int)((float)assumed * val));
  } while (assumed != old);
  return old;
}

__ATTRIBUTES unsigned int
__pgi_atomicMulud(void *address, double val)
{
  unsigned int *address_as_uint = (unsigned int *)address;
  unsigned int old = *address_as_uint;
  unsigned int assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_uint, assumed,
                    (unsigned int)((double)assumed * val));
  } while (assumed != old);
  return old;
}

__ATTRIBUTES unsigned long long int
__pgi_atomicMullf(void *address, float val)
{
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull;
  unsigned long long int assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_ull, (unsigned long long int)assumed,
                    (unsigned long long int)((float)assumed * val));
  } while (assumed != old);
  return old;
}

__ATTRIBUTES unsigned long long int
__pgi_atomicMulld(void *address, double val)
{
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull;
  unsigned long long int assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_ull, (unsigned long long int)assumed,
                    (unsigned long long int)((double)assumed * val));
  } while (assumed != old);
  return old;
}

__ATTRIBUTES float
__pgi_atomicMulfd(void *address, double val)
{
  int *address_as_int = (int *)address;
  int old = *address_as_int;
  int assumed;

  do {
    assumed = old;
    old = atomicCAS(
        address_as_int, assumed,
        __float_as_int((float)((double)__int_as_float(assumed) * val)));
  } while (assumed != old);
  return __int_as_float(old);
}

// Max Operation
__ATTRIBUTES int
__pgi_atomicMaxif(void *address, float val)
{
  int *address_as_int = (int *)address;
  int old = *address_as_int;
  int assumed;

  if (val > (float)old) {
    do {
      assumed = old;
      old = atomicCAS(address_as_int, assumed, (int)val);
    } while ((assumed != old) && (val > (float)old));
  }
  return old;
}

__ATTRIBUTES int
__pgi_atomicMaxid(void *address, double val)
{
  int *address_as_int = (int *)address;
  int old = *address_as_int;
  int assumed;

  if (val > (double)old) {
    do {
      assumed = old;
      old = atomicCAS(address_as_int, assumed, (int)val);
    } while ((assumed != old) && (val > (double)old));
  }
  return old;
}

__ATTRIBUTES long long int
__pgi_atomicMaxkf(void *address, float val)
{
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  long long int old = *((long long int *)address_as_ull);
  long long int assumed;

  if (val > (float)old) {
    do {
      assumed = old;
      old = atomicCAS(address_as_ull, (unsigned long long int)assumed,
                      (unsigned long long int)val);
    } while ((assumed != old) && (val > (float)old));
  }
  return old;
}

__ATTRIBUTES long long int
__pgi_atomicMaxkd(void *address, double val)
{
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  long long int old = *((long long int *)address_as_ull);
  long long int assumed;

  if (val > (double)old) {
    do {
      assumed = old;
      old = atomicCAS(address_as_ull, (unsigned long long int)assumed,
                      (unsigned long long int)val);
    } while ((assumed != old) && (val > (double)old));
  }

  return old;
}

__ATTRIBUTES unsigned int
__pgi_atomicMaxuf(void *address, float val)
{
  unsigned int *address_as_uint = (unsigned int *)address;
  unsigned int old = *address_as_uint;
  unsigned int assumed;

  if (val > (float)old) {
    do {
      assumed = old;
      old = atomicCAS(address_as_uint, assumed, (unsigned int)val);
    } while ((assumed != old) && (val > (float)old));
  }
  return old;
}

__ATTRIBUTES unsigned int
__pgi_atomicMaxud(void *address, double val)
{
  unsigned int *address_as_uint = (unsigned int *)address;
  unsigned int old = *address_as_uint;
  unsigned int assumed;

  if (val > (double)old) {
    do {
      assumed = old;
      old = atomicCAS(address_as_uint, assumed, (unsigned int)val);
    } while ((assumed != old) && (val > (double)old));
  }
  return old;
}

__ATTRIBUTES unsigned long long int
__pgi_atomicMaxlf(void *address, float val)
{
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull;
  unsigned long long int assumed;

  if (val > (float)old) {
    do {
      assumed = old;
      old = atomicCAS(address_as_ull, assumed, (unsigned long long int)val);
    } while ((assumed != old) && (val > (float)old));
  }
  return old;
}

__ATTRIBUTES unsigned long long int
__pgi_atomicMaxld(void *address, double val)
{
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull;
  unsigned long long int assumed;
  if (val > (double)old) {
    do {
      assumed = old;
      old = atomicCAS(address_as_ull, assumed, (unsigned long long int)val);
    } while ((assumed != old) && (val > (double)old));
  }
  return old;
}

__ATTRIBUTES float
__pgi_atomicMaxfd(void *address, double val)
{
  int *address_as_int = (int *)address;
  int old = *address_as_int;
  int assumed;

  if (val > (double)__int_as_float(old)) {
    do {
      assumed = old;
      old = atomicCAS(address_as_int, assumed, __float_as_int((float)val));
    } while ((assumed != old) && (val > (double)__int_as_float(old)));
  }
  return __int_as_float(old);
}

// Min Operation
__ATTRIBUTES int
__pgi_atomicMinif(void *address, float val)
{
  int *address_as_int = (int *)address;
  int old = *address_as_int;
  int assumed;

  if (val < (float)old) {
    do {
      assumed = old;
      old = atomicCAS(address_as_int, assumed, (int)val);
    } while ((assumed != old) && (val < (float)old));
  }
  return old;
}

__ATTRIBUTES int
__pgi_atomicMinid(void *address, double val)
{
  int *address_as_int = (int *)address;
  int old = *address_as_int;
  int assumed;

  if (val < (double)old) {
    do {
      assumed = old;
      old = atomicCAS(address_as_int, assumed, (int)val);
    } while ((assumed != old) && (val < (double)old));
  }
  return old;
}

__ATTRIBUTES long long int
__pgi_atomicMinkf(void *address, float val)
{
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  long long int old = *((long long int *)address_as_ull);
  long long int assumed;

  if (val < (float)old) {
    do {
      assumed = old;
      old = atomicCAS(address_as_ull, (unsigned long long int)assumed,
                      (unsigned long long int)val);
    } while ((assumed != old) && (val < (float)old));
  }
  return old;
}

__ATTRIBUTES long long int
__pgi_atomicMinkd(void *address, double val)
{
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  long long int old = *((long long int *)address_as_ull);
  long long int assumed;

  if (val < (double)old) {
    do {
      assumed = old;
      old = atomicCAS(address_as_ull, (unsigned long long int)assumed,
                      (unsigned long long int)val);
    } while ((assumed != old) && (val < (double)old));
  }

  return old;
}

__ATTRIBUTES unsigned int
__pgi_atomicMinuf(void *address, float val)
{
  unsigned int *address_as_uint = (unsigned int *)address;
  unsigned int old = *address_as_uint;
  unsigned int assumed;

  if (val < (float)old) {
    do {
      assumed = old;
      old = atomicCAS(address_as_uint, assumed, (unsigned int)val);
    } while ((assumed != old) && (val < (float)old));
  }
  return old;
}

__ATTRIBUTES unsigned int
__pgi_atomicMinud(void *address, double val)
{
  unsigned int *address_as_uint = (unsigned int *)address;
  unsigned int old = *address_as_uint;
  unsigned int assumed;

  if (val < (double)old) {
    do {
      assumed = old;
      old = atomicCAS(address_as_uint, assumed, (unsigned int)val);
    } while ((assumed != old) && (val < (double)old));
  }
  return old;
}

__ATTRIBUTES unsigned long long int
__pgi_atomicMinlf(void *address, float val)
{
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull;
  unsigned long long int assumed;

  if (val < (float)old) {
    do {
      assumed = old;
      old = atomicCAS(address_as_ull, assumed, (unsigned long long int)val);
    } while ((assumed != old) && (val < (float)old));
  }
  return old;
}

__ATTRIBUTES unsigned long long int
__pgi_atomicMinld(void *address, double val)
{
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull;
  unsigned long long int assumed;
  if (val < (double)old) {
    do {
      assumed = old;
      old = atomicCAS(address_as_ull, assumed, (unsigned long long int)val);
    } while ((assumed != old) && (val < (double)old));
  }
  return old;
}

__ATTRIBUTES float
__pgi_atomicMinfd(void *address, double val)
{
  int *address_as_int = (int *)address;
  int old = *address_as_int;
  int assumed;

  if (val < (double)__int_as_float(old)) {
    do {
      assumed = old;
      old = atomicCAS(address_as_int, assumed, __float_as_int((float)val));
    } while ((assumed != old) && (val < (double)__int_as_float(old)));
  }
  return __int_as_float(old);
}
#endif /* !defined(PGI_COMPILE_BITCODE) */

#if !defined(PGI_COMPILE_BITCODE)
__ATTRIBUTES int
__pgi_atomicLeqvi(void *address, int val)
{
  int *address_as_int = (int *)address;
  int old = *address_as_int, assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_int, assumed, ~(assumed ^ val));
  } while (assumed != old);
  return old;
}

__ATTRIBUTES long long
__pgi_atomicLeqvil(void *address, long long val)
{
  unsigned long long *address_as_int = (unsigned long long *)address;
  unsigned long long old = *address_as_int, assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_int, assumed, ~(assumed ^ val));
  } while (assumed != old);
  return old;
}
#endif /* !defined(PGI_COMPILE_BITCODE) */

extern void *memcpy(void *dest, const void *src, size_t n);
extern void *memset(void *s, int c, size_t n);

/* Vanilla memcpy */
__ATTRIBUTES void
__pgi_memcpy(signed char *dst, signed char *src, size_t num_bytes)
{
  size_t i;
  for (i = 0; i < num_bytes; i++) {
    dst[i] = src[i];
  }
}

/* LLVM memcpy */
__ATTRIBUTES void
__llvm_memcpy(signed char *dest, signed char *src, size_t len, char isvolatile)
{
  memcpy(dest, src, len);
}

__ATTRIBUTES void
__llvm_memset(signed char *dest, char val, size_t len, char isvolatile)
{
  memset(dest, val, len);
}


/* LLVM memmove - similar to LLVM memcpy but allows overlap */
__ATTRIBUTES void
__llvm_memmove(signed char *dest, signed char *src, size_t len, char isvolatile)
{
  // Instead of using memcpy, use the pgi_memcpy which copies one byte at a
  // time and thus allows overlap.
  __pgi_memcpy(dest, src, len);
}

/* Vanilla memcmp */
__ATTRIBUTES int
__pgi_memcmp(signed char *s1, signed char *s2, size_t num_bytes)
{
  size_t i;
  for (i = 0; i < num_bytes; i++) {
    if (s1[i] < s2[i])
      return -1;
    if (s1[i] > s2[i])
      return 1;
  }
  return 0;
}

/* Vanilla memset */
__ATTRIBUTES void
__pgi_memset(signed char *dst, signed char src, size_t num_bytes)
{
  size_t i;
  for (i = 0; i < num_bytes; i++) {
    dst[i] = src;
  }
}

/* Keep these even in the presence of pghpf_transfer for */
/* performance reasons, simpler, fewer registers, etc.   */

__ATTRIBUTES int
__pgi_transfer_real2int(float x, int i)
{
  return __float_as_int(x);
}

__ATTRIBUTES float
__pgi_transfer_int2real(int i, float x)
{
  return __int_as_float(i);
}

__ATTRIBUTES long long
__pgi_transfer_dbl2long(double x, long long i)
{
  return __double_as_longlong(x);
}

__ATTRIBUTES double
__pgi_transfer_long2dbl(long long i, double x)
{
  return __longlong_as_double(i);
}

__ATTRIBUTES int
__pgi_transfer_char2int(signed char *x, int i, size_t n)
{
  return *((int *)x);
}

__ATTRIBUTES void
__pgi_transfer_int2char(signed char *x, signed char *y, signed char *z,
                        size_t len1, size_t len2)
{
  __pgi_memcpy(x, y, len1);
  return;
}

__ATTRIBUTES long long
__pgi_transfer_cdevptr(signed char *x, long long i)
{
  return *((long long *)x);
}


__ATTRIBUTES void
pghpf_transfer(signed char *rb, signed char *sb, signed char *rs,
               signed char *ss, signed char *rd, signed char *sd,
               signed char *rsd, signed char *msd)
{
  int result_scalar, source_scalar;
  int i, j, size, rsize, ssize, extent;
  int contiguous, offset;
  int lbi, extenti, stridei, ioff;
  int *cntr;

  struct F90_Desc *result = (struct F90_Desc *)rd;
  struct F90_Desc *source = (struct F90_Desc *)sd;

  result_scalar = result->tag != 35;
  source_scalar = source->tag != 35;
  rsize = *((int *)rs);
  ssize = *((int *)ss);

  if (result_scalar && source_scalar) {
    if (rsize > ssize) {
      rsize = ssize;
    }
    __pgi_memcpy(rb, sb, rsize);
    return;
  } else if (source_scalar) {
    extent = result->dim[0].extent;
    if (extent < 0) {
      extent = 0;
    }
    rsize *= extent;
    while (ssize > 0 && rsize > 0) {
      size = rsize;
      if (size > ssize) {
        size = ssize;
      }
      __pgi_memcpy(rb, sb, size);
      rb += size;
      sb += size;
      ssize -= size;
      rsize -= size;
    }
    return;
  }

  extent = 1; /* The next loop computes contiguous extent */
  contiguous = 1;
  offset = source->lbase;

  /* There's some unused space in the result descriptor we can borrow */
  cntr = ((int *)rd) + 6;

  /* Walk through the source descriptor */
  for (i = 0; i < source->rank; i++) {
    cntr[i] = 0;
    lbi = source->dim[i].lbound;
    extenti = source->dim[i].extent;
    stridei = source->dim[i].lstride;
    if (i == 0) {
      if (stridei != extent) {
        contiguous = 0;
        offset += stridei - lbi;
      } else {
        offset += lbi - 1;
      }
      if (stridei >= 0)
        extent = (extenti - 1) * stridei + extent;
      if (stridei < 0)
        extent = (extenti - 1) * stridei - extent;
    } else {
      if (stridei != extent) {
        contiguous = 0;
      }
      offset += lbi * stridei;
      ioff = (extenti - 1) * stridei;
      if (ioff < 0)
        ioff = -ioff;
      if (extent >= 0)
        extent += ioff;
      if (extent < 0)
        extent -= ioff;
    }
    ssize *= extenti; /* This will become number of bytes of data
                         for the non-contiguous case */
  }

  if (contiguous) {
    ssize = *ss;
    __pgi_memcpy(rb, sb + ssize * offset, ssize * extent);
  } else {
    extent = result->dim[0].extent;
    if (extent < 0) {
      extent = 0;
    }
    rsize *= extent;
    while (ssize > 0 && rsize > 0) {
      ioff = 0;
      for (i = source->rank - 1; i >= 0; i--) {
        extenti = source->dim[i].extent;
        stridei = source->dim[i].lstride;
        ioff += cntr[i] * stridei;
        if (i == 0) {
          /* Update for next time */
          j = 0;
          cntr[j]++;
          while (cntr[j] == extenti) {
            cntr[j++] = 0;
            if (j == source->rank)
              break;
            cntr[j]++;
            extenti = source->dim[j].extent;
          }
        }
      }
      size = rsize;
      if (size > *ss) {
        size = *ss;
      }
      __pgi_memcpy(rb, sb + (*ss) * (offset + ioff), size);
      rb += size;
      ssize -= size;
      rsize -= size;
    }
  }
  return;
}

__ATTRIBUTES void
pghpf_transfer_i8(signed char *rb, signed char *sb, signed char *rs,
                  signed char *ss, signed char *rd, signed char *sd,
                  signed char *rsd, signed char *msd)
{
  int result_scalar, source_scalar;
  int i, j, size, rsize, ssize, extent;
  int contiguous, offset;
  int lbi, extenti, stridei, ioff;
  int *cntr;

  struct F90_Desc_la *result = (struct F90_Desc_la *)rd;
  struct F90_Desc_la *source = (struct F90_Desc_la *)sd;

  result_scalar = result->tag != 35;
  source_scalar = source->tag != 35;
  rsize = *((int *)rs);
  ssize = *((int *)ss);

  if (result_scalar && source_scalar) {
    if (rsize > ssize) {
      rsize = ssize;
    }
    __pgi_memcpy(rb, sb, rsize);
    return;
  } else if (source_scalar) {
    extent = result->dim[0].extent;
    if (extent < 0) {
      extent = 0;
    }
    rsize *= extent;
    while (ssize > 0 && rsize > 0) {
      size = rsize;
      if (size > ssize) {
        size = ssize;
      }
      __pgi_memcpy(rb, sb, size);
      rb += size;
      sb += size;
      ssize -= size;
      rsize -= size;
    }
    return;
  }

  extent = 1; /* The next loop computes contiguous extent */
  contiguous = 1;
  offset = source->lbase;

  /* There's some unused space in the result descriptor we can borrow */
  cntr = ((int *)rd) + 6;

  /* Walk through the source descriptor */
  for (i = 0; i < source->rank; i++) {
    cntr[i] = 0;
    lbi = source->dim[i].lbound;
    extenti = source->dim[i].extent;
    stridei = source->dim[i].lstride;
    if (i == 0) {
      if (stridei != extent) {
        contiguous = 0;
        offset += stridei - lbi;
      } else {
        offset += lbi - 1;
      }
      if (stridei >= 0)
        extent = (extenti - 1) * stridei + extent;
      if (stridei < 0)
        extent = (extenti - 1) * stridei - extent;
    } else {
      if (stridei != extent) {
        contiguous = 0;
      }
      offset += lbi * stridei;
      ioff = (extenti - 1) * stridei;
      if (ioff < 0)
        ioff = -ioff;
      if (extent >= 0)
        extent += ioff;
      if (extent < 0)
        extent -= ioff;
    }
    ssize *= extenti; /* This will become number of bytes of data
                         for the non-contiguous case */
  }

  if (contiguous) {
    ssize = *ss;
    __pgi_memcpy(rb, sb + ssize * offset, ssize * extent);
  } else {
    extent = result->dim[0].extent;
    if (extent < 0) {
      extent = 0;
    }
    rsize *= extent;
    while (ssize > 0 && rsize > 0) {
      ioff = 0;
      for (i = source->rank - 1; i >= 0; i--) {
        extenti = source->dim[i].extent;
        stridei = source->dim[i].lstride;
        ioff += cntr[i] * stridei;
        if (i == 0) {
          /* Update for next time */
          j = 0;
          cntr[j]++;
          while (cntr[j] == extenti) {
            cntr[j++] = 0;
            if (j == source->rank)
              break;
            cntr[j]++;
            extenti = source->dim[j].extent;
          }
        }
      }
      size = rsize;
      if (size > *ss) {
        size = *ss;
      }
      __pgi_memcpy(rb, sb + (*ss) * (offset + ioff), size);
      rb += size;
      ssize -= size;
      rsize -= size;
    }
  }
  return;
}

__ATTRIBUTES int
__get_size_of(signed char *x)
{
  int i, y, ival;
  i = *x;
  if (i < 5)
    y = 2 * ((i + 1) >> 1);
  else if (i < 13) {
    ival = 0x00433233;
    y = 1 << (0xf & (ival >> (4 * (i - 5))));
  } else if (i < 21) {
    ival = 0x32103303;
    y = 1 << (0xf & (ival >> (4 * (i - 13))));
  } else if (i < 29) {
    ival = 0x32321132;
    y = 1 << (0xf & (ival >> (4 * (i - 21))));
  } else if (i < 37) {
    ival = 0x00000343;
    y = 1 << (0xf & (ival >> (4 * (i - 29))));
  } else {
    ival = 0x00544454;
    y = 1 << (0xf & (ival >> (4 * (i - 37))));
  }
  return y;
}
#define __REAL8 28
#define __REAL4 27

__ATTRIBUTES void
pghpf_norm2_nodim_i8(signed char *result, signed char *src, signed char * pfr,
                     signed char *result_kind, signed char *src_desc)
{
  int i, extent;
  int offset;
  int lbi, extenti, stridei, ioff;

  struct F90_Desc_la *source = (struct F90_Desc_la *)src_desc;

  /* Quick cases */
  if (source->len == 0) {
    double *dres = (double *)result;
    *dres = 0.0;
    return;

  } else if (source->tag != 35) {
    /* scalar */
    if (source->kind == __REAL8) {
      double *dsrc = (double *)src;
      double *dres = (double *)result;
      *dres = *dsrc;

    } else if (source->kind == __REAL4) {
      float *fsrc = (float *)src;
      double *dres = (double *)result;
      *dres = (double)(*fsrc);
    }
    return;

  } else if ((source->rank == 1) && (source->dim[0].lstride == 1)) {
    /* most common case */
    offset = source->lbase + source->dim[0].lbound - 1;
    if (source->kind == __REAL8) {
      double *dsrc = (double *)src;
      double *dres = (double *)result;
      double dnrm = 0.0;
      dsrc += offset;
      for (int i = 0; i < ((int)(source->dim[0].extent)); i++)
        dnrm += dsrc[i]*dsrc[i];
      *dres = sqrt(dnrm);

    } else if (source->kind == __REAL4) {
      float *fsrc = (float *)src;
      float *fres = (float *)result;
      double dnrm = 0.0;
      fsrc += offset;
      for (int i = 0; i < ((int)(source->dim[0].extent)); i++)
        dnrm += (((double)fsrc[i])*((double)(fsrc[i])));
      *fres = (float) sqrt(dnrm);
    }
    return;

  } else {

    /* handle all other cases */
    int n1,n2,n3,n4,n5,n6,n7;
    int i1,i2,i3,i4,i5,i6,i7;
    float  *fsrc = (float *)src;
    double *dsrc = (double *)src;
    double dnrm = 0.0;

    extent = 1; /* The next loop computes contiguous extent */
    offset = source->lbase;

    /* Walk through the source descriptor to compute offset */
    for (i = 0; i < source->rank; i++) {
      lbi = source->dim[i].lbound;
      extenti = source->dim[i].extent;
      stridei = source->dim[i].lstride;
      if (i == 0) {
        if (stridei != extent) {
          offset += stridei - lbi;
        } else {
          offset += lbi - 1;
        }
        if (stridei >= 0)
          extent = (extenti - 1) * stridei + extent;
        if (stridei < 0)
          extent = (extenti - 1) * stridei - extent;
      } else {
        offset += lbi * stridei;
        ioff = (extenti - 1) * stridei;
        if (ioff < 0)
          ioff = -ioff;
        if (extent >= 0)
          extent += ioff;
        if (extent < 0)
          extent -= ioff;
      }
    }

    dnrm = 0.0;
    /* This loop runs sequentially for each thread, so we carry strides around */
    n7 = (source->rank < 7) ? 1 : ((int) source->dim[6].extent); i7 = 0;
    do {
      n6 = (source->rank < 6) ? 1 : ((int) source->dim[5].extent); i6 = 0;
      do {
        n5 = (source->rank < 5) ? 1 : ((int) source->dim[4].extent); i5 = 0;
        do {
          n4 = (source->rank < 4) ? 1 : ((int) source->dim[3].extent); i4 = 0;
          do {
            n3 = (source->rank < 3) ? 1 : ((int) source->dim[2].extent); i3 = 0;
            do {
              n2 = (source->rank < 2) ? 1 : ((int) source->dim[1].extent); i2 = 0;
              do {
                n1 = ((int) source->dim[0].extent); i1 = 0;
                do {
                  i = offset+i7+i6+i5+i4+i3+i2+i1;
                  if (source->kind == __REAL8)
                    dnrm += dsrc[i]*dsrc[i];
                  if (source->kind == __REAL4)
                    dnrm += (((double)fsrc[i])*((double)(fsrc[i])));
                  i1 = i1 + (int)source->dim[0].lstride;
                } while (--n1 > 0);
                i2 = i2 + (int)source->dim[1].lstride;
              } while (--n2 > 0);
              i3 = i3 + (int)source->dim[2].lstride;
            } while (--n3 > 0);
            i4 = i4 + (int)source->dim[3].lstride;
          } while (--n4 > 0);
          i5 = i5 + (int)source->dim[4].lstride;
        } while (--n5 > 0);
        i6 = i6 + (int)source->dim[5].lstride;
      } while (--n6 > 0);
      i7 = i7 + (int)source->dim[6].lstride;
    } while (--n7 > 0);

    if (source->kind == __REAL8) {
      double *dres = (double *)result;
      *dres = sqrt(dnrm);
    } else if (source->kind == __REAL4) {
      float *fres = (float *)result;
      *fres = (float) sqrt(dnrm);
    }
  }
}

/* -------------------------------------------------------------------------- */

/* compute capability >= 2.0 */
#define __pgi_mul24(i, j) (i) * (j)
#define __pgi_umul24(i, j) (i) * (j)
#define __pgi_fdiv(x, y) (x) / (y)
#define __pgi_frcp(x) __frcp_rn(x)
#define __pgi_drcp(x) __drcp_rn(x)

#if !defined(PGI_COMPILE_BITCODE)
extern "C" __device__ int printf(const char *, ...);
#endif /* !defined(PGI_COMPILE_BITCODE) */

/* F90 IO */
__ATTRIBUTES void
__pgf90io_src_info(void *x, signed char *str, int lineno)
{
  /* stub routine, only used on the host for error reporting */
}

__ATTRIBUTES void
__pgf90io_src_info03(int n, void *x, signed char *str, long lineno)
{
  /* stub routine, only used on the host for error reporting */
}

__ATTRIBUTES void
__pgf90io_src_info03a(int n, void *x, signed char *str, long lineno)
{
  /* stub routine, only used on the host for error reporting */
}

__ATTRIBUTES void
pgf90_mp_bcs_nest_stub()
{
  /* stub routine, only used on the host for mp i/o */
}

__ATTRIBUTES void
pgf90_mp_ecs_nest_stub()
{
  /* stub routine, only used on the host for mp i/o */
}

__ATTRIBUTES int
__pgf90io_ldw_init(void *a, void *b, void *c, void *d, void *p)
{
  char *s;

  /* PGI list-directed i/o uses an 80 bit record length.
   * Need a few extras */
  *(char **)p = (char *)malloc(85);
  s = (char *)*(char **)p;
  s[0] = 2;
  for (int i = 1; i < 83; i++)
    s[i] = ' ';
  s[83] = '\0';
  s[84] = 0;
  return 0;
}

__ATTRIBUTES int
__pgf90io_print_init(void *a, void *b, void *c, void *d, void *p)
{
  char *s;

  /* PGI list-directed i/o uses an 80 bit record length.
   * Need a few extras */
  *(char **)p = (char *)malloc(85);
  s = (char *)*(char **)p;
  s[0] = 2;
  for (int i = 1; i < 83; i++)
    s[i] = ' ';
  s[83] = '\0';
  s[84] = 0;
  return 0;
}

__ATTRIBUTES int
__pgf90io_ldw_intern_inita(void *a, void *b, void *c, void *d, long long len, void *p)
{
  signed char *s;
  int ilen;

  /* Internal writes, need special case handling, no malloc */
  *(signed char **)p = (signed char *)(a);
  if (len > 0LL) {
    s = (signed char *)a;
    /* Limit here to buffers of length 128 */
    if (len > 128LL)
      ilen = 128;
    else
      ilen = len;
    for (int i = 0; i < ilen; i++)
      s[i] = ' ';

    s[ilen-1] = -1;  /* Last character has j, negated */
    s[0] = -(ilen); /* First character has index to j */
  }
  return 0;
}

__ATTRIBUTES int
__pgf90io_fmtw_init(void *a, void *b, void *c, void *d, signed char *fmt,
                    void *advp, long long advv, void *p)
{
  char *s;

  /* PGI list-directed i/o uses an 80 bit record length.
   * Need a few extras */
  *(char **)p = (char *)malloc(85);
  s = (char *)*(char **)p;
  s[0] = 2;
  for (int i = 1; i < 83; i++)
    s[i] = ' ';
  s[83] = '\0';
  s[84] = 0;
  return 0;
}

#define _OUTSIZE 104
#define _LDW_SPACE 1
#define _LDW_SP 2
#define _LDW_DP 4
#define _LDW_HP 8

/* Printing real data support */
__ATTRIBUTES int
__pgf90io_sc_fp_ldw(double x, signed char *ffmt, int w, int d, int ldw)
{
  int iexp, iwd0, iwdm1, iwdm2;
  unsigned int icarr;

  unsigned int iextb[_OUTSIZE];
  unsigned int *iexta;
  unsigned int *iextc;
  int i, j, k, kmx, firstdigit, pose, firstfwd /*, lastdigiteven*/;
  int ii, usee;
  unsigned int digit, ndigits, prdigits;
  int kbk, m, lw;
  int sgn, space;

  for (i = 0; i < _OUTSIZE; i++)
    iextb[i] = 0x0;
  iexta = &(iextb[70]);
  iextc = &(iextb[35]);

  { /* */
    unsigned long long lb0, lb1;
    unsigned int ibt0;
    lb0 = *((unsigned long long *)(&x));
    iexp = ((lb0 & 0x7ff0000000000000ULL) >> 52) - 1023;
    sgn = ((lb0 & 0x8000000000000000ULL) >> 63);
    lb1 = lb0 & 0x000fffffffffffffULL;
    if (iexp == 1024) {
      if (ldw & _LDW_DP)
        w++;
      if (ldw & _LDW_HP)
        w--;
      for (i = 0; i < w + 4; i++)
        ffmt[i] = ' ';
      if (lb1) {
        ffmt[w + 4] = 'N';
        ffmt[w + 5] = 'a';
        ffmt[w + 6] = 'N';
      } else {
        if (sgn)
          ffmt[w + 3] = '-';
        ffmt[w + 4] = 'I';
        ffmt[w + 5] = 'n';
        ffmt[w + 6] = 'f';
      }
      ffmt[w + 7] = '\0';
      return w + 7;
    }
    if (iexp + 1023) {
      lb1 |= 0x0010000000000000ULL; /* Put in implicit bit */
    } else {
      if (lb1) {
        iexp++; /* Adjust denormal exponent */
      } else {
        ffmt[0] = ' ';
        if (sgn)
          ffmt[1] = '-';
        else
          ffmt[1] = ' ';
        ffmt[2] = '0';
        ffmt[3] = '.';
        for (i = 4; i < w + 3; i++)
          ffmt[i] = '0';
        ffmt[w + 3] = '\0';
        return w + 3;
      }
    }
    if (iexp >= 0) {
      iwd0 = iexp / 32;
      ibt0 = iexp - (iwd0 * 32);
    } else {
      iwd0 = (iexp + 1) / 32;
      ibt0 = (iexp + 32) - (iwd0 * 32);
      iwd0 = iwd0 + 33;
    }
    iwdm1 = (iwd0) ? iwd0 - 1 : 33;
    iwdm2 = (iwdm1) ? iwdm1 - 1 : 33;

    /* Put the number into iexta */
    if (ibt0 < 20) {
      iexta[iwd0] = lb1 >> (52 - ibt0);
      iexta[iwdm1] = (lb1 >> (20 - ibt0)) & 0xffffffff;
      iexta[iwdm2] = (lb1 << (ibt0 + 12)) & 0xffffffff;
    } else {
      iexta[iwd0] = lb1 >> (52 - ibt0);
      iexta[iwdm1] = (lb1 << (ibt0 - 20)) & 0xffffffff;
    } /* */
  }

  iextc[0] = 1;
  kmx = 1;

  if (iexp >= 0) {
    /* Convert the whole number to base 1,000,000,000 */
    for (i = 0; i <= iwd0; i++) {
      for (j = 0; j < 32; j++) {
        if (iexta[i] & (1u << j)) {
          icarr = 0;
          for (k = 0; k < kmx; k++) {
            iextb[k] = iextb[k] + iextc[k] + icarr;
            if (iextb[k] >= 1000000000u) {
              iextb[k] = iextb[k] - 1000000000u;
              icarr = 1;
            } else {
              icarr = 0;
            }
          }
          iextb[kmx] += icarr;
        }
        icarr = 0;
        for (k = 0; k < kmx; k++) {
          iextc[k] = iextc[k] + iextc[k] + icarr;
          if (iextc[k] >= 1000000000u) {
            iextc[k] = iextc[k] - 1000000000u;
            icarr = 1;
          } else {
            icarr = 0;
          }
        }
        iextc[kmx] += icarr;
        kmx += icarr;
      }
    }
  }
  if (iextb[kmx - 1] == 0)
    kmx--;
  kmx--;

  prdigits = 0;
  if (kmx >= 0) {
    int c, rndval, sticky, rndup, loc;
    /* Find the first nonzero in the whole portion */
    digit = 1;
    k = iextb[kmx];
    for (i = 0; i < 9; i++) {
      if (k >= digit)
        pose = i;
      digit = digit * 10;
    }
    prdigits = kmx * 9 + pose + 1;
    if (prdigits > w) {
      loc = prdigits - w;
      kbk = 0;
      while (loc > 9) {
        loc -= 9;
        kbk++;
      }
      k = iextb[kbk];
      m = 1;
      c = 10;
      rndup = 0;
      sticky = 0;
      rndval = 0;
      if (loc > 1) {
        while (loc--) {
          j = k / 10;
          digit = k - (j * 10);
          if (loc > 0)
            rndup += (c - digit) * m;
          if (loc > 1)
            sticky |= (digit > 0);
          if (loc == 1)
            rndval = digit;
          k = j;
          m = m * 10;
          c = 9;
        }
        i = kbk;
        while (i && (!sticky)) {
          sticky |= (iextb[--i] != 0);
        }
      } else if (kbk > 0) {
        k = iextb[--kbk];
        if (k >= 500000000) {
          rndval = 5;
          rndup = 1000000000 - k;
          if (k > 500000000) {
            sticky = 1;
          } else {
            i = kbk - 1;
            while (i && (!sticky)) {
              sticky |= (iextb[i--] != 0);
            }
          }
        }
      }
      if ((rndval > 5) || ((rndval == 5) && sticky))
        iextb[kbk] += rndup;
      while (iextb[kbk] >= 1000000000) {
        iextb[kbk] = iextb[kbk] - 1000000000;
        iextb[kbk + 1] = iextb[kbk + 1] + 1;
        kbk++;
      }
    }
    /* Reverse the whole numbers now, so we can round into it when necessary */
    for (i = 0; i < (kmx + 1) / 2; i++) {
      k = iextb[i];
      iextb[i] = iextb[kmx - i];
      iextb[kmx - i] = k;
    }
  }

  /* Now the fraction bits */
  if ((iexp <= 51) && (prdigits <= w)) {
    unsigned int keepon, roundcarry, sh1, sh3;
    int savecarry, savek;
    digit = 0;
    keepon = 0;
    ndigits = 0;
    j = kmx + 1;
    k = kmx + 1;
    firstdigit = 0;
    savecarry = 0; roundcarry = 0;
    lw = w;
    while (iwdm2 < 34) {
      sh1 = 0;
      sh3 = 0;
      icarr = 0;
      for (i = iwdm2; i < 34; i++) {
        unsigned int ix, ixa, ixb, ixc;
        ix = iexta[i];
        ixa = (ix << 1) + sh1;
        ixb = (ix << 3) + sh3;
        sh1 = ix >> 31;
        sh3 = ix >> 29;
        ixc = ixa + ixb + icarr;
        icarr =
            ((ixc < ixa) || (ixc < ixb) || ((ixa & ixb & ixc) == 0xffffffff))
                ? 1
                : 0;
        iexta[i] = ixc;
      }
      if (iexta[iwdm2] == 0)
        iwdm2++;
      /* if (prdigits <= lw)
         lastdigiteven = ((digit / 2) * 2 == digit); */
      digit = (digit * 10) + (sh3 + sh1 + icarr);
      ++ndigits;
      if (digit && !firstdigit) {
        /* Now firstdigit always nonzero */
        firstdigit = ndigits;
        firstfwd = j;
        /* Need extra width for leading nonzero if required */
        if (((ndigits > 1) || (j != k)) && (prdigits == 0))
          lw++;
      }
      if (firstdigit || prdigits) {
        ++prdigits;
        if (prdigits > lw) {
          if ((!keepon) && ((sh3 + sh1 + icarr) < 5)) {
            /* Rounding to nearest, we're done */
            break;
          } else if ((!keepon) && ((sh3 + sh1 + icarr) > 5)) {
            digit += 10 - (sh3 + sh1 + icarr);
            break;
          } else if ((keepon) && ((sh3 + sh1 + icarr) > 0)) {
            if (savecarry) {
              digit = iextb[savek];
              digit += savecarry;
              k = savek;
              ndigits = 9;
            } else {
              roundcarry *= 10;
              digit += (roundcarry - (sh3 + sh1 + icarr));
            }
            keepon = 0;
            break;
          } else if (keepon) {
            keepon++;
            roundcarry *= 10;
          } else {
            roundcarry = 5;
            keepon++;
          }
        }
      }
      if (ndigits == 9) {
        if (digit || firstdigit || (iexp >= 0)) {
          iextb[k++] = digit;
        }
        j++;
        digit = 0;
        ndigits = 0;
        if ((keepon) && (!savecarry)) {
          savecarry = roundcarry;
          savek = k - 1;
        }
      }
    }
    if (keepon) {
      /*
       * Our Fortran i/o default is round compatible, so we don't
       * take lastdigiteven into account.
            if ((savecarry) && (lastdigiteven)) {
              digit = iextb[savek];
              digit += savecarry;
              k = savek;
              ndigits = 9;
            } else if ((roundcarry) && (lastdigiteven)) {
              digit += roundcarry;
            }
      */
      if (savecarry) {
        digit = iextb[savek];
        digit += savecarry;
        k = savek;
        ndigits = 9;
      } else if (roundcarry) {
        digit += roundcarry;
      }
    }
    if (digit) {
      for (; ndigits < 9; ndigits++) {
        digit = digit * 10;
      }
      iextb[k] = digit;
      if (digit == 1000000000) {
        digit = 0;
        iextb[k] = digit;
        kbk = k - 1;
        iextb[kbk]++;
        while (iextb[kbk] == 1000000000) {
          iextb[kbk] = 0;
          kbk = kbk - 1;
          iextb[kbk]++;
        }
        if (kmx < 0) {
          pose = 10 - firstdigit;
          if (pose >= 9)
            pose -= 9;
          digit = 1;
          for (i = 0; i < pose; i++) {
            digit = digit * 10;
          }
          if (iextb[kbk] == digit) {
            firstdigit--;
          }
        }
      }
    }
  }

  /* Find the first nonzero in the whole or fraction portion */
  k = iextb[0];
  digit = 1;
  /* These two loops can be combined... */
  for (i = 0; i < 9; i++) {
    if (k >= digit)
      pose = i;
    digit = digit * 10;
  }
  m = 1;
  /* A little space manipulation at the beginning */
  space = ldw & _LDW_SPACE;
  if (kmx >= 0) {
    iexp = kmx * 9 + pose;
    usee = (iexp >= w);
    if (!usee) {
      ffmt[0] = ' ';
      ffmt += space;
      m = 0;
    }
    if (sgn) {
      ffmt[0] = '-';
    } else {
      ffmt[0] = ' ';
    }
    ffmt += (space | sgn);
  } else {
    iexp = -(firstfwd * 9 + firstdigit);
    usee = (iexp < -1);
    if (sgn) {
      ffmt[0] = '-';
    } else {
      ffmt[0] = ' ';
    }
    ffmt += (space | sgn);
    if (!usee) {
      ffmt[0] = '0';
      ffmt[1] = '.';
      m = 2;
    }
  }

  if ((usee) && !(ldw & _LDW_HP))
    w++;
  kbk = 0;
  k = iextb[kbk++];
  for (i = pose; i >= 0; i--) {
    j = k / 10;
    k = k - (j * 10);
    if (i < w)
      ffmt[i + m] = '0' + k;
    k = j;
  }
  ii = pose + 1;
  if (kbk == kmx + 1) {
    ffmt[ii + m] = '.';
    if (ii < w)
      w++;
    if ((ii == w) && (!usee))
      w++;
    ii++;
  }
  while (ii < w) {
    k = iextb[kbk++];
    for (i = 8 + ii; i >= ii; i--) {
      j = k / 10;
      k = k - (j * 10);
      if (i < w)
        ffmt[i + m] = '0' + k;
      k = j;
    }
    ii += 9;
    if (kbk == kmx + 1) {
      ffmt[ii + m] = '.';
      if (ii < w)
        w++;
      if ((ii == w) && (!usee))
        w++;
      ii++;
    }
  }
  if (usee) {
    ffmt[0] = ffmt[1];
    ffmt[1] = '.';
    ffmt[w + 1] = 'E';
    if (iexp >= 0)
      ffmt[w + 2] = '+';
    else {
      ffmt[w + 2] = '-';
      iexp = -iexp;
    }
    if (ldw & _LDW_SP) {
      ffmt[w + 5] = '\0';
      j = iexp;
    } else if (ldw & _LDW_HP) {
      ffmt[w + 4] = '\0';
      k = iexp;
    } else {
      k = iexp;
      ffmt[w + 6] = '\0';
      j = k / 10;
      k = k - (j * 10);
      ffmt[w + 5] = '0' + k;
    }
    if (!(ldw & _LDW_HP)) {
      k = j / 10;
      j = j - (k * 10);
      ffmt[w + 4] = '0' + j;
      j = k / 10;
      k = k - (j * 10);
    }
    ffmt[w + 3] = '0' + k;
    return w + 6 + (space | sgn) - ((ldw & _LDW_SP) == _LDW_SP) -
                                 2*((ldw & _LDW_HP) == _LDW_HP);
  } else {
    if (iexp < 0) {
      ffmt[w + 2] = '\0';
      return w + 2 + (space | sgn);
    } else {
      ffmt[w] = '\0';
      return w + (space | sgn) + space;
    }
  }
}

/* print a double complex */
__ATTRIBUTES int
__pgf90io_sc_cd_ldw(double r, double i, int ftype, signed char *p)
{
  int j, k, len;
  j = p[0];
  if ((j + 49 >= 83) || p[84]) {
    p[j] = '\n';
    p[j + 1] = '\0';
    __pgi_print_string(&p[1]);
    for (k = 1; k < 83; k++)
      p[k] = ' ';
    j = 2;
    p[0] = j;
    p[83] = '\0';
    p[84] = 0;
  }
  if (j != 2)
    j++;
  p[j++] = '(';
  len = __pgf90io_sc_fp_ldw(r, p + j, 16, 0, 4);
  j += len;
  p[j++] = ',';
  len = __pgf90io_sc_fp_ldw(i, p + j, 16, 0, 4);
  j += len;
  p[j++] = ')';
  p[j++] = ' ';
  p[0] = j;
  return 0;
}

/* print a complex */
__ATTRIBUTES int
__pgf90io_sc_cf_ldw(float r, float i, int ftype, signed char *p)
{
  int j, k, len;
  j = p[0];
  if ((j + 29 >= 83) || p[84]) {
    p[j] = '\n';
    p[j + 1] = '\0';
    __pgi_print_string(&p[1]);
    for (k = 1; k < 83; k++)
      p[k] = ' ';
    j = 2;
    p[0] = j;
    p[83] = '\0';
    p[84] = 0;
  }
  if (j != 2)
    j++;
  p[j++] = '(';
  len = __pgf90io_sc_fp_ldw(r, p + j, 7, 0, 2);
  j += len;
  p[j++] = ',';
  len = __pgf90io_sc_fp_ldw(i, p + j, 7, 0, 2);
  j += len;
  p[j++] = ')';
  p[j++] = ' ';
  p[0] = j;
  return 0;
}

/* print a double */
__ATTRIBUTES int
__pgf90io_sc_d_ldw(double d, int ftype, signed char *p)
{
  int j, k, len;
  j = p[0];
  if (j > 0) {
    if ((j + 24 >= 83) || p[84]) {
      p[j] = '\n';
      p[j + 1] = '\0';
      __pgi_print_string(&p[1]);
      for (k = 1; k < 83; k++)
        p[k] = ' ';
      j = 2;
      p[0] = j;
      p[83] = '\0';
      p[84] = 0;
    }
    /*  if (j != 2) */
    j++;
    len = __pgf90io_sc_fp_ldw(d, p + j, 16, 0, 5);
    for (; len <= 24; len++)
      p[j + len] = ' ';
    p[0] = j + 25;
  } else {
    k = 1 - p[(-j)-1];
    len = __pgf90io_sc_fp_ldw(d, p + k, 16, 0, 5);
    for (; len <= 24; len++) {
      if ((k + len) < (-j))
        p[k + len] = ' ';
      else
	break;
    }
    if ((k + len) < (-j)) p[(-j)-1] = -(k + 25);
  }
  return 0;
}

/* print a float */
__ATTRIBUTES int
__pgf90io_sc_f_ldw(float f, int ftype, signed char *p)
{
  int j, k, len;
  j = p[0];
  if ((j + 14 >= 83) || p[84]) {
    p[j] = '\n';
    p[j + 1] = '\0';
    __pgi_print_string(&p[1]);
    for (k = 1; k < 83; k++)
      p[k] = ' ';
    j = 2;
    p[0] = j;
    p[83] = '\0';
    p[84] = 0;
  }
  /*  if (j != 2) */
    j++;
  len = __pgf90io_sc_fp_ldw(f, p + j, 7, 0, 3);
  for (; len <= 14; len++)
    p[j + len] = ' ';
  p[0] = j + 15;
  return 0;
}

/* print a integer*8 or logical*8 */
__ATTRIBUTES int
__pgf90io_sc_l_ldw(long long i, int itype, signed char *p)
{
  char t[28];
  int j, k, idx, ineg;
  long long lj, lk;
  j = p[0];
  k = (itype == 20) ? 3 : 25;
  if ((j + k >= 83) || p[84]) {
    p[j] = '\n';
    p[j + 1] = '\0';
    __pgi_print_string(&p[1]);
    for (k = 1; k < 83; k++)
      p[k] = ' ';
    j = 2;
    p[0] = j;
    p[83] = '\0';
    p[84] = 0;
  }
  if (itype == 20) {
    p[j++] = ' ';
    p[j++] = ' ';
    if (i)
      p[j++] = 'T';
    else
      p[j++] = 'F';
  } else if (i == 0x8000000000000000) {
    p[j++] = ' ';
    p[j++] = ' ';
    p[j++] = ' ';
    p[j++] = ' ';
    p[j++] = '-';
    p[j++] = '9';
    p[j++] = '2';
    p[j++] = '2';
    p[j++] = '3';
    p[j++] = '3';
    p[j++] = '7';
    p[j++] = '2';
    p[j++] = '0';
    p[j++] = '3';
    p[j++] = '6';
    p[j++] = '8';
    p[j++] = '5';
    p[j++] = '4';
    p[j++] = '7';
    p[j++] = '7';
    p[j++] = '5';
    p[j++] = '8';
    p[j++] = '0';
    p[j++] = '8';
    p[j++] = ' ';
  } else {
    if (i < 0) {
      i = -i;
      ineg = 1;
    } else {
      ineg = 0;
    }
    idx = 27;
    t[idx--] = ' ';
    if (i == 0)
      t[idx--] = '0';
    else {
      while (i > 0) {
        lj = i / 10;
        lk = i - (lj * 10);
        t[idx--] = '0' + lk;
        i = lj;
      }
    }
    if (ineg) {
      t[idx--] = '-';
    }
    while (idx > 0) {
      t[idx--] = ' ';
    }
    j = p[0];
    /* This spacing seems to match list directed host code */
    for (i = 3; i <= 27; i++) {
      p[j++] = t[i];
    }
  }
  p[0] = j;
  return 0;
}

/* print an integer*4 or logical*4 */
__ATTRIBUTES int
__pgf90io_sc_i_ldw(int i, int itype, signed char *p)
{
  char t[16];
  int j, k, idx, ineg;
  j = p[0];
  k = (itype == 19) ? 3 : 14;
  if ((j + k >= 83) || p[84]) {
    p[j] = '\n';
    p[j + 1] = '\0';
    __pgi_print_string(&p[1]);
    for (k = 1; k < 83; k++)
      p[k] = ' ';
    j = 2;
    p[0] = j;
    p[83] = '\0';
    p[84] = 0;
  }
  /* If type is BLOG, SLOG, or LOG logical type, then print T or F instead */
  if (itype == 17 || itype == 18 || itype == 19) {
    p[j++] = ' ';
    if (i)
      p[j++] = 'T';
    else
      p[j++] = 'F';
    p[j++] = ' ';
  } else if (i == 0x80000000) {
    p[j++] = ' ';
    p[j++] = '-';
    p[j++] = '2';
    p[j++] = '1';
    p[j++] = '4';
    p[j++] = '7';
    p[j++] = '4';
    p[j++] = '8';
    p[j++] = '3';
    p[j++] = '6';
    p[j++] = '4';
    p[j++] = '8';
    p[j++] = ' ';
  } else {
    if (i < 0) {
      i = -i;
      ineg = 1;
    } else {
      ineg = 0;
    }
    idx = 15;
    t[idx--] = ' ';
    if (i == 0)
      t[idx--] = '0';
    else {
      while (i > 0) {
        j = i / 10;
        k = i - (j * 10);
        t[idx--] = '0' + k;
        i = j;
      }
    }
    if (ineg) {
      t[idx--] = '-';
    }
    while (idx > 0) {
      t[idx--] = ' ';
    }
    j = p[0];
    /* This spacing seems to match list directed host code */
    for (i = 3; i <= 15; i++) {
      p[j++] = t[i];
    }
  }
  p[0] = j;
  return 0;
}

#define __INT1 32
#define __INT2 24
#define __BLOG 17
#define __SLOG 18

/* print an integer*1, integer*2, logical*1, or logical*2 */
__ATTRIBUTES int
__pgf90io_sc_ldw(int i, int itype, signed char *p)
{
  char t[16];
  int j, k, idx, ineg;
  j = p[0];
  if ((itype == __BLOG) || (itype == __SLOG))
    k = 3;
  else if (itype == __INT1)
    k = 6;
  else if (itype == __INT2)
    k = 8;
  else
    return 0;

  if ((j + k >= 83) || p[84]) {
    p[j] = '\n';
    p[j + 1] = '\0';
    __pgi_print_string(&p[1]);
    for (k = 1; k < 83; k++)
      p[k] = ' ';
    j = 2;
    p[0] = j;
    p[83] = '\0';
    p[84] = 0;
  }
  /* If type is BLOG or  SLOG, then print T or F */
  if ((itype == __BLOG) || (itype == __SLOG)) {
    p[j++] = ' ';
    if (i)
      p[j++] = 'T';
    else
      p[j++] = 'F';
    p[j++] = ' ';
  } else {
    if (i < 0) {
      i = -i;
      ineg = 1;
    } else {
      ineg = 0;
    }
    if (itype == __INT1) idx = 8;
    if (itype == __INT2) idx = 10;
    t[idx--] = ' ';
    if (i == 0)
      t[idx--] = '0';
    else {
      while (i > 0) {
        j = i / 10;
        k = i - (j * 10);
        t[idx--] = '0' + k;
        i = j;
      }
    }
    if (ineg) {
      t[idx--] = '-';
    }
    while (idx > 0) {
      t[idx--] = ' ';
    }
    j = p[0];
    if (itype == __INT1) idx = 8;
    if (itype == __INT2) idx = 10;
    /* This spacing seems to match list directed host code */
    for (i = 3; i <= idx; i++) {
      p[j++] = t[i];
    }
  }
  p[0] = j;
  return 0;
}

__ATTRIBUTES int
__pgf90io_sc_hf_fmt_write(unsigned short x, int itype, signed char *p)
{
  return 0;
}

__ATTRIBUTES int
__pgf90io_sc_f_fmt_write(float x, int itype, signed char *p)
{
  return 0;
}

__ATTRIBUTES int
__pgf90io_sc_i_fmt_write(int i, int itype, signed char *p)
{
  char t[16];
  int j, k, idx, ineg;
  j = p[0];
  k = (itype == 19) ? 3 : 14;
  if ((j + k >= 83) || p[84]) {
    p[j] = '\n';
    p[j + 1] = '\0';
    __pgi_print_string(&p[1]);
    for (k = 1; k < 83; k++)
      p[k] = ' ';
    j = 2;
    p[0] = j;
    p[83] = '\0';
    p[84] = 0;
  }
  if (itype == 19) {
    p[j++] = ' ';
    if (i)
      p[j++] = 'T';
    else
      p[j++] = 'F';
    p[j++] = ' ';
  } else {
    if (i < 0) {
      i = -i;
      ineg = 1;
    } else {
      ineg = 0;
    }
    idx = 15;
    t[idx--] = ' ';
    if (i == 0)
      t[idx--] = '0';
    else {
      while (i > 0) {
        j = i / 10;
        k = i - (j * 10);
        t[idx--] = '0' + k;
        i = j;
      }
    }
    if (ineg) {
      t[idx--] = '-';
    }
    while (idx > 0) {
      t[idx--] = ' ';
    }
    j = p[0];
    /* This spacing seems to match list directed host code */
    for (i = 3; i <= 15; i++) {
      p[j++] = t[i];
    }
  }
  p[0] = j;
  return 0;
}

/* print a character string into an internal buffer */
__ATTRIBUTES int
__pgf90io_sc_intern_ch_ldw(const void *s, int ftype, size_t len, signed char *p)
{
  int i,j,k;
  char *q = (char *)s;
  j = p[0];
  k = 1 - p[(-j)-1];
  if (k <= 0) return 0;
  for (i = 0; i < len; i++) {
    p[k++] = q[i];
    if (k == (-j)) break;
  }
  /* Store location, unless at the end of char string */
  if (k != (-j)) p[(-j)-1] = -k;
  return 0;
}

/* print a character string */
__ATTRIBUTES int
__pgf90io_sc_ch_ldw(const void *s, int ftype, size_t len, signed char *p)
{
  int i, j;
  char *q = (char *)s;
  j = p[0];
  if (j < 0) return __pgf90io_sc_intern_ch_ldw(s, ftype, len, p);
  if (((j > 2) && (j + len >= 83)) || p[84]) {
    /* print the previous record */
    p[j] = '\n';
    p[j + 1] = '\0';
    __pgi_print_string(&p[1]);
    j = 2;
    p[0] = j;
    for (i = 1; i < 83; i++)
      p[i] = ' ';
    p[83] = '\0';
    p[84] = 0;
  }
  /* print and/or store this record */
  p[1] = ' ';
  for (i = 0; i < len; i++) {
    p[j++] = q[i];
    /* Try to support % character, from string like threadIdx%x */
    /* Need to convert single % to %%, acceptable by printf */
    /* This can possibly throw off Fortran spacing, but better than crashing */
    /* If we see %%, assume user has figured out a workaround, so preserve that */
    if (q[i] == '%') {
      if (len == 1) {
        p[j++] = q[i];
      } else if (i == 0) {
          if (q[1] != '%') p[j++] = q[i];
      } else if (i == (len-1)) {
          if (q[i-1] != '%') p[j++] = q[i];
      } else if ((q[i-1] != '%') && (q[i+1] != '%')) {
        p[j++] = q[i];
      }
    }
    if (j >= 83) {
      p[j] = '\0';  /* No carriage return! */
      __pgi_print_string(&p[1]);
      for (j = 1; j < 83; j++)
        p[j] = ' ';
      j = 2;
      p[0] = j;
      p[83] = '\0';
      p[84]++;
    }
  }
  if ((j > 2) && (p[84] == 0))
    j++;
  p[0] = j;
  return 0;
}

/* end of line */
__ATTRIBUTES int
__pgf90io_ldw_end(signed char *p)
{
  signed char c;
  c = p[0];
  if (c > 0) {
    if (p[c - 1] == ' ')
      c--;
    p[c] = '\n';
    p[c + 1] = '\0';
    __pgi_print_string(&p[1]);
    free(p);
  } else {
    c = (-c)-1;
    if (p[c] < 0) p[c] = ' ';
    p[0] = ' ';
  }
  return 0;
}

__ATTRIBUTES int
__pgf90io_fmtw_end(signed char *p)
{
  char c;
  c = p[0];
  if (c > 0 && p[c - 1] == ' ')
    c--;
  p[c] = '\n';
  p[c + 1] = '\0';
  __pgi_print_string(&p[1]);
  free(p);
  return 0;
}

__ATTRIBUTES int
__pgi_launch_device(signed char *func, signed char *parameterBuffer,
                    signed char *gridDimension, signed char *blockDimension,
                    size_t sharedMemSize, size_t stream)
{
  extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDevice(
      void *, void *, dim3, dim3, unsigned int, cudaStream_t);
  return cudaLaunchDevice((void *)func, *(void **)parameterBuffer,
                          *(dim3 *)gridDimension, *(dim3 *)blockDimension,
                          (unsigned int)sharedMemSize, (cudaStream_t)stream);
}

__ATTRIBUTES void
__pgi_get_parameter_buffer(signed char *bp, size_t align, size_t size)
{
  extern __device__ __cudart_builtin__ void *CUDARTAPI cudaGetParameterBuffer(
      size_t, size_t);
  *(void **)bp = cudaGetParameterBuffer(align, size);
}

__ATTRIBUTES void
__pgi_setup_argument(signed char *bp, signed char *arg, size_t ignored,
                     size_t offset)
{
  signed char *buffer = *(signed char **)bp;
  *(signed char **)(buffer + offset) = arg;
}

__ATTRIBUTES void
__pgi_setup_arg_by_val(signed char *bp, signed char *arg, size_t size,
                       size_t offset)
{
  signed char *buffer = *(signed char **)bp;
  for (size_t i = 0; i < size; i++)
    buffer[offset + i] = arg[i];
}

__ATTRIBUTES signed char *
pgf90_auto_alloc04(signed char *nelem, signed char *len)
{
  signed char *p = (signed char *)malloc(*(size_t *)nelem * *(int *)len);
  if (p == 0)
    __pgi_simple_error_msg(ERR_MSG_AUTO_ALLOC);
  return p;
}

__ATTRIBUTES signed char *
pgf90_auto_alloc04_i8(signed char *nelem, signed char *len)
{
  signed char *p = (signed char *)malloc(*(size_t *)nelem * *(size_t *)len);
  if (p == 0)
    __pgi_simple_error_msg(ERR_MSG_AUTO_ALLOC);
  return p;
}

__ATTRIBUTES void
pgf90_alloc04a(signed char *nelem, signed char *kind, signed char *len,
               signed char *stat, signed char *pointer, signed char *offset,
               signed char *firsttime, signed char *align, signed char *errmsg,
               size_t errlen)
{
  void *area;

  if (stat && *(int *)firsttime)
    *(int *)stat = 0;
  area = malloc(*(size_t *)nelem * *(int *)len);
  if (area == (void *)0) {
    if (stat)
      *(int *)stat = 99;
  } else
    *((void **)pointer) = area;
}

__ATTRIBUTES void
pgf90_alloc04(signed char *nelem, signed char *kind, signed char *len,
              signed char *stat, signed char *pointer, signed char *offset,
              signed char *firsttime, signed char *align, signed char *errmsg,
              size_t errlen)
{
  pgf90_alloc04a(nelem, kind, len, stat, pointer, offset, firsttime, align,
                 errmsg, errlen);
}

__ATTRIBUTES void
pgf90_ptr_alloc04a(signed char *nelem, signed char *kind, signed char *len,
              signed char *stat, signed char *pointer, signed char *offset,
              signed char *firsttime, signed char *align, signed char *errmsg,
              size_t errlen)
{
  /* on device ptr_alloc04 is equivalent to alloc04 since there is no check for allocated status */
  pgf90_alloc04a(nelem, kind, len, stat, pointer, offset, firsttime, align,
                 errmsg, errlen);
}

__ATTRIBUTES void
pgf90_alloc04a_i8(signed char *nelem, signed char *kind, signed char *len,
                  signed char *stat, signed char *pointer, signed char *offset,
                  signed char *firsttime, signed char *align,
                  signed char *errmsg, size_t errlen)
{
  void *area;

  if (stat && *(int *)firsttime)
    *(int *)stat = 0;
  area = malloc(*(size_t *)nelem * *(int *)len);
  if (area == (void *)0) {
    if (stat)
      *(int *)stat = 99;
  } else
    *((void **)pointer) = area;
}

__ATTRIBUTES void
pgf90_alloc04_i8(signed char *nelem, signed char *kind, signed char *len,
                 signed char *stat, signed char *pointer, signed char *offset,
                 signed char *firsttime, signed char *align,
                 signed char *errmsg, size_t errlen)
{
  pgf90_alloc04a_i8(nelem, kind, len, stat, pointer, offset, firsttime, align,
                    errmsg, errlen);
}

__ATTRIBUTES void
pgf90_ptr_alloc04a_i8(signed char *nelem, signed char *kind, signed char *len,
                 signed char *stat, signed char *pointer, signed char *offset,
                 signed char *firsttime, signed char *align,
                 signed char *errmsg, size_t errlen)
{
  /* on device ptr_alloc04a is equivalent to alloc04a since there is no check for allocated status */
  pgf90_alloc04a_i8(nelem, kind, len, stat, pointer, offset, firsttime, align,
                    errmsg, errlen);
}

__ATTRIBUTES void
pgf90_alloc04_chka(signed char *nelem, signed char *kind, signed char *len,
                   signed char *stat, signed char *pointer, signed char *offset,
                   signed char *firsttime, signed char *align,
                   signed char *errmsg, size_t errlen)
{
  void *area;

  if (*pointer) {
    ; /* do something here */
  } else {

    if (stat && *(int *)firsttime)
      *(int *)stat = 0;

    area = malloc(*(size_t *)nelem * *(int *)len);
    if (area == (void *)0) {
      if (stat)
        *(int *)stat = 99;
    } else
      *((void **)pointer) = area;
  }
}

__ATTRIBUTES void
pgf90_alloc04_chk(signed char *nelem, signed char *kind, signed char *len,
                     signed char *stat, signed char *pointer,
                     signed char *offset, signed char *firsttime,
                     signed char *align, signed char *errmsg, size_t errlen)
{
  pgf90_alloc04_chka(nelem, kind, len, stat, pointer, offset, firsttime, align,
                     errmsg, errlen);
}

__ATTRIBUTES void
pgf90_alloc04_chka_i8(signed char *nelem, signed char *kind, signed char *len,
                      signed char *stat, signed char *pointer,
                      signed char *offset, signed char *firsttime,
                      signed char *align, signed char *errmsg, size_t errlen)
{
  void *area;

  if (*(long long**)pointer) {
    ; /* do something here */
  } else {

    if (stat && *(long long *)firsttime)
      *(long long *)stat = 0;

    area = malloc((*(long long*)nelem) * (*(long long*)len));
    if (area == (void *)0) {
      if (stat)
        *(long long *)stat = 99;
    } else
      *((long long **)pointer) = (long long*)area;
  }
}

__ATTRIBUTES void
pgf90_alloc04_chk_i8(signed char *nelem, signed char *kind, signed char *len,
                     signed char *stat, signed char *pointer,
                     signed char *offset, signed char *firsttime,
                     signed char *align, signed char *errmsg, size_t errlen)
{
  pgf90_alloc04_chka_i8(nelem, kind, len, stat, pointer, offset, firsttime,
                        align, errmsg, errlen);
}

__ATTRIBUTES void
pgf90_dev_mod_alloc04(signed char *nelem, signed char *kind, signed char *len,
                      signed char *stat, signed char *pointer,
                      signed char *handle, signed char *count,
                      signed char *offset, signed char *firsttime,
                      signed char *align, signed char *errmsg, size_t errlen)
{
  void *area;

  if (stat && *(int *)firsttime)
    *(int *)stat = 0;
  area = malloc(*(size_t *)nelem * *(int *)len);
  if (area == (void *)0) {
    if (stat)
      *(int *)stat = 99;
  } else
    *((void **)pointer) = area;
}

__ATTRIBUTES void
pgf90_dev_mod_alloc04_i8(signed char *nelem, signed char *kind,
                         signed char *len, signed char *stat,
                         signed char *pointer, signed char *handle,
                         signed char *count, signed char *offset,
                         signed char *firsttime, signed char *align,
                         signed char *errmsg, size_t errlen)
{
  void *area;

  if (stat && *(int *)firsttime)
    *(int *)stat = 0;
  area = malloc(*(size_t *)nelem * *(int *)len);
  if (area == (void *)0) {
    if (stat)
      *(int *)stat = 99;
  } else
    *((void **)pointer) = area;
}

__ATTRIBUTES void
pgf90_auto_dealloc(signed char *pointer)
{
  free(pointer);
}

__ATTRIBUTES void
pgf90_auto_dealloc_i8(signed char *pointer)
{
  free(pointer);
}

__ATTRIBUTES void
pgf90_dev_mod_dealloc03(signed char *stat, signed char *pointer,
                        signed char *handle, signed char *area,
                        signed char *firsttime, signed char *errmsg,
                        size_t errlen)
{
  free(*(void **)pointer);
  if (stat && *(int *)firsttime)
    *(int *)stat = 0;
}

__ATTRIBUTES int
__pgi_device_get_cache_config(signed char *pcc)
{
  /*
  __device__ extern cudaError_t  CUDARTAPI cudaDeviceGetCacheConfig(enum
  cudaFuncCache *);
  */
  return cudaDeviceGetCacheConfig((enum cudaFuncCache *)pcc);
}

__ATTRIBUTES int
__pgi_device_get_limit(signed char *pval, int limit)
{
  /*
  __device__ extern cudaError_t  CUDARTAPI cudaDeviceGetLimit(size_t *, enum
  cudaLimit);
  */
  return cudaDeviceGetLimit((size_t *)pval, (enum cudaLimit)limit);
}

__ATTRIBUTES int
__pgi_get_last_error()
{
  /*
  __device__ extern cudaError_t  CUDARTAPI cudaGetLastError();
  */
  return cudaGetLastError();
}

__ATTRIBUTES int
__pgi_peek_at_last_error()
{
  /*
  __device__ extern cudaError_t  CUDARTAPI cudaPeekAtLastError();
  */
  return cudaPeekAtLastError();
}

__ATTRIBUTES void
__pgi_get_error_string(signed char *string, int len, int error)
{
  /*
  __device__ extern const char* CUDARTAPI cudaGetErrorString(cudaError_t);
  */
  const char *src;
  src = cudaGetErrorString((cudaError_t)error);
  while (len-- > 0) {
    if (*src == 0)
      *string++ = ' ';
    else
      *string++ = *src++;
  }
}

__ATTRIBUTES void
pgf90_str_free(signed char *ptr)
{
  /* this would free the string variable, but here we ignore it */
  return;
}

__ATTRIBUTES void
pgf90_str_cpy1(signed char *dst, size_t dlen, const signed char *src,
               size_t slen)
{
  while (dlen-- > 0) {
    if (slen > 0) {
      *dst++ = *src++;
      slen--;
    } else
      *dst++ = ' ';
  }
}

/* Fix this once we have support for varargs */
__ATTRIBUTES void
pgf90_str_copy(int n, signed char *dst, size_t dlen, const signed char *src,
               size_t slen)
{
  if (n == 1) {
    while (dlen-- > 0) {
      if (slen > 0) {
        *dst++ = *src++;
        slen--;
      } else
        *dst++ = ' ';
    }
  }
}

#if !defined(PGI_COMPILE_BITCODE)
__ATTRIBUTES void
pgf90_str_copy(int n, signed char *dst, size_t dlen, const signed char *src1,
               size_t slen1, const signed char *src2, size_t slen2)
{
  if (n == 2) {
    while (dlen-- > 0) {
      if (slen1 > 0) {
        *dst++ = *src1++;
        slen1--;
      } else if (slen2 > 0) {
        *dst++ = *src2++;
        slen2--;
      } else
        *dst++ = ' ';
    }
  }
}

__ATTRIBUTES void
pgf90_str_copy(int n, signed char *dst, size_t dlen, const signed char *src1,
               size_t slen1, const signed char *src2, size_t slen2,
               const signed char *src3, size_t slen3)
{
  if (n == 3) {
    while (dlen-- > 0) {
      if (slen1 > 0) {
        *dst++ = *src1++;
        slen1--;
      } else if (slen2 > 0) {
        *dst++ = *src2++;
        slen2--;
      } else if (slen3 > 0) {
        *dst++ = *src3++;
        slen3--;
      } else
        *dst++ = ' ';
    }
  }
}

__ATTRIBUTES void
pgf90_str_copy(int n, signed char *dst, size_t dlen, const signed char *src1,
               size_t slen1, const signed char *src2, size_t slen2,
               const signed char *src3, size_t slen3, const signed char *src4,
               size_t slen4)
{
  if (n == 4) {
    while (dlen-- > 0) {
      if (slen1 > 0) {
        *dst++ = *src1++;
        slen1--;
      } else if (slen2 > 0) {
        *dst++ = *src2++;
        slen2--;
      } else if (slen3 > 0) {
        *dst++ = *src3++;
        slen3--;
      } else if (slen4 > 0) {
        *dst++ = *src4++;
        slen4--;
      } else
        *dst++ = ' ';
    }
  }
}
#endif /* !defined(PGI_COMPILE_BITCODE) */

__ATTRIBUTES int
pgf90_strcmp(signed char *a1, signed char *a2, size_t d1_len, size_t d2_len)
{
  int ret_val, idx1, a1_len, a2_len;
  a1_len = d1_len;
  a2_len = d2_len;
  if (a1_len < 0)
    a1_len = 0;
  if (a2_len < 0)
    a2_len = 0;
  if (a1_len == a2_len) {
    while (a1_len > 0) {
      if (*a1 != *a2) {
        if ((unsigned)(*a1) > (unsigned)(*a2))
          return 1;
        return -1;
      }
      ++a1;
      ++a2;
      a1_len--;
    }
    return 0;
  }
  if (a1_len > a2_len) {
    /* first compare the first a2_len characters of the strings */
    ret_val = __pgi_memcmp(a1, a2, a2_len);
    if (ret_val != 0) {
      if (ret_val < 0)
        return (-1);
      if (ret_val > 0)
        return (1);
    }
    /*
     * if the last (a1_len - a2_len) characters of a1 are blank, then the
     * strings are equal; otherwise, compare the first non-blank char. to
     * blank
     */

    for (idx1 = 0; idx1 < (a1_len - a2_len); idx1++) {
      if (a1[a2_len + idx1] != ' ') {
        if (a1[a2_len + idx1] > ' ')
          return (1);
        return (-1);
      }
    }
    return (0);
  } else {
    /* a2_len > a1_len */
    /* first compare the first a1_len characters of the strings */
    ret_val = __pgi_memcmp(a1, a2, a1_len);
    if (ret_val != 0) {
      if (ret_val < 0)
        return (-1);
      if (ret_val > 0)
        return (1);
    }
    /*
     * if the last (a2_len - a1_len) characters of a2 are blank, then the
     * strings are equal; otherwise, compare the first non-blank char. to
     * blank
     */

    for (idx1 = 0; idx1 < (a2_len - a1_len); idx1++) {
      if (a2[a1_len + idx1] != ' ') {
        if (a2[a1_len + idx1] > ' ')
          return (-1);
        return (1);
      }
    }
    return (0);
  }
}

__ATTRIBUTES int
pgf90_allocated(signed char *p, ERR_PROTYPE)
{
  if (p == 0 && __abort)
    __pgi_error_msg(ERR_ARGS_MSG, ERR_MSG_ALLOCATED);
  return (p != 0);
}

__ATTRIBUTES int
pgf90_allocated_i8(signed char *p, ERR_PROTYPE)
{
  if (p == 0 && __abort)
    __pgi_error_msg(ERR_ARGS_MSG, ERR_MSG_ALLOCATED);
  return (p != 0);
}

__ATTRIBUTES int
pgf90_allocated2(signed char *p, ERR_PROTYPE)
{
  if (p == 0 && __abort)
    __pgi_error_msg(ERR_ARGS_MSG, ERR_MSG_ALLOCATED);
  return (p != 0);
}

__ATTRIBUTES int
pgf90_allocated2_i8(signed char *p, ERR_PROTYPE)
{
  if (p == 0 && __abort)
    __pgi_error_msg(ERR_ARGS_MSG, ERR_MSG_ALLOCATED);
  return (p != 0);
}

__ATTRIBUTES int
pgf90_allocated_lhs(signed char *p, ERR_PROTYPE)
{
  if (p == 0 && __abort)
    __pgi_error_msg(ERR_ARGS_MSG, ERR_MSG_ALLOCATED_LHS);
  return (p != 0);
}

__ATTRIBUTES int
pgf90_allocated_lhs_i8(signed char *p, ERR_PROTYPE)
{
  if (p == 0 && __abort)
    __pgi_error_msg(ERR_ARGS_MSG, ERR_MSG_ALLOCATED_LHS);
  return (p != 0);
}

__ATTRIBUTES int
pgf90_allocated_lhs2(signed char *p, ERR_PROTYPE)
{
  if (p == 0 && __abort)
    __pgi_error_msg(ERR_ARGS_MSG, ERR_MSG_ALLOCATED_LHS);
  return (p != 0);
}

__ATTRIBUTES int
pgf90_allocated_lhs2_i8(signed char *p, ERR_PROTYPE)
{
  if (p == 0 && __abort)
    __pgi_error_msg(ERR_ARGS_MSG, ERR_MSG_ALLOCATED_LHS);
  return (p != 0);
}

__ATTRIBUTES void
pgf90_dealloc03a(signed char *stat, signed char *area, signed char *firsttime,
                 signed char *errmsg, size_t lenmsg)
{
  free((void *)area);
}

__ATTRIBUTES void
pgf90_dealloc03(signed char *stat, signed char *area, signed char *firsttime,
                signed char *errmsg, size_t lenmsg)
{
  pgf90_dealloc03a(stat, area, firsttime, errmsg, lenmsg);
}

__ATTRIBUTES void
pgf90_dealloc03a_i8(signed char *stat, signed char *area,
                    signed char *firsttime, signed char *errmsg, size_t lenmsg)
{
  free((void *)area);
}

__ATTRIBUTES void
pgf90_dealloc03_i8(signed char *stat, signed char *area, signed char *firsttime,
                   signed char *errmsg, size_t lenmsg)
{
  pgf90_dealloc03a_i8(stat, area, firsttime, errmsg, lenmsg);
}

__ATTRIBUTES void
pgf90_dealloc_mbr03a(signed char *stat, signed char *area,
                     signed char *firsttime, size_t dummy)
{
  pgf90_dealloc03a(stat, area, firsttime, 0, 0);
}

__ATTRIBUTES void
pgf90_dealloc_mbr03a_i8(signed char *stat, signed char *area,
                        signed char *firsttime, size_t dummy)
{
  pgf90_dealloc03a(stat, area, firsttime, 0, 0);
}

__ATTRIBUTES int
__pgi_get_device_count(signed char *count)
{
  /*
  __device__ extern cudaError_t CUDARTAPI cudaGetDeviceCount(int *);
  */
  return cudaGetDeviceCount((int *)count);
}

__ATTRIBUTES int
__pgi_get_device_properties(signed char *props, int device)
{
  /*
  __device__ extern cudaError_t  CUDARTAPI cudaGetDeviceProperties(struct
  cudaDeviceProp *, int device);
  */
  int i;
  cudaError_t stat;
  struct cudaDeviceProp *p = (struct cudaDeviceProp *)props;
  stat = cudaGetDeviceProperties(p, device);
  for (i = 0; i < 256; i++)
    if (p->name[i] == '\0')
      break;
  while (i < 256)
    p->name[i++] = ' ';
  return stat;
}

__ATTRIBUTES int
__pgi_get_device(signed char *device)
{
  /*
  __device__ extern cudaError_t  CUDARTAPI cudaGetDevice(int *);
  */
  return cudaGetDevice((int *)device);
}

__ATTRIBUTES int
__pgi_stream_create_with_flags(signed char *stream, unsigned int flags)
{
  /*
  __device__ extern cudaError_t  CUDARTAPI
  cudaStreamCreateWithFlags(cudaStream_t *, unsigned int);
  */
  return cudaStreamCreateWithFlags((cudaStream_t *)stream, flags);
}

__ATTRIBUTES int
__pgi_stream_destroy(long stream)
{
  /*
  __device__ extern cudaError_t  CUDARTAPI cudaStreamDestroy(cudaStream_t);
  */
  return cudaStreamDestroy((cudaStream_t)stream);
}

__ATTRIBUTES int
__pgi_stream_wait_event(long stream, long event, unsigned int flags)
{
  /*
  __device__ extern cudaError_t  CUDARTAPI cudaStreamWaitEvent(cudaStream_t,
  cudaEvent_t, unsigned int);
  */
  return cudaStreamWaitEvent((cudaStream_t)stream, (cudaEvent_t)event, flags);
}

__ATTRIBUTES int
__pgi_event_create_with_flags(signed char *event, unsigned int flags)
{
  /*
  __device__ extern cudaError_t  CUDARTAPI cudaEventCreateWithFlags(cudaEvent_t
  *, unsigned int);
  */
  return cudaEventCreateWithFlags((cudaEvent_t *)event, flags);
}

__ATTRIBUTES int
__pgi_event_record(long event, long stream)
{
  /*
  __device__ extern cudaError_t  CUDARTAPI cudaEventRecord(cudaEvent_t,
  cudaStream_t);
  */
  return cudaEventRecord((cudaEvent_t)event, (cudaStream_t)stream);
}

__ATTRIBUTES int
__pgi_event_destroy(long event)
{
  /*
  __device__ extern cudaError_t  CUDARTAPI cudaEventDestroy(cudaEvent_t);
  */
  return cudaEventDestroy((cudaEvent_t)event);
}

__ATTRIBUTES int
__pgi_func_get_attributes(signed char *attr, signed char *func)
{
  /*
  __device__ extern cudaError_t  CUDARTAPI cudaFuncGetAttributes(struct
  cudaFuncAttributes *, const void *);
  */
  return cudaFuncGetAttributes((struct cudaFuncAttributes *)attr,
                               (const void *)func);
}

__ATTRIBUTES int
__pgi_runtime_get_version(signed char *ver)
{
  /*
  __device__ extern cudaError_t  CUDARTAPI cudaRuntimeGetVersion(int *);
  */
  return cudaRuntimeGetVersion((int *)ver);
}

/* cudaMemcpyAsync wrappers */
/*
__device__ extern cudaError_t  CUDARTAPI cudaMemcpyAsync(void *, const void *,
size_t, enum cudaMemcpyKind, cudaStream_t);
*/

__ATTRIBUTES int
__pgi_memcpy_asyncc1(signed char *dst, signed char *src, long count, int kind,
                     long stream)
{
  return cudaMemcpyAsync(dst, src, count, (enum cudaMemcpyKind)kind,
                         (cudaStream_t)stream);
}

__ATTRIBUTES int
__pgi_memcpy_asynci1(signed char *dst, signed char *src, long count, int kind,
                     long stream)
{
  return cudaMemcpyAsync(dst, src, count, (enum cudaMemcpyKind)kind,
                         (cudaStream_t)stream);
}

__ATTRIBUTES int
__pgi_memcpy_asynci2(signed char *dst, signed char *src, long count, int kind,
                     long stream)
{
  return cudaMemcpyAsync(dst, src, count * 2, (enum cudaMemcpyKind)kind,
                         (cudaStream_t)stream);
}

__ATTRIBUTES int
__pgi_memcpy_asynci4(signed char *dst, signed char *src, long count, int kind,
                     long stream)
{
  return cudaMemcpyAsync(dst, src, count * 4, (enum cudaMemcpyKind)kind,
                         (cudaStream_t)stream);
}

__ATTRIBUTES int
__pgi_memcpy_asynci8(signed char *dst, signed char *src, long count, int kind,
                     long stream)
{
  return cudaMemcpyAsync(dst, src, count * 8, (enum cudaMemcpyKind)kind,
                         (cudaStream_t)stream);
}

__ATTRIBUTES int
__pgi_memcpy_asyncl1(signed char *dst, signed char *src, long count, int kind,
                     long stream)
{
  return cudaMemcpyAsync(dst, src, count, (enum cudaMemcpyKind)kind,
                         (cudaStream_t)stream);
}

__ATTRIBUTES int
__pgi_memcpy_asyncl2(signed char *dst, signed char *src, long count, int kind,
                     long stream)
{
  return cudaMemcpyAsync(dst, src, count * 2, (enum cudaMemcpyKind)kind,
                         (cudaStream_t)stream);
}

__ATTRIBUTES int
__pgi_memcpy_asyncl4(signed char *dst, signed char *src, long count, int kind,
                     long stream)
{
  return cudaMemcpyAsync(dst, src, count * 4, (enum cudaMemcpyKind)kind,
                         (cudaStream_t)stream);
}

__ATTRIBUTES int
__pgi_memcpy_asyncl8(signed char *dst, signed char *src, long count, int kind,
                     long stream)
{
  return cudaMemcpyAsync(dst, src, count * 8, (enum cudaMemcpyKind)kind,
                         (cudaStream_t)stream);
}

__ATTRIBUTES int
__pgi_memcpy_asyncr4(signed char *dst, signed char *src, long count, int kind,
                     long stream)
{
  return cudaMemcpyAsync(dst, src, count * 4, (enum cudaMemcpyKind)kind,
                         (cudaStream_t)stream);
}

__ATTRIBUTES int
__pgi_memcpy_asyncr8(signed char *dst, signed char *src, long count, int kind,
                     long stream)
{
  return cudaMemcpyAsync(dst, src, count * 8, (enum cudaMemcpyKind)kind,
                         (cudaStream_t)stream);
}

__ATTRIBUTES int
__pgi_memcpy_asyncc8(signed char *dst, signed char *src, long count, int kind,
                     long stream)
{
  return cudaMemcpyAsync(dst, src, count * 8, (enum cudaMemcpyKind)kind,
                         (cudaStream_t)stream);
}

__ATTRIBUTES int
__pgi_memcpy_asyncc16(signed char *dst, signed char *src, long count, int kind,
                      long stream)
{
  return cudaMemcpyAsync(dst, src, count * 16, (enum cudaMemcpyKind)kind,
                         (cudaStream_t)stream);
}

/* cudaMemcpy2DAsync wrappers */
/*
__device__ extern cudaError_t  CUDARTAPI cudaMemcpy2DAsync(void *, size_t, const
void *, size_t, size_t, size_t, enum cudaMemcpyKind, cudaStream_t);
*/

__ATTRIBUTES int
__pgi_memcpy_2d_asyncc1(signed char *dst, long dpitch, signed char *src,
                        long spitch, long width, long height, int kind,
                        long stream)
{
  return cudaMemcpy2DAsync(dst, spitch, src, dpitch, width, height,
                           (enum cudaMemcpyKind)kind, (cudaStream_t)stream);
}

__ATTRIBUTES int
__pgi_memcpy_2d_asynci1(signed char *dst, long dpitch, signed char *src,
                        long spitch, long width, long height, int kind,
                        long stream)
{
  return cudaMemcpy2DAsync(dst, spitch, src, dpitch, width, height,
                           (enum cudaMemcpyKind)kind, (cudaStream_t)stream);
}

__ATTRIBUTES int
__pgi_memcpy_2d_asynci2(signed char *dst, long dpitch, signed char *src,
                        long spitch, long width, long height, int kind,
                        long stream)
{
  return cudaMemcpy2DAsync(dst, spitch * 2, src, dpitch * 2, width * 2, height,
                           (enum cudaMemcpyKind)kind, (cudaStream_t)stream);
}

__ATTRIBUTES int
__pgi_memcpy_2d_asynci4(signed char *dst, long dpitch, signed char *src,
                        long spitch, long width, long height, int kind,
                        long stream)
{
  return cudaMemcpy2DAsync(dst, spitch * 4, src, dpitch * 4, width * 4, height,
                           (enum cudaMemcpyKind)kind, (cudaStream_t)stream);
}

__ATTRIBUTES int
__pgi_memcpy_2d_asynci8(signed char *dst, long dpitch, signed char *src,
                        long spitch, long width, long height, int kind,
                        long stream)
{
  return cudaMemcpy2DAsync(dst, spitch * 8, src, dpitch * 8, width * 8, height,
                           (enum cudaMemcpyKind)kind, (cudaStream_t)stream);
}

__ATTRIBUTES int
__pgi_memcpy_2d_asyncl1(signed char *dst, long dpitch, signed char *src,
                        long spitch, long width, long height, int kind,
                        long stream)
{
  return cudaMemcpy2DAsync(dst, spitch, src, dpitch, width, height,
                           (enum cudaMemcpyKind)kind, (cudaStream_t)stream);
}

__ATTRIBUTES int
__pgi_memcpy_2d_asyncl2(signed char *dst, long dpitch, signed char *src,
                        long spitch, long width, long height, int kind,
                        long stream)
{
  return cudaMemcpy2DAsync(dst, spitch * 2, src, dpitch * 2, width * 2, height,
                           (enum cudaMemcpyKind)kind, (cudaStream_t)stream);
}

__ATTRIBUTES int
__pgi_memcpy_2d_asyncl4(signed char *dst, long dpitch, signed char *src,
                        long spitch, long width, long height, int kind,
                        long stream)
{
  return cudaMemcpy2DAsync(dst, spitch * 4, src, dpitch * 4, width * 4, height,
                           (enum cudaMemcpyKind)kind, (cudaStream_t)stream);
}

__ATTRIBUTES int
__pgi_memcpy_2d_asyncl8(signed char *dst, long dpitch, signed char *src,
                        long spitch, long width, long height, int kind,
                        long stream)
{
  return cudaMemcpy2DAsync(dst, spitch * 8, src, dpitch * 8, width * 8, height,
                           (enum cudaMemcpyKind)kind, (cudaStream_t)stream);
}

__ATTRIBUTES int
__pgi_memcpy_2d_asyncr4(signed char *dst, long dpitch, signed char *src,
                        long spitch, long width, long height, int kind,
                        long stream)
{
  return cudaMemcpy2DAsync(dst, spitch * 4, src, dpitch * 4, width * 4, height,
                           (enum cudaMemcpyKind)kind, (cudaStream_t)stream);
}

__ATTRIBUTES int
__pgi_memcpy_2d_asyncr8(signed char *dst, long dpitch, signed char *src,
                        long spitch, long width, long height, int kind,
                        long stream)
{
  return cudaMemcpy2DAsync(dst, spitch * 8, src, dpitch * 8, width * 8, height,
                           (enum cudaMemcpyKind)kind, (cudaStream_t)stream);
}

__ATTRIBUTES int
__pgi_memcpy_2d_asyncc8(signed char *dst, long dpitch, signed char *src,
                        long spitch, long width, long height, int kind,
                        long stream)
{
  return cudaMemcpy2DAsync(dst, spitch * 8, src, dpitch * 8, width * 8, height,
                           (enum cudaMemcpyKind)kind, (cudaStream_t)stream);
}

__ATTRIBUTES int
__pgi_memcpy_2d_asyncc16(signed char *dst, long dpitch, signed char *src,
                         long spitch, long width, long height, int kind,
                         long stream)
{
  return cudaMemcpy2DAsync(dst, spitch * 16, src, dpitch * 16, width * 16,
                           height, (enum cudaMemcpyKind)kind,
                           (cudaStream_t)stream);
}

/* cudaMalloc wrappers */
/*
__device__ extern cudaError_t  CUDARTAPI cudaMalloc(void **, size_t);
*/

__ATTRIBUTES int
__pgi_cuda_mallocc1(signed char *p, long n, signed char *desc)
{
  return cudaMalloc((void **)p, n);
}

__ATTRIBUTES int
__pgi_cuda_malloci1(signed char *p, long n, signed char *desc)
{
  return cudaMalloc((void **)p, n);
}

__ATTRIBUTES int
__pgi_cuda_malloci2(signed char *p, long n, signed char *desc)
{
  return cudaMalloc((void **)p, n * 2);
}

__ATTRIBUTES int
__pgi_cuda_malloci4(signed char *p, long n, signed char *desc)
{
  return cudaMalloc((void **)p, n * 4);
}

__ATTRIBUTES int
__pgi_cuda_malloci8(signed char *p, long n, signed char *desc)
{
  return cudaMalloc((void **)p, n * 8);
}

__ATTRIBUTES int
__pgi_cuda_mallocl1(signed char *p, long n, signed char *desc)
{
  return cudaMalloc((void **)p, n);
}

__ATTRIBUTES int
__pgi_cuda_mallocl2(signed char *p, long n, signed char *desc)
{
  return cudaMalloc((void **)p, n * 2);
}

__ATTRIBUTES int
__pgi_cuda_mallocl4(signed char *p, long n, signed char *desc)
{
  return cudaMalloc((void **)p, n * 4);
}

__ATTRIBUTES int
__pgi_cuda_mallocl8(signed char *p, long n, signed char *desc)
{
  return cudaMalloc((void **)p, n * 8);
}

__ATTRIBUTES int
__pgi_cuda_mallocr4(signed char *p, long n, signed char *desc)
{
  return cudaMalloc((void **)p, n * 4);
}

__ATTRIBUTES int
__pgi_cuda_mallocr8(signed char *p, long n, signed char *desc)
{
  return cudaMalloc((void **)p, n * 8);
}

__ATTRIBUTES int
__pgi_cuda_mallocc8(signed char *p, long n, signed char *desc)
{
  return cudaMalloc((void **)p, n * 8);
}

__ATTRIBUTES int
__pgi_cuda_mallocc16(signed char *p, long n, signed char *desc)
{
  return cudaMalloc((void **)p, n * 16);
}

/* cudaFree wrappers */
/*
__device__ extern cudaError_t  CUDARTAPI cudaFree(void *);
*/

__ATTRIBUTES int
__pgi_cuda_freec1(signed char *p, signed char *desc)
{
  cudaError_t stat;
  if ((stat = cudaFree(*(void **)p)) == cudaSuccess)
    *(void **)p = 0;
  return stat;
}

__ATTRIBUTES int
__pgi_cuda_freei1(signed char *p, signed char *desc)
{
  cudaError_t stat;
  if ((stat = cudaFree(*(void **)p)) == cudaSuccess)
    *(void **)p = 0;
  return stat;
}

__ATTRIBUTES int
__pgi_cuda_freei2(signed char *p, signed char *desc)
{
  cudaError_t stat;
  if ((stat = cudaFree(*(void **)p)) == cudaSuccess)
    *(void **)p = 0;
  return stat;
}

__ATTRIBUTES int
__pgi_cuda_freei4(signed char *p, signed char *desc)
{
  cudaError_t stat;
  if ((stat = cudaFree(*(void **)p)) == cudaSuccess)
    *(void **)p = 0;
  return stat;
}

__ATTRIBUTES int
__pgi_cuda_freei8(signed char *p, signed char *desc)
{
  cudaError_t stat;
  if ((stat = cudaFree(*(void **)p)) == cudaSuccess)
    *(void **)p = 0;
  return stat;
}

__ATTRIBUTES int
__pgi_cuda_freel1(signed char *p, signed char *desc)
{
  cudaError_t stat;
  if ((stat = cudaFree(*(void **)p)) == cudaSuccess)
    *(void **)p = 0;
  return stat;
}

__ATTRIBUTES int
__pgi_cuda_freel2(signed char *p, signed char *desc)
{
  cudaError_t stat;
  if ((stat = cudaFree(*(void **)p)) == cudaSuccess)
    *(void **)p = 0;
  return stat;
}

__ATTRIBUTES int
__pgi_cuda_freel4(signed char *p, signed char *desc)
{
  cudaError_t stat;
  if ((stat = cudaFree(*(void **)p)) == cudaSuccess)
    *(void **)p = 0;
  return stat;
}

__ATTRIBUTES int
__pgi_cuda_freel8(signed char *p, signed char *desc)
{
  cudaError_t stat;
  if ((stat = cudaFree(*(void **)p)) == cudaSuccess)
    *(void **)p = 0;
  return stat;
}

__ATTRIBUTES int
__pgi_cuda_freer4(signed char *p, signed char *desc)
{
  cudaError_t stat;
  if ((stat = cudaFree(*(void **)p)) == cudaSuccess)
    *(void **)p = 0;
  return stat;
}

__ATTRIBUTES int
__pgi_cuda_freer8(signed char *p, signed char *desc)
{
  cudaError_t stat;
  if ((stat = cudaFree(*(void **)p)) == cudaSuccess)
    *(void **)p = 0;
  return stat;
}

__ATTRIBUTES int
__pgi_cuda_freec8(signed char *p, signed char *desc)
{
  cudaError_t stat;
  if ((stat = cudaFree(*(void **)p)) == cudaSuccess)
    *(void **)p = 0;
  return stat;
}

__ATTRIBUTES int
__pgi_cuda_freec16(signed char *p, signed char *desc)
{
  cudaError_t stat;
  if ((stat = cudaFree(*(void **)p)) == cudaSuccess)
    *(void **)p = 0;
  return stat;
}

__ATTRIBUTES cmplx2
__pgi_scmplxadd(cmplx2 x, cmplx2 y)
{
  return SCMPLX(REAL(x) + REAL(y), IMAG(x) + IMAG(y));
}

__ATTRIBUTES cmplx2
__pgi_scmplxadd_llvm(cmplx2 x, cmplx2 y)
{
  return SCMPLX(REAL(x) + REAL(y), IMAG(x) + IMAG(y));
}

__ATTRIBUTES cmplx2
__pgi_scmplxsub(cmplx2 x, cmplx2 y)
{
  return SCMPLX(REAL(x) - REAL(y), IMAG(x) - IMAG(y));
}

__ATTRIBUTES cmplx2
__pgi_scmplxsub_llvm(cmplx2 x, cmplx2 y)
{
  return SCMPLX(REAL(x) - REAL(y), IMAG(x) - IMAG(y));
}

__ATTRIBUTES cmplx2
__pgi_scmplxmul(cmplx2 x, cmplx2 y)
{
  return SCMPLX(REAL(x) * REAL(y) - IMAG(x) * IMAG(y),
                REAL(x) * IMAG(y) + IMAG(x) * REAL(y));
}

__ATTRIBUTES cmplx2
__pgi_scmplxmul_llvm(cmplx2 x, cmplx2 y)
{
  return SCMPLX(REAL(x) * REAL(y) - IMAG(x) * IMAG(y),
                REAL(x) * IMAG(y) + IMAG(x) * REAL(y));
}

__ATTRIBUTES dcmplx2
__pgi_dcmplxadd(dcmplx2 x, dcmplx2 y)
{
  return DCMPLX(REAL(x) + REAL(y), IMAG(x) + IMAG(y));
}

__ATTRIBUTES dcmplx2
__pgi_dcmplxadd_llvm(dcmplx2 x, dcmplx2 y)
{
  return DCMPLX(REAL(x) + REAL(y), IMAG(x) + IMAG(y));
}

__ATTRIBUTES dcmplx2
__pgi_dcmplxsub(dcmplx2 x, dcmplx2 y)
{
  return DCMPLX(REAL(x) - REAL(y), IMAG(x) - IMAG(y));
}

__ATTRIBUTES dcmplx2
__pgi_dcmplxsub_llvm(dcmplx2 x, dcmplx2 y)
{
  return DCMPLX(REAL(x) - REAL(y), IMAG(x) - IMAG(y));
}

__ATTRIBUTES dcmplx2
__pgi_dcmplxmul(dcmplx2 x, dcmplx2 y)
{
  return DCMPLX(REAL(x) * REAL(y) - IMAG(x) * IMAG(y),
                REAL(x) * IMAG(y) + IMAG(x) * REAL(y));
}

__ATTRIBUTES dcmplx2
__pgi_dcmplxmul_llvm(dcmplx2 x, dcmplx2 y)
{
  return DCMPLX(REAL(x) * REAL(y) - IMAG(x) * IMAG(y),
                REAL(x) * IMAG(y) + IMAG(x) * REAL(y));
}

__ATTRIBUTES dcmplx2
__pgi_shfl_downdc2(dcmplx2 x, unsigned int delta)
{
  return DCMPLX(__pgi_shfl_downd2(REAL(x), delta),
                __pgi_shfl_downd2(IMAG(x), delta));
}

__ATTRIBUTES cmplx2
__pgi_shfl_downsc2(cmplx2 x, unsigned int delta)
{
  return SCMPLX(__pgi_shfl_downf2(REAL(x), delta),
                __pgi_shfl_downf2(IMAG(x), delta));
}

__ATTRIBUTES dcmplx2
__pgi_atomicAddcd(void *address, dcmplx2 val)
{
  return DCMPLX(__pgi_atomicAddd(address, REAL(val)),
                __pgi_atomicAddd((void *)((char *)(address) + 8), IMAG(val)));
}

__ATTRIBUTES void
__pgi_atomicAddcmplx(void *address, float realval, float imgval)
{
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull, assumed;
  unsigned long long int newval;
  cmplx2 rval;

  do {
    assumed = old;
    rval = *((cmplx2 *)&assumed);
    rval.r += realval;
    rval.i += imgval;
    newval = *((unsigned long long int *)&rval);
    old = atomicCAS(address_as_ull, assumed, newval);
  } while (assumed != old);
}

__ATTRIBUTES void
__pgi_atomicSubcmplx(void *address, float realval, float imgval)
{
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull, assumed;
  unsigned long long int newval;
  cmplx2 rval;

  do {
    assumed = old;
    rval = *((cmplx2 *)&assumed);
    rval.r -= realval;
    rval.i -= imgval;
    newval = *((unsigned long long int *)&rval);
    old = atomicCAS(address_as_ull, assumed, newval);
  } while (assumed != old);
}

__ATTRIBUTES cmplx2
__pgi_atomicAddcf(void *address, cmplx2 val)
{
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull, assumed;
  unsigned long long int newval;
  cmplx2 rval;

  do {
    assumed = old;
    rval = *((cmplx2 *)&assumed);
    rval.r += val.r;
    rval.i += val.i;
    newval = *((unsigned long long int *)&rval);
    old = atomicCAS(address_as_ull, assumed, newval);
  } while (assumed != old);
  return *((cmplx2 *)&old);
}

__ATTRIBUTES cmplx2
__pgi_atomicSubcf(void *address, cmplx2 val)
{
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull, assumed;
  unsigned long long int newval;
  cmplx2 rval;

  do {
    assumed = old;
    rval = *((cmplx2 *)&assumed);
    rval.r -= val.r;
    rval.i -= val.i;
    newval = *((unsigned long long int *)&rval);
    old = atomicCAS(address_as_ull, assumed, newval);
  } while (assumed != old);
  return *((cmplx2 *)&old);
}

__ATTRIBUTES void
__pgi_vbarrier()
{
  /* increment to avoid clash with default logical barrier 0x0 */
  unsigned int bnum = __pgi_threadidx(2) + 1;
  unsigned int numt = __pgi_numthreads(1);
  if (__pgi_numthreads(2) == 1) {
    __syncthreads(); /* only have one worker */
  } else if (numt <= 32) {
#if !defined(PGI_COMPILE_BITCODE)
    __syncwarp(0xffffffff);
#else /* defined(PGI_COMPILE_BITCODE) */
    __pgi_syncwarp();
#endif /* defined(PGI_COMPILE_BITCODE) */
  } else if (numt == 64) {
    /* would like to use a variable here, but that isn't working */
    asm volatile("bar.sync %0, 64;" ::"r"(bnum));
  } else if (numt == 96) {
    asm volatile("bar.sync %0, 96;" ::"r"(bnum));
  } else if (numt == 128) {
    asm volatile("bar.sync %0, 128;" ::"r"(bnum));
  } else if (numt == 160) {
    asm volatile("bar.sync %0, 160;" ::"r"(bnum));
  } else if (numt == 192) {
    asm volatile("bar.sync %0, 192;" ::"r"(bnum));
  } else if (numt == 224) {
    asm volatile("bar.sync %0, 224;" ::"r"(bnum));
  } else if (numt == 256) {
    asm volatile("bar.sync %0, 256;" ::"r"(bnum));
  } else if (numt == 288) {
    asm volatile("bar.sync %0, 288;" ::"r"(bnum));
  } else if (numt == 320) {
    asm volatile("bar.sync %0, 320;" ::"r"(bnum));
  } else if (numt == 352) {
    asm volatile("bar.sync %0, 352;" ::"r"(bnum));
  } else if (numt == 384) {
    asm volatile("bar.sync %0, 384;" ::"r"(bnum));
  } else if (numt == 416) {
    asm volatile("bar.sync %0, 416;" ::"r"(bnum));
  } else if (numt == 448) {
    asm volatile("bar.sync %0, 448;" ::"r"(bnum));
  } else if (numt == 480) {
    asm volatile("bar.sync %0, 480;" ::"r"(bnum));
  } else if (numt == 512) {
    asm volatile("bar.sync %0, 512;" ::"r"(bnum));
  } else {
    /* with a max thread block of 1024, we can't have vector > 512 and worker >
     * 1 */
  }
}

/* version used when the number of workers is known to be <= two */
__ATTRIBUTES void
__pgi_vbarrier2()
{
  /* increment to avoid clash with default logical barrier 0x0 */
  unsigned int bnum = __pgi_threadidx(2) + 1;
  unsigned int numt = __pgi_numthreads(1);
  if (__pgi_numthreads(2) == 1) {
    __syncthreads(); /* only have one worker */
  } else if (numt <= 32) {
#if !defined(PGI_COMPILE_BITCODE)
    __syncwarp(0xffffffff);
#else /* defined(PGI_COMPILE_BITCODE) */
    __pgi_syncwarp();
#endif /* defined(PGI_COMPILE_BITCODE) */
  } else if (bnum == 1) {
    asm volatile("bar.sync 1, %0;" ::"r"(numt));
  } else if (bnum == 2) {
    asm volatile("bar.sync 2, %0;" ::"r"(numt));
  }
}

/* version used when the number of workers is known to be <= four */
__ATTRIBUTES void
__pgi_vbarrier4()
{
  /* increment to avoid clash with default logical barrier 0x0 */
  unsigned int bnum = __pgi_threadidx(2) + 1;
  unsigned int numt = __pgi_numthreads(1);
  if (__pgi_numthreads(2) == 1) {
    __syncthreads(); /* only have one worker */
  } else if (numt <= 32) {
#if !defined(PGI_COMPILE_BITCODE)
    __syncwarp(0xffffffff);
#else /* defined(PGI_COMPILE_BITCODE) */
    __pgi_syncwarp();
#endif /* defined(PGI_COMPILE_BITCODE) */
  } else if (bnum == 1) {
    asm volatile("bar.sync 1, %0;" ::"r"(numt));
  } else if (bnum == 2) {
    asm volatile("bar.sync 2, %0;" ::"r"(numt));
  } else if (bnum == 3) {
    asm volatile("bar.sync 3, %0;" ::"r"(numt));
  } else if (bnum == 4) {
    asm volatile("bar.sync 4, %0;" ::"r"(numt));
  }
}

/* version used when the number of workers is known to be <= eight */
__ATTRIBUTES void
__pgi_vbarrier8()
{
  /* increment to avoid clash with default logical barrier 0x0 */
  unsigned int bnum = __pgi_threadidx(2) + 1;
  unsigned int numt = __pgi_numthreads(1);
  if (__pgi_numthreads(2) == 1) {
    __syncthreads(); /* only have one worker */
  } else if (numt <= 32) {
#if !defined(PGI_COMPILE_BITCODE)
    __syncwarp(0xffffffff);
#else /* defined(PGI_COMPILE_BITCODE) */
    __pgi_syncwarp();
#endif /* defined(PGI_COMPILE_BITCODE) */
  } else if (bnum == 1) {
    asm volatile("bar.sync 1, %0;" ::"r"(numt));
  } else if (bnum == 2) {
    asm volatile("bar.sync 2, %0;" ::"r"(numt));
  } else if (bnum == 3) {
    asm volatile("bar.sync 3, %0;" ::"r"(numt));
  } else if (bnum == 4) {
    asm volatile("bar.sync 4, %0;" ::"r"(numt));
  } else if (bnum == 5) {
    asm volatile("bar.sync 5, %0;" ::"r"(numt));
  } else if (bnum == 6) {
    asm volatile("bar.sync 6, %0;" ::"r"(numt));
  } else if (bnum == 7) {
    asm volatile("bar.sync 7, %0;" ::"r"(numt));
  } else if (bnum == 8) {
    asm volatile("bar.sync 8, %0;" ::"r"(numt));
  }
}

__ATTRIBUTES int
__pgi_all(int a)
{
  int result;
  asm volatile("{ \n\t"
               ".reg .pred \t%%p1; \n\t"
               ".reg .pred \t%%p2; \n\t"
               "setp.ne.u32 \t%%p1, %1, 0; \n\t"
               "vote.sync.all.pred \t%%p2, %%p1, 0xffffffff; \n\t"
               "selp.s32 \t%0, 1, 0, %%p2; \n\t"
               "}"
               : "=r"(result)
               : "r"(a));
  return result;
}

__ATTRIBUTES int
__pgi_any(int a)
{
  int result;
  asm volatile("{ \n\t"
               ".reg .pred \t%%p1; \n\t"
               ".reg .pred \t%%p2; \n\t"
               "setp.ne.u32 \t%%p1, %1, 0; \n\t"
               "vote.sync.any.pred \t%%p2, %%p1, 0xffffffff; \n\t"
               "selp.s32 \t%0, 1, 0, %%p2; \n\t"
               "}"
               : "=r"(result)
               : "r"(a));
  return result;
}

__ATTRIBUTES int
__pgi_ballot(int a)
{
  int result;
  asm volatile("{ \n\t"
               ".reg .pred \t%%p1; \n\t"
               "setp.ne.u32 \t%%p1, %1, 0; \n\t"
               "vote.sync.ballot.b32 \t%0, %%p1, 0xffffffff; \n\t"
               "}"
               : "=r"(result)
               : "r"(a));
  return result;
}

__ATTRIBUTES void
pgf90_mcopy1(void *d, void *s, size_t size)
{
  if (d && s && size > 0) {
    char *dd = (char *)d, *ss = (char *)s;
    while (size--)
      *dd++ = *ss++;
  }
}

__ATTRIBUTES void
pgf90_mcopy2(void *d, void *s, size_t size)
{
  if (d && s && size > 0) {
    short *dd = (short *)d, *ss = (short *)s;
    while (size--)
      *dd++ = *ss++;
  }
}

__ATTRIBUTES void
pgf90_mcopy4(void *d, void *s, size_t size)
{
  if (d && s && size > 0) {
    int *dd = (int *)d, *ss = (int *)s;
    while (size--)
      *dd++ = *ss++;
  }
}

__ATTRIBUTES void
pgf90_mcopy8(void *d, void *s, size_t size)
{
  if (d && s && size > 0) {
    long long *dd = (long long *)d, *ss = (long long *)s;
    while (size--)
      *dd++ = *ss++;
  }
}

__ATTRIBUTES void
pgf90_mcopyz8(void *d, void *s, size_t size)
{
  if (d && s && size > 0) {
    int *dd = (int *)d, *ss = (int *)s;
    while (size--) {
      *dd++ = *ss++;
      *dd++ = *ss++;
    }
  }
}
__ATTRIBUTES void
pgf90_mcopyz16(void *d, void *s, size_t size)
{
  if (d && s && size > 0) {
    long long *dd = (long long *)d, *ss = (long long *)s;
    while (size--) {
      *dd++ = *ss++;
      *dd++ = *ss++;
    }
  }
}

__ATTRIBUTES void
pgf90_mset1(void *d, void *s, size_t size)
{
  if (d && size > 0) {
    char *dd = (char *)d, ss = *(char *)s;
    while (size--)
      *dd++ = ss;
  }
}

__ATTRIBUTES void
pgf90_mset2(void *d, void *s, size_t size)
{
  if (d && size > 0) {
    short *dd = (short *)d, ss = *(short *)s;
    while (size--)
      *dd++ = ss;
  }
}

__ATTRIBUTES void
pgf90_mset4(void *d, void *s, size_t size)
{
  if (d && size > 0) {
    int *dd = (int *)d, ss = *(int *)s;
    while (size--)
      *dd++ = ss;
  }
}

__ATTRIBUTES void
pgf90_mset8(void *d, void *s, size_t size)
{
  if (d && size > 0) {
    long long *dd = (long long *)d, ss = *(long long *)s;
    while (size--)
      *dd++ = ss;
  }
}

__ATTRIBUTES void
pgf90_msetz8(void *d, void *s, size_t size)
{
  if (d && size > 0) {
    int *dd = (int *)d, ss0 = ((int *)s)[0], ss1 = ((int *)s)[1];
    while (size--) {
      *dd++ = ss0;
      *dd++ = ss1;
    }
  }
}

__ATTRIBUTES void
pgf90_msetz16(void *d, void *s, size_t size)
{
  if (d && size > 0) {
    long long *dd = (long long *)d, ss0 = ((long long *)s)[0],
              ss1 = ((long long *)s)[1];
    while (size--) {
      *dd++ = ss0;
      *dd++ = ss1;
    }
  }
}

__ATTRIBUTES void
pgf90_mzero1(void *d, size_t size)
{
  if (d && size > 0) {
    char *dd = (char *)d;
    while (size--)
      *dd++ = 0;
  }
}

__ATTRIBUTES void
pgf90_mzero2(void *d, size_t size)
{
  if (d && size > 0) {
    short *dd = (short *)d;
    while (size--)
      *dd++ = 0;
  }
}

__ATTRIBUTES void
pgf90_mzero4(void *d, size_t size)
{
  if (d && size > 0) {
    int *dd = (int *)d;
    while (size--)
      *dd++ = 0;
  }
}

__ATTRIBUTES void
pgf90_mzero8(void *d, size_t size)
{
  if (d && size > 0) {
    long long *dd = (long long *)d;
    while (size--)
      *dd++ = 0;
  }
}

__ATTRIBUTES void
pgf90_mzeroz8(void *d, size_t size)
{
  if (d && size > 0) {
    int *dd = (int *)d;
    while (size--) {
      *dd++ = 0;
      *dd++ = 0;
    }
  }
}

__ATTRIBUTES void
pgf90_mzeroz16(void *d, size_t size)
{
  if (d && size > 0) {
    long long *dd = (long long *)d;
    while (size--) {
      *dd++ = 0;
      *dd++ = 0;
    }
  }
}

__ATTRIBUTES void
__pgi_managed_delete(void *addr)
{
    /* Do nothing on device for now */
}

#if !defined(PGI_COMPILE_BITCODE)
extern "C" __device__ double __nv_sqrt(double);
#endif /* !defined(PGI_COMPILE_BITCODE) */

/* Wrapper for __cxa_vec_ctor for C++. Enable only on platforms where pgc++ is
 * supported. */
#if !defined(_WIN64)
#if defined(PGI_COMPILE_BITCODE)
__device__ void __cxa_vec_ctor(void *array_address, size_t element_count,
                               size_t element_size, void (*constructor)(void *),
                               void (*destructor)(void *));

__ATTRIBUTES void
__pgi_cxa_vec_ctor(void *array_address, size_t element_count,
                   size_t element_size, signed char *constructor,
                   signed char *destructor)
{
  void (*foo)(void *);
  void (*bar)(void *);
  foo = (void (*)(void *))constructor;
  bar = (void (*)(void *))destructor;
  __cxa_vec_ctor(array_address, element_count, element_size, foo, bar);
}
#else /* !defined(PGI_COMPILE_BITCODE) */
extern "C" __device__ void
__cxa_vec_ctor(void *array_address, size_t element_count, size_t element_size,
               void (*constructor)(void *), void (*destructor)(void *));

extern "C" __ATTRIBUTES void
__pgi_cxa_vec_ctor(void *array_address, size_t element_count,
                   size_t element_size, signed char *constructor,
                   signed char *destructor)
{
  void (*foo)(void *);
  void (*bar)(void *);
  foo = (void (*)(void *))constructor;
  bar = (void (*)(void *))destructor;
  __cxa_vec_ctor(array_address, element_count, element_size, foo, bar);
}
#endif /* !defined(PGI_COMPILE_BITCODE) */
#endif /* !defined(_WIN64) */

__ATTRIBUTES void
pgf90_copy_f77_argl(void *ab, void *ad, void *afirst, void *db, ERR_PROTYPE)
{
  struct F90_Desc *srcdesc = (struct F90_Desc *)ad;
  if (!(srcdesc->flags & __SEQUENTIAL_SECTION)) {
    if (__abort)
      __pgi_error_msg(ERR_ARGS_MSG, ERR_MSG_CONTIGUOUS);
  }
  *((void **)db) = afirst;
}

__ATTRIBUTES void
pgf90_copy_f77_argsl(void *ab, void *ad, void *afirst, void *db, ERR_PROTYPE)
{
  struct F90_Desc *srcdesc = (struct F90_Desc *)ad;
  if (!(srcdesc->flags & __SEQUENTIAL_SECTION)) {
    if (__abort)
      __pgi_error_msg(ERR_ARGS_MSG, ERR_MSG_CONTIGUOUS);
  }
  *((void **)db) = afirst;
}

__ATTRIBUTES void
pgf90_copy_f90_argl(void *ab, void *ad, void *db, void *dd, ERR_PROTYPE)
{
  struct F90_Desc *srcdesc = (struct F90_Desc *)ad;
  struct F90_Desc *tgtdesc = (struct F90_Desc *)dd;
  int i;
  if (srcdesc->dim[0].lstride != 1) {
    if (__abort)
      __pgi_error_msg(ERR_ARGS_MSG, ERR_MSG_STRIDE1);
  }
  pgf90_init_section(tgtdesc, srcdesc->rank, srcdesc);
  for (i = 0; i < srcdesc->rank; ++i)
    tgtdesc->dim[i] = srcdesc->dim[i];
  *((void **)db) = *((void**)ab);
}

__ATTRIBUTES void
pgf90_copy_f77_argl_i8(void *ab, void *ad, void *afirst, void *db, ERR_PROTYPE)
{
  struct F90_Desc_la *srcdesc = (struct F90_Desc_la *)ad;
  if (!(srcdesc->flags & __SEQUENTIAL_SECTION)) {
    if (__abort)
      __pgi_error_msg(ERR_ARGS_MSG, ERR_MSG_CONTIGUOUS);
  }
  *((void **)db) = afirst;
}

__ATTRIBUTES void
pgf90_copy_f77_argsl_i8(void *ab, void *ad, void *afirst, void *db, ERR_PROTYPE)
{
  struct F90_Desc_la *srcdesc = (struct F90_Desc_la *)ad;
  if (!(srcdesc->flags & __SEQUENTIAL_SECTION)) {
    if (__abort)
      __pgi_error_msg(ERR_ARGS_MSG, ERR_MSG_CONTIGUOUS);
  }
  *((void **)db) = afirst;
}

__ATTRIBUTES void
pgf90_copy_f90_argl_i8(void *ab, void *ad, void *db, void *dd, ERR_PROTYPE)
{
  struct F90_Desc_la *srcdesc = (struct F90_Desc_la *)ad;
  struct F90_Desc_la *tgtdesc = (struct F90_Desc_la *)dd;
  int i;
  if (srcdesc->dim[0].lstride != 1) {
    if (__abort)
      __pgi_error_msg(ERR_ARGS_MSG, ERR_MSG_STRIDE1);
  }
  pgf90_init_section_la(tgtdesc, srcdesc->rank, srcdesc);
  for (i = 0; i < srcdesc->rank; ++i)
    tgtdesc->dim[i] = srcdesc->dim[i];
  *((void **)db) = *((void**)ab);
}

__ATTRIBUTES void
__pgi_exit()
{
  asm volatile("exit;");
}

#define F90_RANK_G(p) ((p)->rank)
#define F90_DIM_EXTENT_G(p, i) (p->dim[i].extent)
#define F90_GSIZE_G(p) ((p)->gsize)

/*
 * conformable functions, used in Fortran application
 * */
__ATTRIBUTES int
pgf90_conformable_dd(void *db, void *dd, void *sd, ERR_PROTYPE)
{
  struct F90_Desc *srcdesc = (struct F90_Desc *)sd;
  struct F90_Desc *dstdesc = (struct F90_Desc *)dd;
  signed char *pb = (signed char *)db;

  int conformable = 1; /*  1 ==> conformable
                        *  0 ==> not conformable but big enough
                        * -1 --> not conformable, no big enough */
  int ndim;
  int i;

  if (!pgf90_allocated(pb, ERR_ARGS)) {
    return -1;
  }

  ndim = F90_RANK_G(srcdesc);
  for (i = 0; i < ndim; i++) {
    if (F90_DIM_EXTENT_G(dstdesc, i) != F90_DIM_EXTENT_G(srcdesc, i)) {
      conformable = -1;
      break;
    }
  }

  if (conformable != 1 && F90_GSIZE_G(dstdesc) >= F90_GSIZE_G(srcdesc)) {
    conformable = 0;
  }

  if (conformable < 0 && __abort)
    __pgi_error_msg(ERR_ARGS_MSG, ERR_MSG_CONFORMABLE);
  return conformable;
} /* pgf90_conformable_dd */

__ATTRIBUTES int
pgf90_conformable_d1v(void *db, void *dd, int ext0, ERR_PROTYPE)
{
  struct F90_Desc *dstdesc = (struct F90_Desc *)dd;
  signed char *pb = (signed char *)db;
  int conformable = 1; /*  1 ==> conformable
                        *  0 ==> not conformable but big enough
                        * -1 --> not conformable, no big enough */

  if (!pgf90_allocated(pb, ERR_ARGS)) {
    return -1;
  }

  if (F90_DIM_EXTENT_G(dstdesc, 0) != ext0) {
    conformable = -1;
  }

  if (conformable != 1 && F90_GSIZE_G(dstdesc) >= ext0) {
    conformable = 0;
  }

  if (conformable < 0 && __abort)
    __pgi_error_msg(ERR_ARGS_MSG, ERR_MSG_CONFORMABLE);
  return conformable;
} /* pgf90_conformable_d1v */

__ATTRIBUTES int
pgf90_conformable_d2v(void *db, void *dd, int ext0, int ext1, ERR_PROTYPE)
{
  struct F90_Desc *dstdesc = (struct F90_Desc *)dd;
  signed char *pb = (signed char *)db;
  int gsize = 1;
  int conformable = 1; /*  1 ==> conformable
                        *  0 ==> not conformable but big enough
                        * -1 --> not conformable, no big enough */
  if (!pgf90_allocated(pb, ERR_ARGS)) {
    return -1;
  }

  gsize = ext0 * ext1;
  if (F90_DIM_EXTENT_G(dstdesc, 0) != ext0 ||
      F90_DIM_EXTENT_G(dstdesc, 1) != ext1) {
    conformable = -1;
  }

  if (conformable != 1 && F90_GSIZE_G(dstdesc) >= gsize) {
    conformable = 0;
  }

  if (conformable < 0 && __abort)
    __pgi_error_msg(ERR_ARGS_MSG, ERR_MSG_CONFORMABLE);
  return conformable;
} /* pgf90_conformable_d2v */

__ATTRIBUTES int
pgf90_conformable_d3v(void *db, void *dd, int ext0, int ext1, int ext2, ERR_PROTYPE)
{
  struct F90_Desc *dstdesc = (struct F90_Desc *)dd;
  signed char *pb = (signed char *)db;
  int gsize = 1;
  int conformable = 1; /*  1 ==> conformable
                        *  0 ==> not conformable but big enough
                        * -1 --> not conformable, no big enough */

  if (!pgf90_allocated(pb, ERR_ARGS)) {
    return -1;
  }

  gsize = ext0 * ext1 * ext2;
  if (F90_DIM_EXTENT_G(dstdesc, 0) != ext0 ||
      F90_DIM_EXTENT_G(dstdesc, 1) != ext1 ||
      F90_DIM_EXTENT_G(dstdesc, 2) != ext2) {
    conformable = -1;
  }

  if (conformable != 1 && F90_GSIZE_G(dstdesc) >= gsize) {
    conformable = 0;
  }

  if (conformable < 0 && __abort)
    __pgi_error_msg(ERR_ARGS_MSG, ERR_MSG_CONFORMABLE);
  return conformable;
} /* pgf90_conformable_d3v */

__ATTRIBUTES int
pgf90_conformable_dnv(void *db, void *dd, int ndim, int ext0, int ext1,
                      int ext2, int ext3, int ext4, int ext5, int ext6,
                      ERR_PROTYPE)
{
  struct F90_Desc *dstdesc = (struct F90_Desc *)dd;
  int extent[F90_DESC_MAXDIMS];
  signed char *pb = (signed char *)db;
  int i;
  int gsize = 1;

  int conformable = 1; /*  1 ==> conformable
                        *  0 ==> not conformable but big enough
                        * -1 --> not conformable, no big enough */
  extent[0] = ext0;
  extent[1] = ext1;
  extent[2] = ext2;
  extent[3] = ext3;
  extent[4] = ext4;
  extent[5] = ext5;
  extent[6] = ext6;

  if (!pgf90_allocated(pb, ERR_ARGS)) {
    return -1;
  }

  for (i = 0; i < ndim; i++) {
    gsize *= extent[i];
    if (F90_DIM_EXTENT_G(dstdesc, i) != extent[i]) {
      conformable = -1;
      break;
    }
  }

  if (conformable != 1 && F90_GSIZE_G(dstdesc) >= gsize) {
    conformable = 0;
  }

  if (conformable < 0 && __abort)
    __pgi_error_msg(ERR_ARGS_MSG, ERR_MSG_CONFORMABLE);
  return conformable;
} /* pgf90_conformable_dnv */

__ATTRIBUTES int
pgf90_conformable_1dv(void *db, void *sd, int ext0, ERR_PROTYPE)
{
  struct F90_Desc *srcdesc = (struct F90_Desc *)sd;
  signed char *pb = (signed char *)db;

  int conformable = 1; /*  1 ==> conformable
                        *  0 ==> not conformable but big enough
                        * -1 --> not conformable, no big enough */
  if (!pgf90_allocated(pb, ERR_ARGS)) {
    return -1;
  }

  if (F90_DIM_EXTENT_G(srcdesc, 0) != ext0) {
    conformable = -1;
  }

  if (conformable != 1 && ext0 >= F90_GSIZE_G(srcdesc)) {
    conformable = 0;
  }

  if (conformable < 0 && __abort)
    __pgi_error_msg(ERR_ARGS_MSG, ERR_MSG_CONFORMABLE);
  return conformable;
} /* pgf90_conformable_1dv */

__ATTRIBUTES int
pgf90_conformable_2dv(void *db, void *sd, int ext0, int ext1, ERR_PROTYPE)
{
  struct F90_Desc *srcdesc = (struct F90_Desc *)sd;
  signed char *pb = (signed char *)db;
  int gsize = 1;

  int conformable = 1; /*  1 ==> conformable
                        *  0 ==> not conformable but big enough
                        * -1 --> not conformable, no big enough */

  if (!pgf90_allocated(pb, ERR_ARGS)) {
    return -1;
  }

  gsize = ext0 * ext1;
  if (F90_DIM_EXTENT_G(srcdesc, 0) != ext0 ||
      F90_DIM_EXTENT_G(srcdesc, 1) != ext1) {
    conformable = -1;
  }

  if (conformable != 1 && gsize >= F90_GSIZE_G(srcdesc)) {
    conformable = 0;
  }

  if (conformable < 0 && __abort)
    __pgi_error_msg(ERR_ARGS_MSG, ERR_MSG_CONFORMABLE);
  return conformable;
} /* pgf90_conformable_2dv */

__ATTRIBUTES int
pgf90_conformable_3dv(void *db, void *sd, int ext0, int ext1, int ext2, ERR_PROTYPE)
{
  struct F90_Desc *srcdesc = (struct F90_Desc *)sd;
  signed char *pb = (signed char *)db;
  int gsize = 1;

  int conformable = 1; /*  1 ==> conformable
                        *  0 ==> not conformable but big enough
                        * -1 --> not conformable, no big enough */

  if (!pgf90_allocated(pb, ERR_ARGS)) {
    return -1;
  }

  gsize = ext0 * ext1 * ext2;
  if (F90_DIM_EXTENT_G(srcdesc, 0) != ext0 ||
      F90_DIM_EXTENT_G(srcdesc, 1) != ext1 ||
      F90_DIM_EXTENT_G(srcdesc, 2) != ext2) {
    conformable = -1;
  }

  if (conformable != 1 && F90_GSIZE_G(srcdesc) >= gsize) {
    conformable = 0;
  }

  if (conformable < 0 && __abort)
    __pgi_error_msg(ERR_ARGS_MSG, ERR_MSG_CONFORMABLE);
  return conformable;
} /* pgf90_conformable_3dv */

__ATTRIBUTES int
pgf90_conformable_ndv(void *db, void *sd, int ndim, int ext0, int ext1,
                      int ext2, int ext3, int ext4, int ext5, int ext6,
                      ERR_PROTYPE)
{
  struct F90_Desc *srcdesc = (struct F90_Desc *)sd;
  int extent[F90_DESC_MAXDIMS];
  signed char *pb = (signed char *)db;
  int i;
  int gsize = 1;

  int conformable = 1; /*  1 ==> conformable
                        *  0 ==> not conformable but big enough
                        * -1 --> not conformable, no big enough */
  extent[0] = ext0;
  extent[1] = ext1;
  extent[2] = ext2;
  extent[3] = ext3;
  extent[4] = ext4;
  extent[5] = ext5;
  extent[6] = ext6;

  if (!pgf90_allocated(pb, ERR_ARGS)) {
    return -1;
  }

  for (i = 0; i < ndim; i++) {
    gsize *= extent[i];
    if (F90_DIM_EXTENT_G(srcdesc, i) != extent[i]) {
      conformable = -1;
      break;
    }
  }

  if (conformable != 1 && F90_GSIZE_G(srcdesc) >= gsize) {
    conformable = 0;
  }

  if (conformable < 0 && __abort)
    __pgi_error_msg(ERR_ARGS_MSG, ERR_MSG_CONFORMABLE);
  return conformable;
} /* pgf90_conformable_ndv */

__ATTRIBUTES int
pgf90_conformable_11v(void *db, int dext0, int sext0, ERR_PROTYPE)
{
  signed char *pb = (signed char *)db;
  int conformable = 1; /*  1 ==> conformable
                        *  0 ==> not conformable but big enough
                        * -1 --> not conformable, no big enough */

  if (!pgf90_allocated(pb, ERR_ARGS)) {
    return -1;
  }

  if (sext0 != dext0) {
    conformable = -1;
  }

  if (conformable != 1 && dext0 >= sext0) {
    conformable = 0;
  }

  if (conformable < 0 && __abort)
    __pgi_error_msg(ERR_ARGS_MSG, ERR_MSG_CONFORMABLE);
  return conformable;
} /* pgf90_conformable_11v */

__ATTRIBUTES int
pgf90_conformable_22v(void *db, int dext0, int sext0, int dext1, int sext1, ERR_PROTYPE)
{
  signed char *pb = (signed char *)db;
  int sgsize = 1, dgsize = 1;

  int conformable = 1; /*  1 ==> conformable
                        *  0 ==> not conformable but big enough
                        * -1 --> not conformable, no big enough */

  if (!pgf90_allocated(pb, ERR_ARGS)) {
    return -1;
  }

  sgsize = sext0 * sext1;
  dgsize = dext0 * dext1;
  if (dext0 != sext0 || dext1 != sext1) {
    conformable = -1;
  }

  if (conformable != 1 && dgsize >= sgsize) {
    conformable = 0;
  }

  if (conformable < 0 && __abort)
    __pgi_error_msg(ERR_ARGS_MSG, ERR_MSG_CONFORMABLE);
  return conformable;
} /* pgf90_conformable_22v */

__ATTRIBUTES int
pgf90_conformable_33v(void *db, int dext0, int sext0, int dext1, int sext1,
                      int dext2, int sext2, ERR_PROTYPE)
{
  signed char *pb = (signed char *)db;
  int sgsize = 1, dgsize = 1;

  int conformable = 1; /*  1 ==> conformable
                        *  0 ==> not conformable but big enough
                        * -1 --> not conformable, no big enough */

  if (!pgf90_allocated(pb, ERR_ARGS)) {
    return -1;
  }

  sgsize = sext0 * sext1 * sext2;
  dgsize = dext0 * dext1 * dext2;
  if (dext0 != sext0 || dext1 != sext1 || dext2 != sext2) {
    conformable = -1;
  }

  if (conformable != 1 && dgsize >= sgsize) {
    conformable = 0;
  }

  if (conformable < 0 && __abort)
    __pgi_error_msg(ERR_ARGS_MSG, ERR_MSG_CONFORMABLE);
  return conformable;
} /* pgf90_conformable_33v */

__ATTRIBUTES int
pgf90_conformable_nnv(void *db, int ndim, int dext0, int sext0, int dext1,
                      int sext1, int dext2, int sext2, int dext3, int sext3,
                      int dext4, int sext4, int dext5, int sext5, int dext6,
                      int sext6, ERR_PROTYPE)
{
  int sextent[F90_DESC_MAXDIMS];
  int dextent[F90_DESC_MAXDIMS];
  signed char *pb = (signed char *)db;
  int i;
  int sgsize = 1, dgsize = 1;

  int conformable = 1; /*  1 ==> conformable
                        *  0 ==> not conformable but big enough
                        * -1 --> not conformable, no big enough */
  sextent[0] = sext0;
  sextent[1] = sext1;
  sextent[2] = sext2;
  sextent[3] = sext3;
  sextent[4] = sext4;
  sextent[5] = sext5;
  sextent[6] = sext6;

  dextent[0] = dext0;
  dextent[1] = dext1;
  dextent[2] = dext2;
  dextent[3] = dext3;
  dextent[4] = dext4;
  dextent[5] = dext5;
  dextent[6] = dext6;

  if (!pgf90_allocated(pb, ERR_ARGS)) {
    return -1;
  }

  for (i = 0; i < ndim; i++) {
    sgsize *= sextent[i];
    dgsize *= dextent[i];
    if (sextent[i] != dextent[i]) {
      conformable = -1;
      break;
    }
  }

  if (conformable != 1 && dgsize >= sgsize) {
    conformable = 0;
  }

  if (conformable < 0 && __abort)
    __pgi_error_msg(ERR_ARGS_MSG, ERR_MSG_CONFORMABLE);
  return conformable;
} /* pgf90_conformable_nnv */

/* Large Array Version */

/*
 * conformable functions, used in Fortran application
 * */
__ATTRIBUTES int
pgf90_conformable_dd_i8(void *db, void *dd, void *sd, ERR_PROTYPE)
{
  long long ndim;
  int i;
  struct F90_Desc_la *srcdesc = (struct F90_Desc_la *)sd;
  struct F90_Desc_la *dstdesc = (struct F90_Desc_la *)dd;
  signed char *pb = (signed char *)db;
  int conformable = 1; /*  1 ==> conformable
                        *  0 ==> not conformable but big enough
                        * -1 --> not conformable, no big enough */

  if (!pgf90_allocated(pb, ERR_ARGS)) {
    return -1;
  }

  ndim = F90_RANK_G(srcdesc);
  for (i = 0; i < ndim; i++) {
    if (F90_DIM_EXTENT_G(dstdesc, i) != F90_DIM_EXTENT_G(srcdesc, i)) {
      conformable = -1;
      break;
    }
  }

  if (conformable != 1 && F90_GSIZE_G(dstdesc) >= F90_GSIZE_G(srcdesc)) {
    conformable = 0;
  }

  if (conformable < 0 && __abort)
    __pgi_error_msg(ERR_ARGS_MSG, ERR_MSG_CONFORMABLE);
  return conformable;
} /* pgf90_conformable_dd_i8 */

__ATTRIBUTES int
pgf90_conformable_d1v_i8(void *db, void *dd, long long ext0, ERR_PROTYPE)
{
  struct F90_Desc_la *dstdesc = (struct F90_Desc_la *)dd;
  signed char *pb = (signed char *)db;
  int conformable = 1; /*  1 ==> conformable
                        *  0 ==> not conformable but big enough
                        * -1 --> not conformable, no big enough */

  if (!pgf90_allocated(pb, ERR_ARGS)) {
    return -1;
  }

  if (F90_DIM_EXTENT_G(dstdesc, 0) != ext0) {
    conformable = -1;
  }

  if (conformable != 1 && F90_GSIZE_G(dstdesc) >= ext0) {
    conformable = 0;
  }

  if (conformable < 0 && __abort)
    __pgi_error_msg(ERR_ARGS_MSG, ERR_MSG_CONFORMABLE);
  return conformable;
} /* pgf90_conformable_d1v_i8 */

__ATTRIBUTES int
pgf90_conformable_d2v_i8(void *db, void *dd, long long ext0, long long ext1, ERR_PROTYPE)
{
  struct F90_Desc_la *dstdesc = (struct F90_Desc_la *)dd;
  signed char *pb = (signed char *)db;
  long long gsize = 1;
  int conformable = 1; /*  1 ==> conformable
                        *  0 ==> not conformable but big enough
                        * -1 --> not conformable, no big enough */
  if (!pgf90_allocated(pb, ERR_ARGS)) {
    return -1;
  }

  gsize = ext0 * ext1;
  if (F90_DIM_EXTENT_G(dstdesc, 0) != ext0 ||
      F90_DIM_EXTENT_G(dstdesc, 1) != ext1) {
    conformable = -1;
  }

  if (conformable != 1 && F90_GSIZE_G(dstdesc) >= gsize) {
    conformable = 0;
  }

  if (conformable < 0 && __abort)
    __pgi_error_msg(ERR_ARGS_MSG, ERR_MSG_CONFORMABLE);
  return conformable;
} /* pgf90_conformable_d2v_i8 */

__ATTRIBUTES int
pgf90_conformable_d3v_i8(void *db, void *dd, long long ext0, long long ext1,
                         long long ext2, ERR_PROTYPE)
{
  struct F90_Desc_la *dstdesc = (struct F90_Desc_la *)dd;
  signed char *pb = (signed char *)db;
  long long gsize = 1;
  int conformable = 1; /*  1 ==> conformable
                        *  0 ==> not conformable but big enough
                        * -1 --> not conformable, no big enough */

  if (!pgf90_allocated(pb, ERR_ARGS)) {
    return -1;
  }

  gsize = ext0 * ext1 * ext2;
  if (F90_DIM_EXTENT_G(dstdesc, 0) != ext0 ||
      F90_DIM_EXTENT_G(dstdesc, 1) != ext1 ||
      F90_DIM_EXTENT_G(dstdesc, 2) != ext2) {
    conformable = -1;
  }

  if (conformable != 1 && F90_GSIZE_G(dstdesc) >= gsize) {
    conformable = 0;
  }

  if (conformable < 0 && __abort)
    __pgi_error_msg(ERR_ARGS_MSG, ERR_MSG_CONFORMABLE);
  return conformable;
} /* pgf90_conformable_d3v_i8 */

__ATTRIBUTES int
pgf90_conformable_dnv_i8(void *db, void *dd, long long ndim, long long ext0,
                         long long ext1, long long ext2, long long ext3,
                         long long ext4, long long ext5, long long ext6,
                         ERR_PROTYPE)
{
  int i;
  struct F90_Desc_la *dstdesc = (struct F90_Desc_la *)dd;
  long long extent[F90_DESC_MAXDIMS];
  signed char *pb = (signed char *)db;
  long long gsize = 1;
  int conformable = 1; /*  1 ==> conformable
                        *  0 ==> not conformable but big enough
                        * -1 --> not conformable, no big enough */
  extent[0] = ext0;
  extent[1] = ext1;
  extent[2] = ext2;
  extent[3] = ext3;
  extent[4] = ext4;
  extent[5] = ext5;
  extent[6] = ext6;

  if (!pgf90_allocated(pb, ERR_ARGS)) {
    return -1;
  }

  for (i = 0; i < ndim; i++) {
    gsize *= extent[i];
    if (F90_DIM_EXTENT_G(dstdesc, i) != extent[i]) {
      conformable = -1;
      break;
    }
  }

  if (conformable != 1 && F90_GSIZE_G(dstdesc) >= gsize) {
    conformable = 0;
  }

  if (conformable < 0 && __abort)
    __pgi_error_msg(ERR_ARGS_MSG, ERR_MSG_CONFORMABLE);
  return conformable;
} /* pgf90_conformable_dnv_i8 */

__ATTRIBUTES int
pgf90_conformable_1dv_i8(void *db, void *sd, long long ext0, ERR_PROTYPE)
{
  struct F90_Desc_la *srcdesc = (struct F90_Desc_la *)sd;
  signed char *pb = (signed char *)db;

  int conformable = 1; /*  1 ==> conformable
                        *  0 ==> not conformable but big enough
                        * -1 --> not conformable, no big enough */

  if (!pgf90_allocated(pb, ERR_ARGS)) {
    return -1;
  }

  if (F90_DIM_EXTENT_G(srcdesc, 0) != ext0) {
    conformable = -1;
  }

  if (conformable != 1 && ext0 >= F90_GSIZE_G(srcdesc)) {
    conformable = 0;
  }

  if (conformable < 0 && __abort)
    __pgi_error_msg(ERR_ARGS_MSG, ERR_MSG_CONFORMABLE);
  return conformable;
} /* pgf90_conformable_1dv_i8 */

__ATTRIBUTES int
pgf90_conformable_2dv_i8(void *db, void *sd, long long ext0, long long ext1, ERR_PROTYPE)
{
  struct F90_Desc_la *srcdesc = (struct F90_Desc_la *)sd;
  signed char *pb = (signed char *)db;
  long long gsize = 1;

  int conformable = 1; /*  1 ==> conformable
                        *  0 ==> not conformable but big enough
                        * -1 --> not conformable, no big enough */

  if (!pgf90_allocated(pb, ERR_ARGS)) {
    return -1;
  }

  gsize = ext0 * ext1;
  if (F90_DIM_EXTENT_G(srcdesc, 0) != ext0 ||
      F90_DIM_EXTENT_G(srcdesc, 1) != ext1) {
    conformable = -1;
  }

  if (conformable != 1 && gsize >= F90_GSIZE_G(srcdesc)) {
    conformable = 0;
  }

  if (conformable < 0 && __abort)
    __pgi_error_msg(ERR_ARGS_MSG, ERR_MSG_CONFORMABLE);
  return conformable;
} /* pgf90_conformable_2dv_i8 */

__ATTRIBUTES int
pgf90_conformable_3dv_i8(void *db, void *sd, long long ext0, long long ext1,
                         long long ext2, ERR_PROTYPE)
{
  struct F90_Desc_la *srcdesc = (struct F90_Desc_la *)sd;
  signed char *pb = (signed char *)db;
  long long gsize = 1;

  int conformable = 1; /*  1 ==> conformable
                        *  0 ==> not conformable but big enough
                        * -1 --> not conformable, no big enough */

  if (!pgf90_allocated(pb, ERR_ARGS)) {
    return -1;
  }

  gsize = ext0 * ext1 * ext2;
  if (F90_DIM_EXTENT_G(srcdesc, 0) != ext0 ||
      F90_DIM_EXTENT_G(srcdesc, 1) != ext1 ||
      F90_DIM_EXTENT_G(srcdesc, 2) != ext2) {
    conformable = -1;
  }

  if (conformable != 1 && F90_GSIZE_G(srcdesc) >= gsize) {
    conformable = 0;
  }

  if (conformable < 0 && __abort)
    __pgi_error_msg(ERR_ARGS_MSG, ERR_MSG_CONFORMABLE);
  return conformable;
} /* pgf90_conformable_3dv_i8 */

__ATTRIBUTES int
pgf90_conformable_ndv_i8(void *db, void *sd, long long ndim, long long ext0,
                         long long ext1, long long ext2, long long ext3,
                         long long ext4, long long ext5, long long ext6,
                         ERR_PROTYPE)
{
  struct F90_Desc_la *srcdesc = (struct F90_Desc_la *)sd;
  long long extent[F90_DESC_MAXDIMS];
  signed char *pb = (signed char *)db;
  int i;
  long long gsize = 1;

  int conformable = 1; /*  1 ==> conformable
                        *  0 ==> not conformable but big enough
                        * -1 --> not conformable, no big enough */
  extent[0] = ext0;
  extent[1] = ext1;
  extent[2] = ext2;
  extent[3] = ext3;
  extent[4] = ext4;
  extent[5] = ext5;
  extent[6] = ext6;

  if (!pgf90_allocated(pb, ERR_ARGS)) {
    return -1;
  }

  for (i = 0; i < ndim; i++) {
    gsize *= extent[i];
    if (F90_DIM_EXTENT_G(srcdesc, i) != extent[i]) {
      conformable = -1;
      break;
    }
  }

  if (conformable != 1 && F90_GSIZE_G(srcdesc) >= gsize) {
    conformable = 0;
  }

  if (conformable < 0 && __abort)
    __pgi_error_msg(ERR_ARGS_MSG, ERR_MSG_CONFORMABLE);
  return conformable;
} /* pgf90_conformable_ndv_i8 */

__ATTRIBUTES int
pgf90_conformable_11v_i8(void *db, long long dext0, long long sext0, ERR_PROTYPE)
{
  signed char *pb = (signed char *)db;
  int conformable = 1; /*  1 ==> conformable
                        *  0 ==> not conformable but big enough
                        * -1 --> not conformable, no big enough */

  if (!pgf90_allocated(pb, ERR_ARGS)) {
    return -1;
  }

  if (sext0 != dext0) {
    conformable = -1;
  }

  if (conformable != 1 && dext0 >= sext0) {
    conformable = 0;
  }

  if (conformable < 0 && __abort)
    __pgi_error_msg(ERR_ARGS_MSG, ERR_MSG_CONFORMABLE);
  return conformable;
} /* pgf90_conformable_11v_i8 */

__ATTRIBUTES int
pgf90_conformable_22v_i8(void *db, long long dext0, long long sext0,
                         long long dext1, long long sext1, ERR_PROTYPE)
{
  signed char *pb = (signed char *)db;
  long long sgsize = 1, dgsize = 1;

  int conformable = 1; /*  1 ==> conformable
                        *  0 ==> not conformable but big enough
                        * -1 --> not conformable, no big enough */

  if (!pgf90_allocated(pb, ERR_ARGS)) {
    return -1;
  }

  sgsize = sext0 * sext1;
  dgsize = dext0 * dext1;
  if (dext0 != sext0 || dext1 != sext1) {
    conformable = -1;
  }

  if (conformable != 1 && dgsize >= sgsize) {
    conformable = 0;
  }

  if (conformable < 0 && __abort)
    __pgi_error_msg(ERR_ARGS_MSG, ERR_MSG_CONFORMABLE);
  return conformable;
} /* pgf90_conformable_22v_i8 */

__ATTRIBUTES int
pgf90_conformable_33v_i8(void *db, long long dext0, long long sext0,
                         long long dext1, long long sext1, long long dext2,
                         long long sext2, ERR_PROTYPE)
{
  signed char *pb = (signed char *)db;
  long long sgsize = 1, dgsize = 1;

  int conformable = 1; /*  1 ==> conformable
                        *  0 ==> not conformable but big enough
                        * -1 --> not conformable, no big enough */

  if (!pgf90_allocated(pb, ERR_ARGS)) {
    return -1;
  }

  sgsize = sext0 * sext1 * sext2;
  dgsize = dext0 * dext1 * dext2;
  if (dext0 != sext0 || dext1 != sext1 || dext2 != sext2) {
    conformable = -1;
  }

  if (conformable != 1 && dgsize >= sgsize) {
    conformable = 0;
  }

  if (conformable < 0 && __abort)
    __pgi_error_msg(ERR_ARGS_MSG, ERR_MSG_CONFORMABLE);
  return conformable;
} /* pgf90_conformable_33v_i8 */

__ATTRIBUTES int
pgf90_conformable_nnv_i8(void *db, long long ndim, long long dext0,
                         long long sext0, long long dext1, long long sext1,
                         long long dext2, long long sext2, long long dext3,
                         long long sext3, long long dext4, long long sext4,
                         long long dext5, long long sext5, long long dext6,
                         long long sext6, ERR_PROTYPE)
{
  long long sextent[F90_DESC_MAXDIMS];
  long long dextent[F90_DESC_MAXDIMS];
  signed char *pb = (signed char *)db;
  int i;
  long long sgsize = 1, dgsize = 1;

  int conformable = 1; /*  1 ==> conformable
                        *  0 ==> not conformable but big enough
                        * -1 --> not conformable, no big enough */
  sextent[0] = sext0;
  sextent[1] = sext1;
  sextent[2] = sext2;
  sextent[3] = sext3;
  sextent[4] = sext4;
  sextent[5] = sext5;
  sextent[6] = sext6;

  dextent[0] = dext0;
  dextent[1] = dext1;
  dextent[2] = dext2;
  dextent[3] = dext3;
  dextent[4] = dext4;
  dextent[5] = dext5;
  dextent[6] = dext6;

  if (!pgf90_allocated(pb, ERR_ARGS)) {
    return -1;
  }

  for (i = 0; i < ndim; i++) {
    sgsize *= sextent[i];
    dgsize *= dextent[i];
    if (sextent[i] != dextent[i]) {
      conformable = -1;
      break;
    }
  }

  if (conformable != 1 && dgsize >= sgsize) {
    conformable = 0;
  }

  if (conformable < 0 && __abort)
    __pgi_error_msg(ERR_ARGS_MSG, ERR_MSG_CONFORMABLE);
  return conformable;
} /* pgf90_conformable_nnv_i8 */

__ATTRIBUTES float
__pgi_conv_h2f(unsigned short h)
{
  float val;
  asm volatile("cvt.f32.f16 %0, %1;"
               : "=f"(val)  : "h"(h));
  return val;
}

__ATTRIBUTES unsigned short
__pgi_conv_f2h_rn(float x)
{
  unsigned short val;
  asm volatile("cvt.rn.f16.f32 %0, %1;"
               : "=h"(val)  : "f"(x));
  return val;
}

__ATTRIBUTES unsigned short
__pgi_conv_d2h_rn(double x)
{
  unsigned short val;
  asm volatile("cvt.rn.f16.f64 %0, %1;"
               : "=h"(val)  : "d"(x));
  return val;
}

__ATTRIBUTES int
__pgi_conv_h2i(unsigned short x)
{
  int val;
  asm volatile("cvt.rzi.s32.f16 %0, %1;"
               : "=r"(val)  : "h"(x));
  return val;
}

__ATTRIBUTES unsigned int
__pgi_conv_h2u(unsigned short x)
{
  unsigned int val;
  asm volatile("cvt.rzi.u32.f16 %0, %1;"
               : "=r"(val)  : "h"(x));
  return val;
}

__ATTRIBUTES long long
__pgi_conv_h2k(unsigned short x)
{
  long long val;
  asm volatile("cvt.rzi.s64.f16 %0, %1;"
               : "=l"(val)  : "h"(x));
  return val;
}

__ATTRIBUTES unsigned long long
__pgi_conv_h2uk(unsigned short x)
{
  unsigned long long val;
  asm volatile("cvt.rzi.u64.f16 %0, %1;"
               : "=l"(val)  : "h"(x));
  return val;
}

__ATTRIBUTES unsigned short
__pgi_conv_i2h(int x)
{
  unsigned short val;
  asm volatile("cvt.rn.f16.s32 %0, %1;"
               : "=h"(val)  : "r"(x));
  return val;
}

__ATTRIBUTES unsigned short
__pgi_conv_u2h(unsigned int x)
{
  unsigned short val;
  asm volatile("cvt.rn.f16.u32 %0, %1;"
               : "=h"(val)  : "r"(x));
  return val;
}

__ATTRIBUTES unsigned short
__pgi_conv_k2h(long long x)
{
  unsigned short val;
  asm volatile("cvt.rn.f16.s64 %0, %1;"
               : "=h"(val)  : "l"(x));
  return val;
}

__ATTRIBUTES unsigned short
__pgi_conv_uk2h(unsigned long long x)
{
  unsigned short val;
  asm volatile("cvt.rn.f16.u64 %0, %1;"
               : "=h"(val)  : "l"(x));
  return val;
}

#if defined(__NVHPC_AT_LEAST_PASCAL)
__ATTRIBUTES unsigned short
__pgi_half_add_rn(unsigned short a, unsigned short b)
{
  unsigned short val;
  asm volatile("add.f16 %0, %1, %2;"
               : "=h"(val)  : "h"(a), "h"(b));
  return val;
}

__ATTRIBUTES unsigned short
__pgi_half_sub_rn(unsigned short a, unsigned short b)
{
  unsigned short val;
  asm volatile("sub.f16 %0, %1, %2;"
               : "=h"(val)  : "h"(a), "h"(b));
  return val;
}

__ATTRIBUTES unsigned short
__pgi_half_mul_rn(unsigned short a, unsigned short b)
{
  unsigned short val;
  asm volatile("mul.f16 %0, %1, %2;"
               : "=h"(val)  : "h"(a), "h"(b));
  return val;
}

__ATTRIBUTES unsigned short
__pgi_half_div_rn(unsigned short a, unsigned short b)
{
  float fa, fb, fv, rcp;
  unsigned short val, abs;
  const unsigned short den = 0x008f;

  fa = __pgi_conv_h2f(a);
  fb = __pgi_conv_h2f(b);

  asm volatile("rcp.approx.f32 %0, %1;" : "=f"(rcp)  : "f"(fb));

  fv = rcp * fa;
  val = __pgi_conv_f2h_rn(fv);
  abs = val & 0x7fff;
  if ((abs < den) && (!(abs == 0x0))) {
    float err = fmaf(-fb, fv, fa);
    fv = fmaf(rcp, err, fv);
    val = __pgi_conv_f2h_rn(fv);
  }
  return val;
}

__ATTRIBUTES unsigned short
__pgi_half_neg(unsigned short a)
{
  return __pgi_half_sub_rn(0, a);
}

__ATTRIBUTES unsigned short
__pgi_half_sin_rn_approx(unsigned short h)
{
  float r1;
  r1 = sinf(__pgi_conv_h2f(h));
  return __pgi_conv_f2h_rn(r1);
}

__ATTRIBUTES int
__pgi_half_cmp_eq_ord(unsigned short a, unsigned short b)
{
  unsigned short val;
  asm volatile("{ .reg .pred __$temp3;\n" \
                "  setp.eq.f16  __$temp3, %1, %2;\n" \
                "  selp.u16 %0, 1, 0, __$temp3;}"
                : "=h"(val)  : "h"(a), "h"(b));
  return val ? -1 : 0;
}

__ATTRIBUTES int
__pgi_half_cmp_ne_ord(unsigned short a, unsigned short b)
{
  unsigned short val;
  asm volatile("{ .reg .pred __$temp3;\n" \
                "  setp.ne.f16  __$temp3, %1, %2;\n" \
                "  selp.u16 %0, 1, 0, __$temp3;}"
                : "=h"(val)  : "h"(a), "h"(b));
  return val ? -1 : 0;
}

__ATTRIBUTES int
__pgi_half_cmp_lt_ord(unsigned short a, unsigned short b)
{
  unsigned short val;
  asm volatile("{ .reg .pred __$temp3;\n" \
                "  setp.lt.f16  __$temp3, %1, %2;\n" \
                "  selp.u16 %0, 1, 0, __$temp3;}"
                : "=h"(val)  : "h"(a), "h"(b));
  return val ? -1 : 0;
}

__ATTRIBUTES int
__pgi_half_cmp_le_ord(unsigned short a, unsigned short b)
{
  unsigned short val;
  asm volatile("{ .reg .pred __$temp3;\n" \
                "  setp.le.f16  __$temp3, %1, %2;\n" \
                "  selp.u16 %0, 1, 0, __$temp3;}"
                : "=h"(val)  : "h"(a), "h"(b));
  return val ? -1 : 0;
}

__ATTRIBUTES int
__pgi_half_cmp_gt_ord(unsigned short a, unsigned short b)
{
  unsigned short val;
  asm volatile("{ .reg .pred __$temp3;\n" \
                "  setp.gt.f16  __$temp3, %1, %2;\n" \
                "  selp.u16 %0, 1, 0, __$temp3;}"
                : "=h"(val)  : "h"(a), "h"(b));
  return val ? -1 : 0;
}

__ATTRIBUTES int
__pgi_half_cmp_ge_ord(unsigned short a, unsigned short b)
{
  unsigned short val;
  asm volatile("{ .reg .pred __$temp3;\n" \
                "  setp.ge.f16  __$temp3, %1, %2;\n" \
                "  selp.u16 %0, 1, 0, __$temp3;}"
                : "=h"(val)  : "h"(a), "h"(b));
  return val ? -1 : 0;
}

__ATTRIBUTES int
__pgi_half_cmp_eq_unord(unsigned short a, unsigned short b)
{
  unsigned short val;
  asm volatile("{ .reg .pred __$temp3;\n" \
                "  setp.equ.f16  __$temp3, %1, %2;\n" \
                "  selp.u16 %0, 1, 0, __$temp3;}"
                : "=h"(val)  : "h"(a), "h"(b));
  return val ? -1 : 0;
}

__ATTRIBUTES int
__pgi_half_cmp_ne_unord(unsigned short a, unsigned short b)
{
  unsigned short val;
  asm volatile("{ .reg .pred __$temp3;\n" \
                "  setp.neu.f16  __$temp3, %1, %2;\n" \
                "  selp.u16 %0, 1, 0, __$temp3;}"
                : "=h"(val)  : "h"(a), "h"(b));
  return val ? -1 : 0;
}

__ATTRIBUTES int
__pgi_half_cmp_lt_unord(unsigned short a, unsigned short b)
{
  unsigned short val;
  asm volatile("{ .reg .pred __$temp3;\n" \
                "  setp.ltu.f16  __$temp3, %1, %2;\n" \
                "  selp.u16 %0, 1, 0, __$temp3;}"
                : "=h"(val)  : "h"(a), "h"(b));
  return val ? -1 : 0;
}

__ATTRIBUTES int
__pgi_half_cmp_le_unord(unsigned short a, unsigned short b)
{
  unsigned short val;
  asm volatile("{ .reg .pred __$temp3;\n" \
                "  setp.leu.f16  __$temp3, %1, %2;\n" \
                "  selp.u16 %0, 1, 0, __$temp3;}"
                : "=h"(val)  : "h"(a), "h"(b));
  return val ? -1 : 0;
}

__ATTRIBUTES int
__pgi_half_cmp_gt_unord(unsigned short a, unsigned short b)
{
  unsigned short val;
  asm volatile("{ .reg .pred __$temp3;\n" \
                "  setp.gtu.f16  __$temp3, %1, %2;\n" \
                "  selp.u16 %0, 1, 0, __$temp3;}"
                : "=h"(val)  : "h"(a), "h"(b));
  return val ? -1 : 0;
}

__ATTRIBUTES int
__pgi_half_cmp_ge_unord(unsigned short a, unsigned short b)
{
  unsigned short val;
  asm volatile("{ .reg .pred __$temp3;\n" \
                "  setp.geu.f16  __$temp3, %1, %2;\n" \
                "  selp.u16 %0, 1, 0, __$temp3;}"
                : "=h"(val)  : "h"(a), "h"(b));
  return val ? -1 : 0;
}

__ATTRIBUTES int
__pgi_half_isnan(unsigned short val)
{
  int significand;
  if (((val & 0x7C00) == 0x7C00) && ((val & 0x3FF) != 0)) {
    return -1;
  }
  return 0;
}

__ATTRIBUTES unsigned short
__pgi_half_max(unsigned short a, unsigned short b)
{
  /* implementation adheres to IEEE 754-2008 specification
   * for maxNum which says that if one of the operands is NaN,
   * the non-NaN value must be returned */
  if (__pgi_half_isnan(a) != 0) {
    return b;
  } else if (__pgi_half_isnan(b) != 0) {
    return a;
  } else {
    int cmpres = __pgi_half_cmp_gt_ord(a, b);
    return cmpres != 0 ? a : b;
  }
}

__ATTRIBUTES unsigned short
__pgi_half_min(unsigned short a, unsigned short b)
{
  /* implementation adheres to IEEE 754-2008 specification
   * for minNum which says that if one of the operands is NaN,
   * the non-NaN value must be returned */
  if (__pgi_half_isnan(a) != 0) {
    return b;
  } else if (__pgi_half_isnan(b) != 0) {
    return a;
  } else {
    int cmpres = __pgi_half_cmp_lt_ord(a, b);
    return cmpres != 0 ? a : b;
  }
}

__ATTRIBUTES unsigned short
__pgi_half_asin_rn(unsigned short h)
{
  float r1;
  r1 = asinf(__pgi_conv_h2f(h));
  return __pgi_conv_f2h_rn(r1);
}

__ATTRIBUTES unsigned short
__pgi_half_acos_rn(unsigned short h)
{
  float r1;
  r1 = acosf(__pgi_conv_h2f(h));
  return __pgi_conv_f2h_rn(r1);
}

__ATTRIBUTES unsigned short
__pgi_half_atan_rn(unsigned short h)
{
  float r1;
  r1 = atanf(__pgi_conv_h2f(h));
  return __pgi_conv_f2h_rn(r1);
}

__ATTRIBUTES unsigned short
__pgi_half_tan_rn(unsigned short h)
{
  float r1;
  r1 = tanf(__pgi_conv_h2f(h));
  return __pgi_conv_f2h_rn(r1);
}

__ATTRIBUTES unsigned short
__pgi_half_sinh_rn(unsigned short h)
{
  float r1;
  r1 = sinhf(__pgi_conv_h2f(h));
  return __pgi_conv_f2h_rn(r1);
}

__ATTRIBUTES unsigned short
__pgi_half_cosh_rn(unsigned short h)
{
  float r1;
  r1 = coshf(__pgi_conv_h2f(h));
  return __pgi_conv_f2h_rn(r1);
}

__ATTRIBUTES unsigned short
__pgi_half_tanh_rn(unsigned short h)
{
  float r1;
  r1 = tanhf(__pgi_conv_h2f(h));
  return __pgi_conv_f2h_rn(r1);
}

__ATTRIBUTES unsigned short
__pgi_half_atan2_rn(unsigned short h1, unsigned short h2)
{
  float r1;
  r1 = atan2f(__pgi_conv_h2f(h1), __pgi_conv_h2f(h2));
  return __pgi_conv_f2h_rn(r1);
}

__ATTRIBUTES unsigned short
__pgi_half_abs_rn(unsigned short h)
{
  return (unsigned short)(h & 0x7FFF);
}
#endif /* __NVHPC_AT_LEAST_PASCAL */

/* print a float */
__ATTRIBUTES int
__pgf90io_sc_hf_ldw(unsigned short hf, int ftype, signed char *p)
{
  int j, k, len;
  float f = __pgi_conv_h2f(hf);
  j = p[0];
  if ((j + 9 >= 83) || p[84]) {
    p[j] = '\n';
    p[j + 1] = '\0';
    __pgi_print_string(&p[1]);
    for (k = 1; k < 83; k++)
      p[k] = ' ';
    j = 2;
    p[0] = j;
    p[83] = '\0';
    p[84] = 0;
  }
  if (j != 2)
    j++;
  len = __pgf90io_sc_fp_ldw(f, p + j, 4, 0, 9);
  for (; len <= 9; len++)
    p[j + len] = ' ';
  p[0] = j + 10;
  return 0;
}

__ATTRIBUTES void
__c_bzero(int n, signed char *ptr)
{
  memset(ptr, 0, n);
}

#define DEFCXCOPY(cname,ctype) \
__ATTRIBUTES void \
cname(int n, ctype *dest, ctype *src) \
{ \
  if (dest + n <= src || src + n <= dest) { \
    memcpy(dest, src, n * sizeof(*src)); \
  } else { \
    int i; \
    for (i = 0; i < n; i++) { \
      dest[i] = src[i]; \
    } \
  } \
}

DEFCXCOPY(__c_bcopy,char)
DEFCXCOPY(__c_hcopy,short)
DEFCXCOPY(__c_wcopy,int)
DEFCXCOPY(__c_dcopy,long long)

#define DEFCMCOPY(cname,ctype) \
__ATTRIBUTES void \
cname(ctype *dest, ctype *src, size_t n) \
{ \
  if (dest + n <= src || src + n <= dest) { \
    memcpy(dest, src, n * sizeof(*src)); \
  } else { \
    size_t i; \
 \
    for (i = 0; i < n; i++) { \
      dest[i] = src[i]; \
    } \
  } \
}

DEFCMCOPY(__c_mcopy1,char)
DEFCMCOPY(__c_mcopy2,short)
DEFCMCOPY(__c_mcopy4,int)
DEFCMCOPY(__c_mcopy8,long long)


#define DEFCMCOPYBWD(cname,ctype) \
__ATTRIBUTES void \
cname(ctype *dest, ctype *src, size_t n) \
{ \
  ctype *ss_start = src - n + 1; \
  ctype *dd_start = dest - n + 1; \
 \
  if (dest + 1 <= ss_start || src + 1 <= ss_start) { \
    memcpy(dd_start, ss_start, n * sizeof(*src)); \
  } else { \
    size_t i; \
 \
    for (i = 0; i >= 1-n; i--) { \
      dest[i] = src[i]; \
    } \
  } \
}

DEFCMCOPYBWD(__c_mcopy1_bwd,char)
DEFCMCOPYBWD(__c_mcopy2_bwd,short)
DEFCMCOPYBWD(__c_mcopy4_bwd,int)
DEFCMCOPYBWD(__c_mcopy8_bwd,long long)

typedef struct {
  long long x[2];
} int128;

__ATTRIBUTES void
__c_mcopy16(long long *dest, long long *src, size_t n)
{
  int128 *dd = (int128 *)dest;
  int128 *ss = (int128 *)src;

  if (dd + n <= ss || ss + n <= dd) {
    memcpy(dd, ss, n * sizeof(*ss));
  } else {
    size_t i;

    for (i = 0; i < n; i++) {
      dd[i] = ss[i];
    }
  }
}

__ATTRIBUTES void
__c_mcopy16_bwd(long long *dest, long long *src, size_t n)
{
  int128 *dd = (int128 *)dest;
  int128 *ss = (int128 *)src;

  // The compiler provides pointers to the last element of each region.
  // Adjust to the first elements before deciding to call memcpy.
  int128 *ss_start = ss - n + 1;
  int128 *dd_start = dd - n + 1;

  if (dd + 1 <= ss_start || ss + 1 <= dd_start) {
    memcpy(dd_start, ss_start, n * sizeof(*ss));
  } else {
    size_t i;

    for (i = 0; i >= 1-n; i--) {
      dd[i] = ss[i];
    }
  }
}

__ATTRIBUTES void
__c_mset1(char *dest, int value, long n)
{
  memset(dest, value, n);
}

__ATTRIBUTES void
__c_mset2(short *dest, int value, long n)
{
  long i;

  for (i = 0; i < n; ++i) {
    dest[i] = value;
  }
}

__ATTRIBUTES void
__c_mset4(int *dest, int value, long n)
{
  long i;

  for (i = 0; i < n; ++i) {
    dest[i] = value;
  }
}

__ATTRIBUTES void
__c_mset8(long *dest, long value, long n)
{
  long i;

  for (i = 0; i < n; ++i) {
    dest[i] = value;
  }
}

__ATTRIBUTES void
__c_mset16(long long *dest, long long value1, long long value2, long n)
{
  long i;

  for (i = 0; i < 2*n; i += 2) {
    dest[i]   = value1;
    dest[i+1] = value2;
  }
}

#define DEFMZERO(cname,sz,ctype) \
__ATTRIBUTES void \
cname##sz(ctype *dest, size_t n) \
{ \
  memset(dest, 0, sz*n); \
}

DEFMZERO(__c_mzero,1,char)
DEFMZERO(__c_mzero,2,short)
DEFMZERO(__c_mzero,4,int)
DEFMZERO(__c_mzero,8,long)
DEFMZERO(__c_mzero,16,long long)

/* Delete this function later as the its real one is implemented in nvomp rtl.
 * It is added as workaround for fs25572
 * */
__ATTRIBUTES int
__kmpc_global_thread_num(void *loc)
{
  return 0;
}


/*
 * double double add
 *
 * When we are doing double to double double High Precision Reduction Sums
 *  we use "double double precision"
 * "double double precision" arithmatic is different from IEEE-754 floating point standard (Quadruple-precision).
 * A "double double" number is represented with sum of 2 double variables:
 *  https://en.wikipedia.org/wiki/Quadruple-precision_floating-point_format#Double-double_arithmetic
 * and they have 106-bit significand that makes them slightly less precise than IEEE-754.
 * However, "double double" arithmetic are faster than IEEE-754 arithmetic.
*/
// Adds 2 double double number, and returns a double double
__ATTRIBUTES void
__pgi_add_dd_dd_cuda(double *dd1_h, double *dd1_l, double *dd2_h, double *dd2_l)
{
  double xh,xl,yh,yl;
  xh=*dd1_h;xl=*dd1_l; yh =*dd2_h;yl=*dd2_l;
  double H, h, T, t, S, s, e, f;
  S = xh+yh; T = xl+yl; e = S-xh; f = T-xl; s = S-e; t = T-f;
  s = (yh-e)+(xh-s); t = (yl-f)+(xl-t);
  e = s+T; H = S+e; h = e+(S-H); e = t+h;
  *dd1_h = H+e; *dd1_l= e+(H-*dd1_h);
}

// Adds a double double and a double, and returns a double double
__ATTRIBUTES void
__pgi_add_dd_d_cuda(double *dd_h, double *dd_l, double *d)
{
  double yhi,ylo,x;
  x = *d; yhi = *dd_h; ylo = *dd_l;
  double H, h, S, s, e;
  S = x+yhi; e = S-x; s = S-e;
  s = (yhi-e)+(x-s);
  e = s+ylo;
  H = S+e; h = e+(S-H);
  *dd_h = H+h; *dd_l = h+(H-*dd_h);
}

// Adds a double and double double number, and returns a double
__ATTRIBUTES void
__pgi_add_d_dd_cuda(double *d, double *dd_h, double *dd_l)
{
  double yhi,ylo,x;
  x = *d; yhi = *dd_h; ylo = *dd_l;
  double H, h, S, s, e;
  S = x+yhi; e = S-x; s = S-e;
  s = (yhi-e)+(x-s);
  e = s+ylo;
  H = S+e; h = e+(S-H);
  *d = H+h;
  // = h+(H-*dd_h); to convert double double to double, we ignore lower precision part
}

/* IEEE Arithmetic */

#define ieee_positive_zero 0
#define ieee_negative_zero 1
#define ieee_positive_denormal 2
#define ieee_negative_denormal 3
#define ieee_positive_normal 4
#define ieee_negative_normal 5
#define ieee_positive_inf 6
#define ieee_negative_inf 7
#define ieee_signaling_nan 8
#define ieee_quiet_nan 9
#define ieee_other_value 15

__ATTRIBUTES int
__pgi_ieee_class_dev_r2(signed char *x)
{
  unsigned int iexp, imant;
  unsigned int ix = (unsigned int) *((unsigned short *) x);
  int ieee_classr2;
  iexp = ix >> 10;
  iexp = iexp & 0x1f;
  if (iexp == 0) {
    if (ix == 0) {
      ieee_classr2 = ieee_positive_zero;
    } else if ((ix & 0x7fff) == 0) {
      ieee_classr2 = ieee_negative_zero;
    } else if ((ix & 0x8000) == 0) {
      ieee_classr2 = ieee_positive_denormal;
    } else {
      ieee_classr2 = ieee_negative_denormal;
    }
  } else if (iexp < 31) {
    if ((ix & 0x8000) == 0) {
      ieee_classr2 = ieee_positive_normal;
    } else {
      ieee_classr2 = ieee_negative_normal;
    }
  } else {
    imant = ix & 0x3ff;
    if (imant == 0) {
      if ((ix & 0x8000) == 0) {
        ieee_classr2 = ieee_positive_inf;
      } else {
        ieee_classr2 = ieee_negative_inf;
      }
    } else if ((ix & 0x200) != 0) {
      ieee_classr2 = ieee_quiet_nan;
    } else {
      ieee_classr2 = ieee_signaling_nan;
    }
  }
  return ieee_classr2;
}

__ATTRIBUTES int
__pgi_ieee_class_dev_r4(signed char *x)
{
  unsigned int iexp, imant;
  unsigned int ix = *((int *) x);
  int ieee_classr4;
  iexp = ix >> 23;
  iexp = iexp & 0xff;
  if (iexp == 0) {
    if (ix == 0) {
      ieee_classr4 = ieee_positive_zero;
    } else if ((ix & 0x7fffffff) == 0) {
      ieee_classr4 = ieee_negative_zero;
    } else if ((ix & 0x80000000) == 0) {
      ieee_classr4 = ieee_positive_denormal;
    } else {
      ieee_classr4 = ieee_negative_denormal;
    }
  } else if (iexp < 255) {
    if ((ix & 0x80000000) == 0) {
      ieee_classr4 = ieee_positive_normal;
    } else {
      ieee_classr4 = ieee_negative_normal;
    }
  } else {
    imant = ix & 0x7fffff;
    if (imant == 0) {
      if ((ix & 0x80000000) == 0) {
        ieee_classr4 = ieee_positive_inf;
      } else {
        ieee_classr4 = ieee_negative_inf;
      }
    } else if ((ix & 0x400000) != 0) {
      ieee_classr4 = ieee_quiet_nan;
    } else {
      ieee_classr4 = ieee_signaling_nan;
    }
  }
  return ieee_classr4;
}

__ATTRIBUTES int
__pgi_ieee_class_dev_r8(signed char *x)
{
  unsigned int ix, iy, iexp, imant;
  int ieee_classr8;
  unsigned int *iz = (unsigned int *)x;
  ix = iz[1];  iy = iz[0];
  iexp = ix >> 20;
  iexp = iexp & 0x7ff;
  if (iexp == 0) {
    if ((ix == 0) && (iy == 0)) {
      ieee_classr8 = ieee_positive_zero;
    } else if (((ix & 0x7fffffff) == 0) && (iy == 0)) {
      ieee_classr8 = ieee_negative_zero;
    } else if ((ix & 0x80000000) == 0) {
      ieee_classr8 = ieee_positive_denormal;
    } else {
      ieee_classr8 = ieee_negative_denormal;
    }
  } else if (iexp < 2047) {
    if ((ix & 0x80000000) == 0) {
      ieee_classr8 = ieee_positive_normal;
    } else {
      ieee_classr8 = ieee_negative_normal;
    }
  } else {
    imant = (ix & 0xfffff) | iy;
    if (imant == 0) {
      if ((ix & 0x80000000) == 0) {
        ieee_classr8 = ieee_positive_inf;
      } else {
        ieee_classr8 = ieee_negative_inf;
      }
    } else if ((ix & 0x80000) != 0) {
      ieee_classr8 = ieee_quiet_nan;
    } else {
      ieee_classr8 = ieee_signaling_nan;
    }
  }
  return ieee_classr8;
}

/* ieee_copy_sign */
__ATTRIBUTES unsigned short
__pgi_ieee_copy_sign_dev_r2(signed char *x, signed char *y)
{
  unsigned short ix = *((unsigned short *) x);
  unsigned short iy = *((unsigned short *) y);
  return (unsigned short)((ix & 0x7FFF) | (iy & 0x8000));
}

__ATTRIBUTES float
__pgi_ieee_copy_sign_dev_r4(signed char *x, signed char *y)
{
  unsigned int ix = *((unsigned int *) x);
  unsigned int iy = *((unsigned int *) y);
  unsigned int iz =  (ix & 0x7FFFFFFF) | (iy & 0x80000000);
  return __int_as_float((int)iz);
}

__ATTRIBUTES double
__pgi_ieee_copy_sign_dev_r8(signed char *x, signed char *y)
{
  unsigned int iz[2];
  unsigned int *ix = (unsigned int *) x;
  unsigned int *iy = (unsigned int *) y;
  iz[0] =  ix[0];
  iz[1] =  (ix[1] & 0x7FFFFFFF) | (iy[1] & 0x80000000);
  return __longlong_as_double(*((unsigned long long *) iz));
}

/* ieee_is_finite */
__ATTRIBUTES int
__pgi_ieee_is_finite_dev_r2(signed char *x)
{
  int classtype = __pgi_ieee_class_dev_r2(x);
  if (classtype < 6) {
    return 1;
  }
  return 0;
}

__ATTRIBUTES int
__pgi_ieee_is_finite_dev_r4(signed char *x)
{
  int classtype = __pgi_ieee_class_dev_r4(x);
  if (classtype < 6) {
    return 1;
  }
  return 0;
}

__ATTRIBUTES int
__pgi_ieee_is_finite_dev_r8(signed char *x)
{
  int classtype = __pgi_ieee_class_dev_r8(x);
  if (classtype < 6) {
    return 1;
  }
  return 0;
}

/* ieee_is_nan */
__ATTRIBUTES int
__pgi_ieee_is_nan_dev_r2(signed char *x)
{
  int classtype = __pgi_ieee_class_dev_r2(x);
  if ((classtype == 8) || (classtype == 9)) {
    return 1;
  }
  return 0;
}

__ATTRIBUTES int
__pgi_ieee_is_nan_dev_r4(signed char *x)
{
  int classtype = __pgi_ieee_class_dev_r4(x);
  if ((classtype == 8) || (classtype == 9)) {
    return 1;
  }
  return 0;
}

__ATTRIBUTES int
__pgi_ieee_is_nan_dev_r8(signed char *x)
{
  int classtype = __pgi_ieee_class_dev_r8(x);
  if ((classtype == 8) || (classtype == 9)) {
    return 1;
  }
  return 0;
}

/* ieee_is_negative */
__ATTRIBUTES int
__pgi_ieee_is_negative_dev_r2(signed char *x)
{
  int classtype = __pgi_ieee_class_dev_r2(x);
  if ((classtype < 8) && ((classtype & 1) == 1)) {
    return 1;
  }
  return 0;
}

__ATTRIBUTES int
__pgi_ieee_is_negative_dev_r4(signed char *x)
{
  int classtype = __pgi_ieee_class_dev_r4(x);
  if ((classtype < 8) && ((classtype & 1) == 1)) {
    return 1;
  }
  return 0;
}

__ATTRIBUTES int
__pgi_ieee_is_negative_dev_r8(signed char *x)
{
  int classtype = __pgi_ieee_class_dev_r8(x);
  if ((classtype < 8) && ((classtype & 1) == 1)) {
    return 1;
  }
  return 0;
}

/* ieee_is_normal */
__ATTRIBUTES int
__pgi_ieee_is_normal_dev_r2(signed char *x)
{
  int classtype = __pgi_ieee_class_dev_r2(x);
  if ((classtype < 6) && ((classtype & 2) == 0)) {
    return 1;
  }
  return 0;
}

__ATTRIBUTES int
__pgi_ieee_is_normal_dev_r4(signed char *x)
{
  int classtype = __pgi_ieee_class_dev_r4(x);
  if ((classtype < 6) && ((classtype & 2) == 0)) {
    return 1;
  }
  return 0;
}

__ATTRIBUTES int
__pgi_ieee_is_normal_dev_r8(signed char *x)
{
  int classtype = __pgi_ieee_class_dev_r8(x);
  if ((classtype < 6) && ((classtype & 2) == 0)) {
    return 1;
  }
  return 0;
}

/* ieee_value */
__ATTRIBUTES unsigned short
__pgi_ieee_value_dev_r2(signed char *x, signed char *cl)
{
  int ix, ic;
  ix = (int) *((signed short *) x);
  ic = (int) *((signed short *) cl);
  if (ic == ieee_positive_zero)     ix = 0x0000;
  if (ic == ieee_negative_zero)     ix = 0x8000;
  if (ic == ieee_positive_denormal) ix = 0x0200;
  if (ic == ieee_negative_denormal) ix = 0x8200;
  if (ic == ieee_positive_normal)   ix = 0x3c00;
  if (ic == ieee_negative_normal)   ix = 0xbc00;
  if (ic == ieee_positive_inf)      ix = 0x7c00;
  if (ic == ieee_negative_inf)      ix = 0xfc00;
  if (ic == ieee_signaling_nan)     ix = 0x7d00;
  if (ic == ieee_quiet_nan)         ix = 0x7e00;
  return ((unsigned short) ix);
}

__ATTRIBUTES float
__pgi_ieee_value_dev_r4(signed char *x, signed char *cl)
{
  int ix, ic;
  ix = *((int *) x);
  ic = *((int *) cl);
  if (ic == ieee_positive_zero)     ix = 0x00000000;
  if (ic == ieee_negative_zero)     ix = 0x80000000;
  if (ic == ieee_positive_denormal) ix = 0x00400000;
  if (ic == ieee_negative_denormal) ix = 0x80400000;
  if (ic == ieee_positive_normal)   ix = 0x3f800000;
  if (ic == ieee_negative_normal)   ix = 0xbf800000;
  if (ic == ieee_positive_inf)      ix = 0x7f800000;
  if (ic == ieee_negative_inf)      ix = 0xff800000;
  if (ic == ieee_signaling_nan)     ix = 0x7fa00000;
  if (ic == ieee_quiet_nan)         ix = 0x7fc00000;
  return __int_as_float(ix);
}

__ATTRIBUTES double
__pgi_ieee_value_dev_r8(signed char *x, signed char *cl)
{
  int ix, ic;
  unsigned int *iz = (unsigned int *)x;
  ix = iz[1];
  ic = *((int *) cl);
  if (ic == ieee_positive_zero)     ix = 0x00000000;
  if (ic == ieee_negative_zero)     ix = 0x80000000;
  if (ic == ieee_positive_denormal) ix = 0x00080000;
  if (ic == ieee_negative_denormal) ix = 0x80080000;
  if (ic == ieee_positive_normal)   ix = 0x3ff00000;
  if (ic == ieee_negative_normal)   ix = 0xbff00000;
  if (ic == ieee_positive_inf)      ix = 0x7ff00000;
  if (ic == ieee_negative_inf)      ix = 0xfff00000;
  if (ic == ieee_signaling_nan)     ix = 0x7ff40000;
  if (ic == ieee_quiet_nan)         ix = 0x7ff80000;
  iz[1] = ix;  iz[0] = 0;
  return __longlong_as_double(*((long long *) iz));
}

/* ieee_logb */
__ATTRIBUTES unsigned short
__pgi_ieee_logb_dev_r2(signed char *x)
{
  unsigned int iexp, imant, ibitp;
  int ix = (int) *((unsigned short *) x);
  iexp = ix >> 10;
  iexp = iexp & 0x1f;
  if ((ix & 0x7fff) == 0) {
    ix = ieee_negative_inf;
    return __pgi_ieee_value_dev_r2(x, (signed char *) &ix);
  } else if (iexp == 0) {
    imant = ix & 0x3ff;
    ibitp = 0x0400;
    while (imant < ibitp) {
      ibitp = ibitp >> 1;
      iexp = iexp - 1;
    }
    return (__pgi_conv_f2h_rn((float) iexp - 14));
  } else if (iexp == 31) {
    return *((unsigned short *) x);
  } else {
    return (__pgi_conv_f2h_rn((float) iexp - 15));
  }
}

__ATTRIBUTES float
__pgi_ieee_logb_dev_r4(signed char *x)
{
  unsigned int iexp, imant, ibitp;
  int ix = *((int *) x);
  iexp = ix >> 23;
  iexp = iexp & 0xff;
  if ((ix & 0x7fffffff) == 0) {
    ix = ieee_negative_inf;
    return __pgi_ieee_value_dev_r4(x, (signed char *) &ix);
  } else if (iexp == 0) {
    imant = ix & 0x7fffff;
    ibitp = 0x00800000;
    while (imant < ibitp) {
      ibitp = ibitp >> 1;
      iexp = iexp - 1;
    }
    return ((float) iexp - 126);
  } else if (iexp == 255) {
    return __int_as_float((int) ix);
  } else {
    return ((float) iexp - 127);
  }
}

__ATTRIBUTES double
__pgi_ieee_logb_dev_r8(signed char *x)
{
  unsigned int ix, iy, iexp, imant, ibitp;
  unsigned int *iz = (unsigned int *)x;
  ix = iz[1];  iy = iz[0];
  iexp = ix >> 20;
  iexp = iexp & 0x7ff;
  if (((ix & 0x7fffffff) == 0) && (iy == 0)) {
    ix = ieee_negative_inf;
    return __pgi_ieee_value_dev_r8(x, (signed char *) &ix);
  } else if (iexp == 0) {
    imant = ix & 0x0fffff;
    if (imant != 0) {
      ibitp = 0x00100000;
      while (imant < ibitp) {
        ibitp = ibitp >> 1;
        iexp = iexp - 1;
      }
    } else {
      iexp = iexp - 20;
      imant = (iy >> 12) & 0x0fffff;
      if (imant != 0) {
        ibitp = 0x00100000;
        while (imant < ibitp) {
          ibitp = ibitp >> 1;
          iexp = iexp - 1;
        }
      } else {
        iexp = iexp - 20;
        imant = iy & 0x0fff;
        ibitp = 0x01000;
        while (imant < ibitp) {
          ibitp = ibitp >> 1;
          iexp = iexp - 1;
        }
      }
    }
    return ((double) iexp - 1022);
  } else if (iexp == 2047) {
    return __longlong_as_double(*((long long *) x));
  } else {
    return ((double) iexp - 1023);
  }
}

/* ieee_next_after */
__ATTRIBUTES float
__pgi_ieee_next_after_dev_r4(signed char *x, signed char *y)
{
  return nextafterf(*((float *)x), *((float *)y));
}

__ATTRIBUTES double
__pgi_ieee_next_after_dev_r8(signed char *x, signed char *y)
{
  return nextafter(*((double *)x), *((double *)y));
}

/* ieee_rem */
__ATTRIBUTES float
__pgi_ieee_rem_dev_4x4(signed char *x, signed char *y)
{
  return remainderf(*((float *)x), *((float *)y));
}

__ATTRIBUTES double
__pgi_ieee_rem_dev_4x8(signed char *x, signed char *y)
{
  return remainder(((double)*((float *)x)), *((double *)y));
}

__ATTRIBUTES double
__pgi_ieee_rem_dev_8x4(signed char *x, signed char *y)
{
  return remainder(*((double *)x), ((double)*((float *)y)));
}

__ATTRIBUTES double
__pgi_ieee_rem_dev_8x8(signed char *x, signed char *y)
{
  return remainder(*((double *)x), *((double *)y));
}

/* ieee_rint */
__ATTRIBUTES float
__pgi_ieee_rint_dev_r4(signed char *x)
{
  return nearbyintf(*((float *)x));
}

__ATTRIBUTES double
__pgi_ieee_rint_dev_r8(signed char *x)
{
  return nearbyint(*((double *)x));
}

/* ieee_scalb */
__ATTRIBUTES float
__pgi_ieee_scalb_dev_r4(signed char *x, signed char *y)
{
  return scalbnf(*((float *)x), *((int *)y));
}

__ATTRIBUTES double
__pgi_ieee_scalb_dev_r8(signed char *x, signed char *y)
{
  return scalbn(*((float *)x), *((int *)y));
}

/* ieee_unordered */
__ATTRIBUTES int
__pgi_ieee_unordered_dev_r4(signed char *x, signed char *y)
{
  int classtype = __pgi_ieee_class_dev_r4(x);
  if ((classtype == 8) || (classtype == 9)) {
    return 1;
  }
  classtype = __pgi_ieee_class_dev_r4(y);
  if ((classtype == 8) || (classtype == 9)) {
    return 1;
  }
  return 0;
}

__ATTRIBUTES int
__pgi_ieee_unordered_dev_r8(signed char *x, signed char *y)
{
  int classtype = __pgi_ieee_class_dev_r8(x);
  if ((classtype == 8) || (classtype == 9)) {
    return 1;
  }
  classtype = __pgi_ieee_class_dev_r8(y);
  if ((classtype == 8) || (classtype == 9)) {
    return 1;
  }
  return 0;
}

/* Overloaded assignment for dealing with the */
/* ieee_class_type struct/type                */
__ATTRIBUTES void
__pgi_ieee_eqct(signed char *x, signed char *y)
{
  *((unsigned int *) x) = *((unsigned int *) y);
}

__ATTRIBUTES int
__pgi_ieee_eqtct(signed char *x, signed char *y)
{
  if (*((unsigned int *) x) == *((unsigned int *) y)) {
    return 1;
  }
  return 0;
}

__ATTRIBUTES int
__pgi_ieee_netct(signed char *x, signed char *y)
{
  if (*((unsigned int *) x) != *((unsigned int *) y)) {
    return 1;
  }
  return 0;
}

__ATTRIBUTES void
pgf90_mmul_real4(int ta, int tb, long long lm, long long ln, long long lk,
    signed char *alpha, signed char *a, long long lda,
    signed char *b, long long ldb, signed char *beta,
    signed char *c, long long ldc)
{
  int i, j, k; 
  float fsum, fva, fvb;
  float *fa = (float *)a;
  float *fb = (float *)b;
  float *fc = (float *)c;
  for (j = 0; j < (int)ln; j++) {
    for (i = 0; i < (int)lm; i++) {
      fsum = 0.0f;
      for (k = 0; k < (int)lk; k++) {
        if (ta == 0) {
          fva = fa[((int)lda)*k+i];
	} else {
          fva = fa[((int)lda)*i+k];
        }
        if (tb == 0) {
          fvb = fb[((int)ldb)*j+k];
	} else {
          fvb = fb[((int)ldb)*k+j];
        }
	fsum += fva * fvb;
      }
      fc[((int)ldc)*j+i] = fsum;
    }
  }
}

__ATTRIBUTES void
pgf90_mmul_real8(int ta, int tb, long long lm, long long ln, long long lk,
    signed char *alpha, signed char *a, long long lda,
    signed char *b, long long ldb, signed char *beta,
    signed char *c, long long ldc)
{
  int i, j, k;
  double fsum, fva, fvb;
  double *fa = (double *)a;
  double *fb = (double *)b;
  double *fc = (double *)c;
  for (j = 0; j < (int)ln; j++) {
    for (i = 0; i < (int)lm; i++) {
      fsum = 0.0;
      for (k = 0; k < (int)lk; k++) {
        if (ta == 0) {
          fva = fa[((int)lda)*k+i];
	} else {
          fva = fa[((int)lda)*i+k];
        }
        if (tb == 0) {
          fvb = fb[((int)ldb)*j+k];
	} else {
          fvb = fb[((int)ldb)*k+j];
        }
	fsum += fva * fvb;
      }
      fc[((int)ldc)*j+i] = fsum;
    }
  }
}

__ATTRIBUTES void
pgf90_mmul_cmplx8(int ta, int tb, long long lm, long long ln, long long lk,
    signed char *alpha, signed char *a, long long lda,
    signed char *b, long long ldb, signed char *beta,
    signed char *c, long long ldc)
{
  int i, j, k; 
  float fsumrr, fsumri, fsumir, fsumii;
  float fvar, fvai, fvbr, fvbi;
  float *fa = (float *)a;
  float *fb = (float *)b;
  float *fc = (float *)c;
  for (j = 0; j < (int)ln; j++) {
    for (i = 0; i < (int)lm; i++) {
      fsumrr = fsumri = fsumir = fsumii = 0.0f;
      for (k = 0; k < (int)lk; k++) {
        if (ta == 0) {
          fvar = fa[((int)lda)*2*k+2*i];
          fvai = fa[((int)lda)*2*k+2*i+1];
	} else {
          fvar = fa[((int)lda)*2*i+2*k];
          fvai = fa[((int)lda)*2*i+2*k+1];
        }
        if (tb == 0) {
          fvbr = fb[((int)ldb)*2*j+2*k];
          fvbi = fb[((int)ldb)*2*j+2*k+1];
	} else {
          fvbr = fb[((int)ldb)*2*k+2*j];
          fvbi = fb[((int)ldb)*2*k+2*j+1];
        }
	fsumrr += fvar * fvbr;
	fsumri += fvar * fvbi;
	fsumir += fvai * fvbr;
	fsumii += fvai * fvbi;
      }
      fc[((int)ldc)*2*j+2*i]   = fsumrr - fsumii;
      fc[((int)ldc)*2*j+2*i+1] = fsumri + fsumir;
    }
  }
}

__ATTRIBUTES void
pgf90_mmul_cmplx16(int ta, int tb, long long lm, long long ln, long long lk,
    signed char *alpha, signed char *a, long long lda,
    signed char *b, long long ldb, signed char *beta,
    signed char *c, long long ldc)
{
  int i, j, k; 
  double fsumrr, fsumri, fsumir, fsumii;
  double fvar, fvai, fvbr, fvbi;
  double *fa = (double *)a;
  double *fb = (double *)b;
  double *fc = (double *)c;
  for (j = 0; j < (int)ln; j++) {
    for (i = 0; i < (int)lm; i++) {
      fsumrr = fsumri = fsumir = fsumii = 0.0;
      for (k = 0; k < (int)lk; k++) {
        if (ta == 0) {
          fvar = fa[((int)lda)*2*k+2*i];
          fvai = fa[((int)lda)*2*k+2*i+1];
	} else {
          fvar = fa[((int)lda)*2*i+2*k];
          fvai = fa[((int)lda)*2*i+2*k+1];
        }
        if (tb == 0) {
          fvbr = fb[((int)ldb)*2*j+2*k];
          fvbi = fb[((int)ldb)*2*j+2*k+1];
	} else {
          fvbr = fb[((int)ldb)*2*k+2*j];
          fvbi = fb[((int)ldb)*2*k+2*j+1];
        }
	fsumrr += fvar * fvbr;
	fsumri += fvar * fvbi;
	fsumir += fvai * fvbr;
	fsumii += fvai * fvbi;
      }
      fc[((int)ldc)*2*j+2*i]   = fsumrr - fsumii;
      fc[((int)ldc)*2*j+2*i+1] = fsumri + fsumir;
    }
  }
}

__ATTRIBUTES void
pgf90_check_deallocate_i8(void *ptr, unsigned long long size,
    const char *varname, const char *filename, unsigned long long lineno)
{
  /* There is nothing to check on the device. */
}
#if !defined(PGI_COMPILE_BITCODE)
#include "nvhpc_utils_runtime.h"
#endif /* !defined(PGI_COMPILE_BITCODE) */
