Discussion:
[theano-users] GPUArrayException
1***@lnmiit.ac.in
2017-11-05 08:28:46 UTC
Permalink
I am getting this error on using simple dense layer model or CNN model
using keras. Please help

GpuArrayException: ('The following error happened while compiling the
node', GpuAdvancedIncSubtensor1_dev20{inplace=True,
set_instead_of_inc=False}(GpuAlloc<None>{memset_0=True}.0, GpuReshape{2}.0,
GpuReshape{1}.0), '\n', u'CUDA kernel compile failure ::\n0001\t#define
local_barrier() __syncthreads()\n0002\t#define WITHIN_KERNEL extern "C"
__device__\n0003\t#define KERNEL extern "C" __global__\n0004\t#define
GLOBAL_MEM /* empty */\n0005\t#define LOCAL_MEM __shared__\n0006\t#define
LOCAL_MEM_ARG /* empty */\n0007\t#ifdef NAN\n0008\t#undef
NAN\n0009\t#endif\n0010\t#define NAN
__int_as_float(0x7fffffff)\n0011\t#ifdef INFINITY\n0012\t#undef
INFINITY\n0013\t#endif\n0014\t#define INFINITY
__int_as_float(0x7f800000)\n0015\t#define LID_0 threadIdx.x\n0016\t#define
LID_1 threadIdx.y\n0017\t#define LID_2 threadIdx.z\n0018\t#define LDIM_0
blockDim.x\n0019\t#define LDIM_1 blockDim.y\n0020\t#define LDIM_2
blockDim.z\n0021\t#define GID_0 blockIdx.x\n0022\t#define GID_1
blockIdx.y\n0023\t#define GID_2 blockIdx.z\n0024\t#define GDIM_0
gridDim.x\n0025\t#define GDIM_1 gridDim.y\n0026\t#define GDIM_2
gridDim.z\n0027\t#define ga_bool unsigned char\n0028\t#define ga_byte
signed char\n0029\t#define ga_ubyte unsigned char\n0030\t#define ga_short
short\n0031\t#define ga_ushort unsigned short\n0032\t#define ga_int
int\n0033\t#define ga_uint unsigned int\n0034\t#define ga_long long
long\n0035\t#define ga_ulong unsigned long long\n0036\t#define ga_float
float\n0037\t#define ga_double double\n0038\t#define ga_half
ga_ushort\n0039\t#define ga_size size_t\n0040\t#define ga_ssize
ptrdiff_t\n0041\t#define load_half(p) __half2float(*(p))\n0042\t#define
store_half(p, v) (*(p) = __float2half_rn(v))\n0043\t#define
GA_DECL_SHARED_PARAM(type, name)\n0044\t#define GA_DECL_SHARED_BODY(type,
name) extern __shared__ type name[];\n0045\t#define GA_WARP_SIZE
warpSize\n0046\t#line 1\n0047\t\n0048\t/*\n0049\t * This is an atomicAdd
that works for doubles since that is not provided\n0050\t * natively by
cuda.\n0051\t */\n0052\t__device__ ga_double atomicAdd(ga_double* address,
ga_double val) {\n0053\t unsigned long long int* address_as_ull
=\n0054\t (unsigned long long
int*)address;\n0055\t unsigned long long int old = *address_as_ull,
assumed;\n0056\t do {\n0057\t assumed = old;\n0058\t old =
atomicCAS(address_as_ull, assumed,\n0059\t
__double_as_longlong(val +\n0060\t
__longlong_as_double(assumed)));\n0061\t } while (assumed !=
old);\n0062\t return
__longlong_as_double(old);\n0063\t}\n0064\t\n0065\t__device__ ga_double
atomicExch(ga_double *address, ga_double val) {\n0066\t return
atomicExch((unsigned long long int *)address,\n0067\t
__double_as_longlong(val));\n0068\t}\n0069\t\n0070\t/*\n0071\t * This is a
version of atomicAdd that works for half-floats. It may\n0072\t * read and
write 2 bytes more than the size of the array if the array\n0073\t * has an
uneven number of elements. The actual value at that spot\n0074\t * will
not be modified.\n0075\t */\n0076\t\n0077\t__device__ ga_half
atomicAdd(ga_half *addr, ga_half val) {\n0078\t ga_uint *base = (ga_uint
*)((ga_size)addr & ~2);\n0079\t ga_uint old, assumed, sum, new_;\n0080\t
old = *base;\n0081\t do {\n0082\t assumed = old;\n0083\t sum =
__float2half_rn(\n0084\t __half2float(val) +\n0085\t
__half2float((ga_half)__byte_perm(old, 0,\n0086\t
((ga_size)addr & 2) ? 0x4432 : 0x4410)));\n0087\t new_ =
__byte_perm(old, sum, ((ga_size)addr & 2) ? 0x5410 : 0x3254);\n0088\t
old = atomicCAS(base, assumed, new_);\n0089\t } while (assumed !=
old);\n0090\t return (ga_half)__byte_perm(old, 0,\n0091\t
((ga_size)addr & 2) ? 0x4432 :
0x4410);\n0092\t}\n0093\t\n0094\t__device__ ga_half atomicExch(ga_half
*addr, ga_half val) {\n0095\t ga_uint *base = (ga_uint *)((ga_size)addr &
~2);\n0096\t ga_uint old, assumed, new_;\n0097\t old = *base;\n0098\t do
{\n0099\t assumed = old;\n0100\t new_ = __byte_perm(old, val,
((ga_size)addr & 2) ? 0x5410 : 0x3254);\n0101\t old = atomicCAS(base,
assumed, new_);\n0102\t } while (assumed != old);\n0103\t return
(ga_half)__byte_perm(old, 0,\n0104\t
((ga_size)addr & 2) ? 0x4432 : 0x4410);\n0105\t}\n0106\t\n0107\t
KERNEL void k_vector_add_fast(const ga_size numRowsX,\n0108\t
const ga_size numColsX,\n0109\t
const ga_ssize stridesX0,\n0110\t
const ga_ssize stridesX1,\n0111\t
ga_float *X,\n0112\t const
ga_size offset_X,\n0113\t const
ga_size numRowsY,\n0114\t const
ga_size numColsY,\n0115\t const
ga_ssize stridesY0,\n0116\t const
ga_ssize stridesY1,\n0117\t ga_float
*Y,\n0118\t const ga_size
offset_Y,\n0119\t const ga_size
numIndices,\n0120\t const ga_ssize
stridesIndices,\n0121\t ga_int
*indices_arr,\n0122\t const ga_size
offset_indices_arr,\n0123\t const int
set_instead_of_inc,\n0124\t ga_int
*err)\n0125\t {\n0126\t X = (ga_float *)(((char
*)X)+offset_X);\n0127\t Y = (ga_float *)(((char
*)Y)+offset_Y);\n0128\t indices_arr = (ga_int *)(((char
*)indices_arr)+offset_indices_arr);\n0129\t for (int i =
(blockIdx.x); i < numIndices; i += gridDim.x)\n0130\t
{\n0131\t for(int j = (threadIdx.x); j < numColsX;j +=
blockDim.x)\n0132\t {\n0133\t
ga_ssize x_row = indices_arr[i * stridesIndices];\n0134\t
if (x_row < 0)\n0135\t x_row +=
numRowsX;\n0136\t ga_ssize y_row = i;\n0137\t
if (x_row < numRowsX && x_row >= 0) {\n0138\t
if (set_instead_of_inc) {\n0139\t
atomicExch(&X[(x_row * stridesX0) + (j * stridesX1)],\n0140\t
Y[(y_row * stridesY0) + (j * stridesY1)]);\n0141\t
} else {\n0142\t
atomicAdd(&X[(x_row * stridesX0) + (j * stridesX1)],\n0143\t
Y[(y_row * stridesY0) + (j * stridesY1)]);\n0144\t
}\n0145\t } else {\n0146\t
*err = 1;\n0147\t }\n0148\t
}\n0149\t }\n0150\t return;\n0151\t
}\n0152\t \n\nCompile log:\nNVRTC compile
log::\ndefault_program(38): error: identifier "__half2float" is
undefined\n\ndefault_program(37): error: identifier "__float2half_rn" is
undefined\n\n2 errors detected in the compilation of
"default_program".\n\n')
--
---
You received this message because you are subscribed to the Google Groups "theano-users" group.
To unsubscribe from this group and stop receiving emails from it, send an email to theano-users+***@googlegroups.com.
For more options, visit https://groups.google.com/d/optout.
Frédéric Bastien
2017-11-10 13:18:51 UTC
Permalink
You seem to be mixing Theano and libgpuarray versions. You may even be mixing
different libgpuarray installations.

Make sure to uninstall all libgpuarray versions. Depending on how you
installed it, you can look at this:

http://deeplearning.net/software/libgpuarray/installation.html#step-by-step-install-user-library

Then install the 1.0rc1 version with the instructions on this page:

https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end(gpuarray)
Post by 1***@lnmiit.ac.in
I am getting this error on using simple dense layer model or CNN model
using keras. Please help
GpuArrayException: ('The following error happened while compiling the
node', GpuAdvancedIncSubtensor1_dev20{inplace=True,
set_instead_of_inc=False}(GpuAlloc<None>{memset_0=True}.0, GpuReshape{2}.0,
GpuReshape{1}.0), '\n', u'CUDA kernel compile failure ::\n0001\t#define
local_barrier() __syncthreads()\n0002\t#define WITHIN_KERNEL extern "C"
__device__\n0003\t#define KERNEL extern "C" __global__\n0004\t#define
GLOBAL_MEM /* empty */\n0005\t#define LOCAL_MEM __shared__\n0006\t#define
LOCAL_MEM_ARG /* empty */\n0007\t#ifdef NAN\n0008\t#undef
NAN\n0009\t#endif\n0010\t#define NAN
__int_as_float(0x7fffffff)\n0011\t#ifdef INFINITY\n0012\t#undef
INFINITY\n0013\t#endif\n0014\t#define INFINITY
__int_as_float(0x7f800000)\n0015\t#define LID_0 threadIdx.x\n0016\t#define
LID_1 threadIdx.y\n0017\t#define LID_2 threadIdx.z\n0018\t#define LDIM_0
blockDim.x\n0019\t#define LDIM_1 blockDim.y\n0020\t#define LDIM_2
blockDim.z\n0021\t#define GID_0 blockIdx.x\n0022\t#define GID_1
blockIdx.y\n0023\t#define GID_2 blockIdx.z\n0024\t#define GDIM_0
gridDim.x\n0025\t#define GDIM_1 gridDim.y\n0026\t#define GDIM_2
gridDim.z\n0027\t#define ga_bool unsigned char\n0028\t#define ga_byte
signed char\n0029\t#define ga_ubyte unsigned char\n0030\t#define ga_short
short\n0031\t#define ga_ushort unsigned short\n0032\t#define ga_int
int\n0033\t#define ga_uint unsigned int\n0034\t#define ga_long long
long\n0035\t#define ga_ulong unsigned long long\n0036\t#define ga_float
float\n0037\t#define ga_double double\n0038\t#define ga_half
ga_ushort\n0039\t#define ga_size size_t\n0040\t#define ga_ssize
ptrdiff_t\n0041\t#define load_half(p) __half2float(*(p))\n0042\t#define
store_half(p, v) (*(p) = __float2half_rn(v))\n0043\t#define
GA_DECL_SHARED_PARAM(type, name)\n0044\t#define GA_DECL_SHARED_BODY(type,
name) extern __shared__ type name[];\n0045\t#define GA_WARP_SIZE
warpSize\n0046\t#line 1\n0047\t\n0048\t/*\n0049\t * This is an atomicAdd
that works for doubles since that is not provided\n0050\t * natively by
cuda.\n0051\t */\n0052\t__device__ ga_double atomicAdd(ga_double* address,
ga_double val) {\n0053\t unsigned long long int* address_as_ull
=\n0054\t (unsigned long long
int*)address;\n0055\t unsigned long long int old = *address_as_ull,
assumed;\n0056\t do {\n0057\t assumed = old;\n0058\t old =
atomicCAS(address_as_ull, assumed,\n0059\t
__double_as_longlong(val +\n0060\t
__longlong_as_double(assumed)));\n0061\t } while (assumed !=
old);\n0062\t return
__longlong_as_double(old);\n0063\t}\n0064\t\n0065\t__device__ ga_double
atomicExch(ga_double *address, ga_double val) {\n0066\t return
atomicExch((unsigned long long int *)address,\n0067\t
__double_as_longlong(val));\n0068\t}\n0069\t\n0070\t/*\n0071\t * This is a
version of atomicAdd that works for half-floats. It may\n0072\t * read and
write 2 bytes more than the size of the array if the array\n0073\t * has an
uneven number of elements. The actual value at that spot\n0074\t * will
not be modified.\n0075\t */\n0076\t\n0077\t__device__ ga_half
atomicAdd(ga_half *addr, ga_half val) {\n0078\t ga_uint *base = (ga_uint
*)((ga_size)addr & ~2);\n0079\t ga_uint old, assumed, sum, new_;\n0080\t
old = *base;\n0081\t do {\n0082\t assumed = old;\n0083\t sum =
__float2half_rn(\n0084\t __half2float(val) +\n0085\t
__half2float((ga_half)__byte_perm(old, 0,\n0086\t
((ga_size)addr & 2) ? 0x4432 : 0x4410)));\n0087\t new_ =
__byte_perm(old, sum, ((ga_size)addr & 2) ? 0x5410 : 0x3254);\n0088\t
old = atomicCAS(base, assumed, new_);\n0089\t } while (assumed !=
old);\n0090\t return (ga_half)__byte_perm(old, 0,\n0091\t
0x4410);\n0092\t}\n0093\t\n0094\t__device__ ga_half atomicExch(ga_half
*addr, ga_half val) {\n0095\t ga_uint *base = (ga_uint *)((ga_size)addr &
~2);\n0096\t ga_uint old, assumed, new_;\n0097\t old = *base;\n0098\t do
{\n0099\t assumed = old;\n0100\t new_ = __byte_perm(old, val,
((ga_size)addr & 2) ? 0x5410 : 0x3254);\n0101\t old = atomicCAS(base,
assumed, new_);\n0102\t } while (assumed != old);\n0103\t return
(ga_half)__byte_perm(old, 0,\n0104\t
((ga_size)addr & 2) ? 0x4432 : 0x4410);\n0105\t}\n0106\t\n0107\t
KERNEL void k_vector_add_fast(const ga_size numRowsX,\n0108\t
const ga_size numColsX,\n0109\t
const ga_ssize stridesX0,\n0110\t
const ga_ssize stridesX1,\n0111\t
ga_float *X,\n0112\t const
ga_size offset_X,\n0113\t const
ga_size numRowsY,\n0114\t const
ga_size numColsY,\n0115\t const
ga_ssize stridesY0,\n0116\t const
ga_ssize stridesY1,\n0117\t ga_float
*Y,\n0118\t const ga_size
offset_Y,\n0119\t const ga_size
numIndices,\n0120\t const ga_ssize
stridesIndices,\n0121\t ga_int
*indices_arr,\n0122\t const ga_size
offset_indices_arr,\n0123\t const int
set_instead_of_inc,\n0124\t ga_int
*err)\n0125\t {\n0126\t X = (ga_float *)(((char
*)X)+offset_X);\n0127\t Y = (ga_float *)(((char
*)Y)+offset_Y);\n0128\t indices_arr = (ga_int *)(((char
*)indices_arr)+offset_indices_arr);\n0129\t for (int i =
(blockIdx.x); i < numIndices; i += gridDim.x)\n0130\t
{\n0131\t for(int j = (threadIdx.x); j < numColsX;j +=
blockDim.x)\n0132\t {\n0133\t
ga_ssize x_row = indices_arr[i * stridesIndices];\n0134\t
if (x_row < 0)\n0135\t x_row +=
numRowsX;\n0136\t ga_ssize y_row = i;\n0137\t
if (x_row < numRowsX && x_row >= 0) {\n0138\t
if (set_instead_of_inc) {\n0139\t
atomicExch(&X[(x_row * stridesX0) + (j * stridesX1)],\n0140\t
Y[(y_row * stridesY0) + (j * stridesY1)]);\n0141\t
} else {\n0142\t
atomicAdd(&X[(x_row * stridesX0) + (j * stridesX1)],\n0143\t
Y[(y_row * stridesY0) + (j * stridesY1)]);\n0144\t
}\n0145\t } else {\n0146\t
*err = 1;\n0147\t }\n0148\t
}\n0149\t }\n0150\t return;\n0151\t
}\n0152\t \n\nCompile log:\nNVRTC compile
log::\ndefault_program(38): error: identifier "__half2float" is
undefined\n\ndefault_program(37): error: identifier "__float2half_rn" is
undefined\n\n2 errors detected in the compilation of
"default_program".\n\n')
--
---
You received this message because you are subscribed to the Google Groups
"theano-users" group.
To unsubscribe from this group and stop receiving emails from it, send an
email to theano-users+***@googlegroups.com.
For more options, visit https://groups.google.com/d/optout.
--
---
You received this message because you are subscribed to the Google Groups "theano-users" group.
To unsubscribe from this group and stop receiving emails from it, send an email to theano-users+***@googlegroups.com.
For more options, visit https://groups.google.com/d/optout.
Loading...