ru peng
2018-07-31 13:36:46 UTC
I have checked my code many times and made sure that it does not exist, but I don't
know which part of the code causes this error. Can anyone tell me? Any help would be
appreciated.
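(As a side note on localizing this kind of problem: a minimal sketch, not part of the model
code, assuming Theano's test-value mechanism; the variable x is only an illustration. With
compute_test_value enabled, every node is evaluated on concrete test data while the graph is
built, so the traceback points at the line that creates the offending node instead of
surfacing later inside scan or grad.)

import numpy as np
import theano
from theano import tensor
theano.config.compute_test_value = 'raise'  # evaluate each new node on its inputs' test values
x = tensor.matrix('x')  # hypothetical example variable
x.tag.test_value = np.zeros((7, 5), dtype='float32')  # concrete value used for the eager check
y = tensor.dot(x, x)  # a shape error like this one raises here, at the line that builds the node

The full code is below.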
import numpy as np
from theano import tensor
import theano
from collections import OrderedDict
theano.config.floatX = "float32"
profile=False
theano.config.exception_verbosity='high'
theano.config.optimizer='None'
def ortho_weight(ndim):
    W = np.random.randn(ndim, ndim)
    u, s, v = np.linalg.svd(W)  # singular value decomposition of the ndim*ndim matrix W: W = U S V^H
    return u.astype(theano.config.floatX)
def norm_weight(nin, nout=None, scale=0.01, ortho=True):
    if nout is None:
        nout = nin
    if nout == nin and ortho:
        W = ortho_weight(nin)  # return a nin*nin orthogonal matrix from the SVD
    else:
        W = scale * np.random.randn(nin, nout)  # random nin*nout numpy matrix, all entries scaled by scale
    return W.astype(theano.config.floatX)
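# e.g. norm_weight(4, 8) gives a 4*8 float32 matrix drawn from 0.01*N(0, 1),
# while norm_weight(4) (square and ortho=True) gives an orthogonal 4*4 matrix.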
def init_tparams(params):
    tparams = OrderedDict()
    for kk, pp in params.items():
        tparams[kk] = theano.shared(params[kk], name=kk)  # shared variables
    return tparams
def itemlist(tparams):
    return [vv for kk, vv in tparams.items()]
# params
params = OrderedDict() # Create an ordered dictionary
context_mask = np.matrix([[1., 1., 1., 1., 1.],
                          [1., 1., 1., 1., 1.],
                          [1., 1., 1., 1., 1.],
                          [1., 1., 1., 1., 1.],
                          [1., 1., 1., 1., 0.],
                          [1., 1., 0., 1., 0.],
                          [1., 0., 0., 0., 0.]]).astype(theano.config.floatX)  # n_steps_src = 7, n_samples = 5
mask = np.matrix([[1., 1., 1., 1., 1.],
                  [1., 1., 1., 1., 1.],
                  [1., 1., 1., 1., 1.],
                  [1., 1., 1., 1., 1.],
                  [1., 1., 1., 1., 0.],
                  [1., 1., 0., 1., 0.],
                  [1., 0., 0., 0., 0.]]).astype(theano.config.floatX)
D = 1
numPositions = 2*D+1 # 3
nsteps_src = context_mask.shape[0]
nsteps_trg = mask.shape[0]
n_samples = context_mask.shape[1]
n_words = nsteps_trg*n_samples
dim_word = 6
params['Wemb_dec'] = norm_weight(n_words, dim_word) # 35*6
emb = theano.shared(params['Wemb_dec']) # shared variable
emb = emb.reshape([nsteps_trg, n_samples, dim_word])
emb_shifted = tensor.zeros_like(emb)
emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1]) # shift by one timestep: the first row becomes zeros, the rest are unchanged
emb = emb_shifted
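# so the decoder input at step t is the embedding of target word t-1 (all zeros at t = 0)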
cc_ = 0.01*np.random.randn(nsteps_src,n_samples,4).astype(theano.config.floatX)
dim = cc_.shape[2] # dim = 4
context_mask = theano.shared(context_mask) # shared variable
context_mask1 = context_mask # also passed as a non-sequence, because <srclen> is computed from <context_mask> inside scan()
mask = theano.shared(mask)
cc_ = theano.shared(cc_)
# Weight matrices and bias terms for the feedforward layers
params['ff_state_W'] = norm_weight(dim, dim, scale=0.01, ortho=True)
params['ff_state_b'] = np.zeros((dim,)).astype(theano.config.floatX)
params['ff_logit_lstm_W'] = norm_weight(dim, dim_word, scale=0.01, ortho=False)
params['ff_logit_lstm_b'] = np.zeros((dim_word,)).astype(theano.config.floatX)
params['ff_logit_prev_W'] = norm_weight(dim_word, dim_word, scale=0.01, ortho=False)
params['ff_logit_prev_b'] = np.zeros((dim_word,)).astype(theano.config.floatX)
params['ff_logit_ctx_W'] = norm_weight(dim, dim_word, scale=0.01, ortho=False)
params['ff_logit_ctx_b'] = np.zeros((dim_word,)).astype(theano.config.floatX)
params['ff_logit_W'] = norm_weight(dim_word, n_words, scale=0.01, ortho=True)
params['ff_logit_b'] = np.zeros((n_words,)).astype(theano.config.floatX)
# These parameters will be used for Gated Recurrent Unit(GRU) in scan().
params['W'] = np.concatenate([norm_weight(dim_word, dim),norm_weight(dim_word, dim)], axis=1) # dim_word*2dim
params['b'] = np.zeros((2 * dim,)).astype(theano.config.floatX) # 2dim
params['U'] = np.concatenate([ortho_weight(dim),ortho_weight(dim)], axis=1) # 2 svd matrices (size :dim*dim) connected by columns
params['Wx'] = norm_weight(dim_word, dim) # dim_word*dim
params['Ux'] = ortho_weight(dim) # svd matrices (size :dim*dim)
params['bx'] = np.zeros((dim,)).astype(theano.config.floatX) # dim
params['U_nl'] = np.concatenate([ortho_weight(dim),ortho_weight(dim)], axis=1) # 2 svd matrices (size :dim*dim) connected by columns
params['b_nl'] = np.zeros((2 * dim,)).astype(theano.config.floatX) # 2dim
params['Ux_nl'] = ortho_weight(dim) # svd matrices (size :dim*dim)
params['bx_nl'] = np.zeros((dim,)).astype(theano.config.floatX) # dim
params['Wc'] = norm_weight(dim, dim*2) # dim * 2dim
params['Wcx'] = norm_weight(dim, dim) # svd matrices (size :dim*dim)
params['W_comb_att'] = norm_weight(dim, dim) # svd matrices (size :dim*dim)
params['Wc_att'] = norm_weight(dim) # svd matrices (size :dim*dim)
params['b_att'] = np.zeros((dim,)).astype(theano.config.floatX) # dim
params['U_att'] = norm_weight(dim, 1) # dim*1
params['c_tt'] = np.zeros((1,)).astype(theano.config.floatX)
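# Parameter roles: W, b, U / Wx, bx, Ux drive the first GRU sub-layer (gates / candidate state);
# U_nl, b_nl, Ux_nl, bx_nl, Wc, Wcx drive the second, context-conditioned sub-layer;
# W_comb_att, U_att, c_tt predict the attention position and Wc_att, b_att score the
# windowed source annotations; the ff_* weights form the output and init-state layers.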
tparams = init_tparams(params)
def _slice(_x, n, dim):  # slice() for tensors: take [:, n*dim:(n+1)*dim] (or [:, :, n*dim:(n+1)*dim] for 3-d input)
    if _x.ndim == 3:
        return _x[:, :, n * dim:(n + 1) * dim]
    return _x[:, n * dim:(n + 1) * dim]
ctx_mean = (cc_ * context_mask[:, :, None]).sum(0) / context_mask.sum(0)[:, None]
init_state = tensor.tanh(tensor.dot(ctx_mean,tparams['ff_state_W'])+tparams['ff_state_b']) # Initialization of h_ in _step_slice of scan()
state_belowx = tensor.dot(emb, tparams['Wx']) +tparams['bx'] # state_belowx = emb*tparams['Wx']+tparams['bx']
state_below_ = tensor.dot(emb, tparams['W']) + tparams['b'] # state_below_ = emb*tparams['W']+tparams['b']
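# state_below_ and state_belowx are the input projections for the GRU gates and the
# candidate state; they are precomputed for all target timesteps before scan().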
def _step_slice(m, m_, x_, xx_, h_, ctx_, alpha_, cc_, context_mask1,
                Wc_att, b_att, U, Wc, W_comb_att, U_att, c_tt, Ux, Wcx,
                U_nl, Ux_nl, b_nl, bx_nl):
    preact1 = tensor.dot(h_, U)
    preact1 += x_
    preact1 = tensor.nnet.sigmoid(preact1)
    r1 = _slice(preact1, 0, dim)
    u1 = _slice(preact1, 1, dim)
    preactx1 = tensor.dot(h_, Ux)
    preactx1 *= r1
    preactx1 += xx_
    h1 = tensor.tanh(preactx1)
    h1 = u1 * h_ + (1. - u1) * h1
    h1 = m_[:, None] * h1 + (1. - m_)[:, None] * h_  # h1: hidden state of the first GRU sub-layer, batchsize*dim
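    # Local attention: predict a source position pstate_ from h1, then attend only
    # over a window of numPositions = 2*D + 1 source annotations around it
    # (in the spirit of Luong et al.'s local-p attention).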
    srclen = (context_mask1.sum(0, keepdims=True) - 1)  # context_mask1.sum(0) - 1: source sentence lengths for this batch, 1*batchsize
    pctx__ = tensor.tanh(tensor.dot(h1, W_comb_att))
    pctx__ = tensor.dot(pctx__, U_att) + c_tt  # batchsize*1
    pstate_ = srclen * tensor.nnet.sigmoid(pctx__.T) + 1  # +1: eos is counted; 1*batchsize
    srcPositions = tensor.floor(pstate_)  # 1*batchsize
    srcPositions = tensor.cast(srcPositions, dtype='int32')  # srcPositions is an index, so cast to int, e.g. 3.6 --> 3
    unmaskedId = tensor.flatnonzero(m)  # indices of samples not masked at this source timestep, length batchsize-x
    srcPositions = srcPositions[:, unmaskedId]  # 1*(batchsize-x)
    srclen = srclen[:, unmaskedId]  # 1*(batchsize-x)
    startAttnIds = srcPositions - D
    indicesAll = startAttnIds.repeat(numPositions, axis=0)
    indicesAll += tensor.mgrid[0:numPositions, 0:unmaskedId.shape[0]][0]  # numPositions*(batchsize-x)
    indicesAll = indicesAll.flatten()  # length numPositions*(batchsize-x)
    # Drop window positions (centred at pstate_, radius D) that fall below 0
    # or beyond the sentence length.
    includeIds = (indicesAll <= srclen.repeat(numPositions, axis=0).flatten()) & (indicesAll >= 0)
    indicesAll = indicesAll[includeIds]  # length numPositions*(batchsize-x) - y
    indicesSub = tensor.arange(0, numPositions).repeat(unmaskedId.shape[0])  # window offsets, length numPositions*(batchsize-x)
    indicesSub = indicesSub[includeIds]  # length numPositions*(batchsize-x) - y
    unmaskedIds = tensor.tile(unmaskedId, numPositions)  # sample indices, length numPositions*(batchsize-x)
    unmaskedIds = unmaskedIds[includeIds]  # length numPositions*(batchsize-x) - y
    srcVecsSub = tensor.zeros([numPositions * n_samples, dim])  # 15*4
    linearIdSub = indicesSub * n_samples + unmaskedIds  # linear indices into srcVecsSub
    linearIdAll = indicesAll * n_samples + unmaskedIds  # linear indices into the flattened cc_
    cc_ = tensor.reshape(cc_, [nsteps_src * n_samples, dim])  # 35*4
    srcVecsSub = tensor.set_subtensor(srcVecsSub[linearIdSub, :], cc_[linearIdAll, :])  # (numPositions*n_samples)*dim
    srcVecsSub = srcVecsSub.reshape([numPositions, n_samples, dim])
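    # srcVecsSub[j, i, :] now holds the source annotation at window offset j for sample i
    # (rows stay zero where the window runs past the start or end of the sentence).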
    e_ij = tensor.dot(srcVecsSub, Wc_att) + b_att  # equivalent to W_a*hs + b_a, numPositions*batchSize*dim
    e_ij = (h1 * e_ij).sum(2)  # equivalent to h_t' * (W_a*hs + b_a), numPositions*batchSize
    scaleX = (indicesAll - pstate_.repeat(numPositions, axis=0).flatten()[unmaskedIds]) / (D / 2)  # (window position - predicted position) / (D/2), one value per kept entry
    distWeights = tensor.zeros([numPositions, n_samples])
    distWeights = tensor.set_subtensor(distWeights[indicesSub, unmaskedIds], scaleX)  # numPositions*batchSize
    alpha = e_ij * tensor.exp(-0.5 * tensor.square(distWeights))
    alpha = alpha - alpha.max(0)  # subtract the max element for numerical stability
    alpha = tensor.exp(alpha)  # numPositions*batchSize
    context_mask_ = tensor.zeros([numPositions, n_samples])
    context_mask_ = tensor.set_subtensor(context_mask_[indicesSub, unmaskedIds], 1.)
    if context_mask_ is not None:  # context_mask_: the source mask restricted to the numPositions*batchSize window
        alpha = alpha * context_mask_
    alpha_sum = alpha.sum(0, keepdims=True)  # sum over the window positions (softmax denominator)
    alpha_sum = tensor.switch(alpha_sum, alpha_sum, 1.)  # if alpha_sum == 0 -> 1
    alpha = alpha / alpha_sum  # numPositions*batchSize, normalized weights
    # current context: (numPositions*batchSize*dim).sum(0) --> batchSize*dim
    ctx_ = (srcVecsSub * alpha[:, :, None]).sum(0)
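    # Net effect: alpha is a masked softmax over the window of
    # e_ij * exp(-0.5 * ((s - pstate_) / (D/2))^2), so positions outside the
    # sentence receive zero weight.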
    # second GRU sub-layer, conditioned on the attention context ctx_
    preact2 = tensor.dot(h1, U_nl) + b_nl
    preact2 += tensor.dot(ctx_, Wc)
    preact2 = tensor.nnet.sigmoid(preact2)
    r2 = _slice(preact2, 0, dim)
    u2 = _slice(preact2, 1, dim)
    preactx2 = tensor.dot(h1, Ux_nl) + bx_nl
    preactx2 *= r2
    preactx2 += tensor.dot(ctx_, Wcx)
    h2 = tensor.tanh(preactx2)
    h2 = u2 * h1 + (1. - u2) * h2
    h2 = m_[:, None] * h2 + (1. - m_)[:, None] * h1
    return h2, ctx_, alpha.T
seqs = [context_mask, mask, state_below_, state_belowx]
shared_vars = [tparams['Wc_att'],
               tparams['b_att'],
               tparams['U'],
               tparams['Wc'],
               tparams['W_comb_att'],
               tparams['U_att'],
               tparams['c_tt'],
               tparams['Ux'],
               tparams['Wcx'],
               tparams['U_nl'],
               tparams['Ux_nl'],
               tparams['b_nl'],
               tparams['bx_nl']]
rval, updates = theano.scan(_step_slice,
                            sequences=seqs,
                            outputs_info=[init_state,
                                          tensor.alloc(0., n_samples, cc_.shape[2]),
                                          tensor.alloc(0., n_samples, cc_.shape[0])],
                            non_sequences=[cc_, context_mask1] + shared_vars,
                            name='layers',
                            n_steps=nsteps_trg,
                            profile=profile,
                            strict=True)
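# rval[0]: decoder hidden states per target timestep (nsteps_trg x n_samples x dim)
# rval[1]: attention context vectors per target timestep (nsteps_trg x n_samples x dim)
# rval[2]: attention weights per target timestep (not used below)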
proj_h = rval[0]
ctxs = rval[1]
# three feedforward layers combining the contributions of proj_h, emb and ctxs to the logits
logit_lstm = tensor.dot(proj_h,tparams['ff_logit_lstm_W'])+tparams['ff_logit_lstm_b']
logit_prev = tensor.dot(emb,tparams['ff_logit_prev_W'])+tparams['ff_logit_prev_b']
logit_ctx = tensor.dot(ctxs,tparams['ff_logit_ctx_W'])+tparams['ff_logit_ctx_b']
logit = tensor.tanh(logit_lstm+logit_prev+logit_ctx)
# a feedforward neural network layer + softmax
logit = tensor.dot(logit,tparams['ff_logit_W'])+tparams['ff_logit_b']
logit_shp = logit.shape
probs = tensor.nnet.softmax(logit.reshape([logit_shp[0] * logit_shp[1],
                                           logit_shp[2]]))
# cost
cost = -tensor.log(probs.flatten())
cost = cost.reshape([mask.shape[0], mask.shape[1]])
cost = (cost * mask).sum(0)
# make mean(), tensor->scalar
cost = cost.mean()
print('Computing gradient...', end=" ")
grads = tensor.grad(cost, wrt=itemlist(tparams),disconnected_inputs = 'ignore')
print('Done')
print(grads[1].eval())