[theano-users] ValueError: could not broadcast input array from shape (5,3) into shape (5,7)

ru peng

2018-07-31 13:36:46 UTC

I checked my code many times and made sure that it didn't exist. I don't
know which part of the code caused this error. Can anyone tell me? Any
help, be grateful.

import numpy as np
from theano import tensor
import theano
from collections import OrderedDict

theano.config.floatX = "float32"
profile=False

theano.config.exception_verbosity='high'
theano.config.optimizer='None'

def ortho_weight(ndim):
W = np.random.randn(ndim, ndim)
u, s, v = np.linalg.svd(W) # Singular value decomposition for matrix W of ndim*ndimïŒW=US(V*H)
return u.astype(theano.config.floatX)

def norm_weight(nin, nout=None, scale=0.01, ortho=True):
if nout is None:
nout = nin
if nout == nin and ortho:
W = ortho_weight(nin) # Return a nin*nin size Unitary Matrix after singular value decomposition
else:
W = scale * np.random.randn(nin, nout) # randomly generate a numpy matrix of nin*nout, scale all items inside
return W.astype(theano.config.floatX)

def init_tparams(params):
tparams = OrderedDict()
for kk, pp in params.items():
tparams[kk] = theano.shared(params[kk],name=kk) # shared variables

return tparams

def itemlist(tparams):
return [vv for kk, vv in tparams.items()]

# params
params = OrderedDict() # Create an ordered dictionary

context_mask = np.matrix([[1. ,1. ,1. ,1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 0.],
[1., 1., 0., 1., 0.],
[1., 0., 0., 0., 0.]]).astype(theano.config.floatX) # n_steps_src = 7 , n_samples = 5
mask = np.matrix([[1. ,1. ,1. ,1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 0.],
[1., 1., 0., 1., 0.],
[1., 0., 0., 0., 0.]]).astype(theano.config.floatX)

D = 1
numPositions = 2*D+1 # 3
nsteps_src = context_mask.shape[0]
nsteps_trg = mask.shape[0]
n_samples = context_mask.shape[1]
n_words = nsteps_trg*n_samples
dim_word = 6

params['Wemb_dec'] = norm_weight(n_words, dim_word) # 35*6
emb = theano.shared(params['Wemb_dec']) # shard variables
emb = emb.reshape([nsteps_trg, n_samples, dim_word])
emb_shifted = tensor.zeros_like(emb)
emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1]) # Set the first timestps to 0, others no change
emb = emb_shifted

cc_ = 0.01*np.random.randn(nsteps_src,n_samples,4).astype(theano.config.floatX)
dim = cc_.shape[2] # dim = 4

context_mask = theano.shared(context_mask) # shard variables
context_mask1 = context_mask # add to non-seuqences,cause will use <context_mask> when computed <srclen> in scan()
mask = theano.shared(mask)
cc_ = theano.shared(cc_)

# These weight matrix and bias term are to be used for feedforward neural network,Respectively
params['ff_state_W'] = norm_weight(dim, dim, scale=0.01, ortho=True) #
params['ff_state_b'] = np.zeros((dim,)).astype(theano.config.floatX)

params['ff_logit_lstm_W'] = norm_weight(dim, dim_word, scale=0.01, ortho=False)
params['ff_logit_lstm_b'] = np.zeros((dim_word,)).astype(theano.config.floatX)

params['ff_logit_prev_W'] = norm_weight(dim_word, dim_word, scale=0.01, ortho=False)
params['ff_logit_prev_b'] = np.zeros((dim_word,)).astype(theano.config.floatX)

params['ff_logit_ctx_W'] = norm_weight(dim, dim_word, scale=0.01, ortho=False)
params['ff_logit_ctx_b'] = np.zeros((dim_word,)).astype(theano.config.floatX)

params['ff_logit_W'] = norm_weight(dim_word, n_words, scale=0.01, ortho=True)
params['ff_logit_b'] = np.zeros((n_words,)).astype(theano.config.floatX)

# These parameters will be used for Gated Recurrent Unit(GRU) in scan().
params['W'] = np.concatenate([norm_weight(dim_word, dim),norm_weight(dim_word, dim)], axis=1) # dim_word*2dim
params['b'] = np.zeros((2 * dim,)).astype(theano.config.floatX) # 2dim

params['U'] = np.concatenate([ortho_weight(dim),ortho_weight(dim)], axis=1) # 2 svd matrices (size :dim*dim) connected by columns

params['Wx'] = norm_weight(dim_word, dim) # dim_word*dim

params['Ux'] = ortho_weight(dim) # svd matrices (size :dim*dim)
params['bx'] = np.zeros((dim,)).astype(theano.config.floatX) # dim

params['U_nl'] = np.concatenate([ortho_weight(dim),ortho_weight(dim)], axis=1) # 2 svd matrices (size :dim*dim) connected by columns
params['b_nl'] = np.zeros((2 * dim,)).astype(theano.config.floatX) # 2dim

params['Ux_nl'] = ortho_weight(dim) # svd matrices (size :dim*dim)
params['bx_nl'] = np.zeros((dim,)).astype(theano.config.floatX) # dim

params['Wc'] = norm_weight(dim, dim*2) # dim * 2dim
params['Wcx'] = norm_weight(dim, dim) # svd matrices (size :dim*dim)

params['W_comb_att'] = norm_weight(dim, dim) # svd matrices (size :dim*dim)
params['Wc_att'] = norm_weight(dim) # svd matrices (size :dim*dim)

params['b_att'] = np.zeros((dim,)).astype(theano.config.floatX) # dim

params['U_att'] = norm_weight(dim, 1) # dim*1
params['c_tt'] = np.zeros((1,)).astype(theano.config.floatX)

tparams = init_tparams(params)

def _slice(_x, n, dim): # slice() for tensor,get [:,n*dim:(n+1)*dim] of tensor
if _x.ndim == 3:
return _x[:, :, n * dim:(n + 1) * dim]
return _x[:, n * dim:(n + 1) * dim]

ctx_mean = (cc_ * context_mask[:, :, None]).sum(0) / context_mask.sum(0)[:, None]
init_state = tensor.tanh(tensor.dot(ctx_mean,tparams['ff_state_W'])+tparams['ff_state_b']) # Initialization of h_ in _step_slice of scan()

state_belowx = tensor.dot(emb, tparams['Wx']) +tparams['bx'] # state_belowx = emb*tparams['Wx']+tparams['bx']
state_below_ = tensor.dot(emb, tparams['W']) + tparams['b'] # state_below_ = emb*tparams['W']+tparams['b']

def _step_slice(m, m_, x_, xx_, h_, ctx_, alpha_,cc_,context_mask1,
Wc_att, b_att,U, Wc, W_comb_att, U_att, c_tt, Ux, Wcx,
U_nl, Ux_nl, b_nl, bx_nl):
preact1 = tensor.dot(h_, U)
preact1 += x_
preact1 = tensor.nnet.sigmoid(preact1)

r1 = _slice(preact1, 0, dim)
u1 = _slice(preact1, 1, dim)

preactx1 = tensor.dot(h_, Ux)
preactx1 *= r1
preactx1 += xx_

h1 = tensor.tanh(preactx1)

h1 = u1 * h_ + (1. - u1) * h1
h1 = m_[:, None] * h1 + (1. - m_)[:, None] * h_ # h1 is source hidden state ,batchsize*dim

srclen = (context_mask1.sum(0,keepdims=True)- 1) # context_mask.sum(0,keepdims=True)-1 Corresponding to source sentence length of this batch
pctx__ = tensor.tanh(tensor.dot(h1, W_comb_att))
pctx__ = tensor.dot(pctx__, U_att) + c_tt # batchsize*1
pstate_ = srclen * tensor.nnet.sigmoid(pctx__.T) + 1 # +1ïŒeos is considered
srcPositions = tensor.floor(pstate_) # batchsize*1
srcPositions = tensor.cast(srcPositions, dtype='int32') # srcPosition is index, so shoud cast to int,like 3.6-->3.0

unmaskedId = tensor.flatnonzero(m) # per timesteps ,take per row element of the source mask matrix as m ,(batchsize-x)
srcPositions = srcPositions[:,unmaskedId] # (batchsize-x)*1
srclen = srclen[:,unmaskedId] # (batchsize-x)*1
startAttnIds = srcPositions - D
indicesAll = startAttnIds.repeat(numPositions, axis=0)
indicesAll += tensor.mgrid[0:numPositions,0:unmaskedId.shape[0]][0] # (batchsize-x)*numPositions
indicesAll = indicesAll.flatten() # 1*(numPositions*(batchsize-x))

# Delete the source sentence index position centered on pstate_ and the window size D, which exceeds 0 or the maximum length of the sentence
includeIds = (indicesAll <= srclen.repeat(numPositions,axis=0).flatten()) & (indicesAll >= 0)
indicesAll = indicesAll[includeIds] # dimensional reductionïŒ1*((numPosition*batchsize-x)-y)

indicesSub = tensor.arange(0,numPositions).repeat(unmaskedId.shape[0]) # Scale-out numPositions times,1*(numPosition*batchsize-x)
indicesSub = indicesSub[includeIds] # 1*((numPosition*batchsize-x)-y)
unmaskedIds = tensor.tile(unmaskedId,numPositions) # Scale-out numPositions times,ïŒ1*(numPosition*batchsize-x)
unmaskedIds = unmaskedIds[includeIds] # 1*((numPosition*batchsize-x)-y)

srcVecsSub = tensor.zeros([numPositions*n_samples,dim]) # 15*3
linearIdSub = indicesSub*n_samples+ unmaskedIds # 1*((numPosition*batchsize-x)-y)
linearIdAll = indicesAll*n_samples+ unmaskedIds # 1*((numPosition*batchsize-x)-y)
cc_ = tensor.reshape(cc_,[nsteps_src * n_samples,dim]) # 35*3
srcVecsSub = tensor.set_subtensor(srcVecsSub[linearIdSub, :], cc_[linearIdAll, :]) # numPositions*n_samples*dim
srcVecsSub = srcVecsSub.reshape([numPositions, n_samples, dim])

e_ij = tensor.dot(srcVecsSub,Wc_att)+b_att # equivalent to eq:W_a*hs+b_a,numPositions * batchSize*dim
e_ij = (h1*e_ij).sum(2) # equivalent to eq:h_t' *(W_a*hs+b_a),numPositions * batchSize

scaleX = (indicesAll - pstate_.repeat(numPositions, axis=0).flatten()[unmaskedIds])/(D/2) # Select the number with the index [unmaskedId] from the vector of size size*numPosition,scaleX.shape == unmaskedId.shape
distWeights = tensor.zeros([numPositions,n_samples])
distWeights = tensor.set_subtensor(distWeights[indicesSub,unmaskedIds],scaleX) # numPositions*batchSize

alpha = e_ij * tensor.exp(-0.5*tensor.square(distWeights))
alpha = alpha - alpha.max(0) # subtract max elements
alpha = tensor.exp(alpha) # numPositions * batchSize

context_mask_ = tensor.zeros([numPositions, n_samples])
context_mask_ = tensor.set_subtensor(context_mask_[indicesSub, unmaskedIds], 1.)
if context_mask_: # context_mask =x_maskïŒnsteps_src * batchSizeïŒbe truncated to numPositions * batchSize
alpha = alpha * context_mask_
alpha_sum = alpha.sum(0,keepdims=True) # âeij
alpha_sum = tensor.switch(alpha_sum,alpha_sum,1.) # if alpua_sum = 0 ->1
alpha = alpha / alpha_sum # (numPositions * batchSize),eij/âeij

# current context,(numPositions*batchSize*dim).sum(0) -->batchSize*dim
ctx_ = (srcVecsSub * alpha[:, :, None]).sum(0)

preact2 = tensor.dot(h1, U_nl) + b_nl
preact2 += tensor.dot(ctx_, Wc)
preact2 = tensor.nnet.sigmoid(preact2)

r2 = _slice(preact2, 0, dim)
u2 = _slice(preact2, 1, dim)

preactx2 = tensor.dot(h1, Ux_nl) + bx_nl
preactx2 *= r2
preactx2 += tensor.dot(ctx_, Wcx)

h2 = tensor.tanh(preactx2)

h2 = u2 * h1 + (1. - u2) * h2
h2 = m_[:, None] * h2 + (1. - m_)[:, None] * h1

return h2, ctx_, alpha.T

seqs = [context_mask, mask, state_below_, state_belowx]

shared_vars = [tparams['Wc_att'],
tparams['b_att'],
tparams['U'],
tparams['Wc'],
tparams['W_comb_att'],
tparams['U_att'],
tparams['c_tt'],
tparams['Ux'],
tparams['Wcx'],
tparams['U_nl'],
tparams['Ux_nl'],
tparams['b_nl'],
tparams['bx_nl']]

rval, updates = theano.scan(_step_slice,
sequences=seqs,
outputs_info=[init_state,
tensor.alloc(0., n_samples,
cc_.shape[2]),
tensor.alloc(0., n_samples,
cc_.shape[0])],
non_sequences=[cc_,context_mask1]+shared_vars,
name='layers',
n_steps=nsteps_trg,
profile=profile,
strict=True)

proj_h = rval[0]
ctxs = rval[1]

# 3 feedforward neural networks,used to indicate the contribution of proj_h, emb, ctxs to cost
logit_lstm = tensor.dot(proj_h,tparams['ff_logit_lstm_W'])+tparams['ff_logit_lstm_b']

logit_prev = tensor.dot(emb,tparams['ff_logit_prev_W'])+tparams['ff_logit_prev_b']

logit_ctx = tensor.dot(ctxs,tparams['ff_logit_ctx_W'])+tparams['ff_logit_ctx_b']

logit = tensor.tanh(logit_lstm+logit_prev+logit_ctx)

# a feedforward neural network layer + softmax
logit = tensor.dot(logit,tparams['ff_logit_W'])+tparams['ff_logit_b']
logit_shp = logit.shape
probs = tensor.nnet.softmax(logit.reshape([logit_shp[0]*logit_shp[1],
logit_shp[2]]))
# cost
cost = -tensor.log(probs.flatten())
cost = cost.reshape([mask.shape[0], mask.shape[1]])
cost = (cost * mask).sum(0)

# make mean(), tensor->scalar
cost = cost.mean()

print('Computing gradient...', end=" ")
grads = tensor.grad(cost, wrt=itemlist(tparams),disconnected_inputs = 'ignore')
print('Done')
print(grads[1].eval())

--
---
You received this message because you are subscribed to the Google Groups "theano-users" group.
To unsubscribe from this group and stop receiving emails from it, send an email to theano-users+***@googlegroups.com.
For more options, visit https://groups.google.com/d/optout.

ru peng

2018-08-01 02:25:26 UTC

Permalink

I think it may be the same as the following question, from google+ theano
users.

"I'm getting an error computing gradients through a scan in which some
intermediate values of the scan function have different sizes in different
iterations (the inputs and outputs always have the same size). Here's a
minimal example:"

import numpy as np
import theano
import theano.tensor as T

d = 11
h = 7
W1 = theano.shared(name='W1', value=np.random.uniform(-0.1, 0.1, (d,h)))
W2 = theano.shared(name='W2', value=np.random.uniform(-0.1, 0.1, (h,)))

n = T.lscalar('n')
vecs = T.matrix('vecs')
inds = T.lmatrix('inds')
def recurrence(t, vecs, inds, W1, W2):
cur_inds = inds[T.eq(inds[:,0], t).nonzero()]
cur_vecs = vecs[cur_inds[:,1]]
hidden_layers = T.tanh(cur_vecs.dot(W1))
scores = hidden_layers.dot(W2)
return T.sum(scores)

results, _ = theano.scan(
fn=recurrence, sequences=[T.arange(n)], outputs_info=[None],
non_sequences=[vecs, inds, W1, W2], strict=True)

obj = T.sum(results)
grads = T.grad(obj, [W1, W2])
f = theano.function(inputs=[n, vecs, inds], outputs=grads)
vecs_in = np.ones((10, d))
inds_in = np.array([[0, 0], [1, 1], [1, 2], [2, 3], [3, 4], [3, 5], [3, 6], [3, 7], [4, 8], [4, 9]])
print f(5, vecs_in, inds_in)

A couple observations:

- There's no error if I turn off optimizations (theano.config.optimizer
= 'None')
- There's no error if I have a single layer and no hidden layer (i.e. if
scores = cur_vecs.dot(W) for W of the appropriate shape).

åš 2018å¹Ž7æ31æ¥ææäº UTC+8äžå9:36:46ïŒru pengåéïŒ

Post by ru peng
I checked my code many times and made sure that it didn't exist. I don't
know which part of the code caused this error. Can anyone tell me? Any
help, be grateful.
import numpy as np
from theano import tensor
import theano
from collections import OrderedDict
theano.config.floatX = "float32"
profile=False
theano.config.exception_verbosity='high'
theano.config.optimizer='None'
W = np.random.randn(ndim, ndim)
u, s, v = np.linalg.svd(W) # Singular value decomposition for matrix W of ndim*ndimïŒW=US(V*H)
return u.astype(theano.config.floatX)
nout = nin
W = ortho_weight(nin) # Return a nin*nin size Unitary Matrix after singular value decomposition
W = scale * np.random.randn(nin, nout) # randomly generate a numpy matrix of nin*nout, scale all items inside
return W.astype(theano.config.floatX)
tparams = OrderedDict()
tparams[kk] = theano.shared(params[kk],name=kk) # shared variables
return tparams
return [vv for kk, vv in tparams.items()]
# params
params = OrderedDict() # Create an ordered dictionary
context_mask = np.matrix([[1. ,1. ,1. ,1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 0.],
[1., 1., 0., 1., 0.],
[1., 0., 0., 0., 0.]]).astype(theano.config.floatX) # n_steps_src = 7 , n_samples = 5
mask = np.matrix([[1. ,1. ,1. ,1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 0.],
[1., 1., 0., 1., 0.],
[1., 0., 0., 0., 0.]]).astype(theano.config.floatX)
D = 1
numPositions = 2*D+1 # 3
nsteps_src = context_mask.shape[0]
nsteps_trg = mask.shape[0]
n_samples = context_mask.shape[1]
n_words = nsteps_trg*n_samples
dim_word = 6
params['Wemb_dec'] = norm_weight(n_words, dim_word) # 35*6
emb = theano.shared(params['Wemb_dec']) # shard variables
emb = emb.reshape([nsteps_trg, n_samples, dim_word])
emb_shifted = tensor.zeros_like(emb)
emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1]) # Set the first timestps to 0, others no change
emb = emb_shifted
cc_ = 0.01*np.random.randn(nsteps_src,n_samples,4).astype(theano.config.floatX)
dim = cc_.shape[2] # dim = 4
context_mask = theano.shared(context_mask) # shard variables
context_mask1 = context_mask # add to non-seuqences,cause will use <context_mask> when computed <srclen> in scan()
mask = theano.shared(mask)
cc_ = theano.shared(cc_)
# These weight matrix and bias term are to be used for feedforward neural network,Respectively
params['ff_state_W'] = norm_weight(dim, dim, scale=0.01, ortho=True) #
params['ff_state_b'] = np.zeros((dim,)).astype(theano.config.floatX)
params['ff_logit_lstm_W'] = norm_weight(dim, dim_word, scale=0.01, ortho=False)
params['ff_logit_lstm_b'] = np.zeros((dim_word,)).astype(theano.config.floatX)
params['ff_logit_prev_W'] = norm_weight(dim_word, dim_word, scale=0.01, ortho=False)
params['ff_logit_prev_b'] = np.zeros((dim_word,)).astype(theano.config.floatX)
params['ff_logit_ctx_W'] = norm_weight(dim, dim_word, scale=0.01, ortho=False)
params['ff_logit_ctx_b'] = np.zeros((dim_word,)).astype(theano.config.floatX)
params['ff_logit_W'] = norm_weight(dim_word, n_words, scale=0.01, ortho=True)
params['ff_logit_b'] = np.zeros((n_words,)).astype(theano.config.floatX)
# These parameters will be used for Gated Recurrent Unit(GRU) in scan().
params['W'] = np.concatenate([norm_weight(dim_word, dim),norm_weight(dim_word, dim)], axis=1) # dim_word*2dim
params['b'] = np.zeros((2 * dim,)).astype(theano.config.floatX) # 2dim
params['U'] = np.concatenate([ortho_weight(dim),ortho_weight(dim)], axis=1) # 2 svd matrices (size :dim*dim) connected by columns
params['Wx'] = norm_weight(dim_word, dim) # dim_word*dim
params['Ux'] = ortho_weight(dim) # svd matrices (size :dim*dim)
params['bx'] = np.zeros((dim,)).astype(theano.config.floatX) # dim
params['U_nl'] = np.concatenate([ortho_weight(dim),ortho_weight(dim)], axis=1) # 2 svd matrices (size :dim*dim) connected by columns
params['b_nl'] = np.zeros((2 * dim,)).astype(theano.config.floatX) # 2dim
params['Ux_nl'] = ortho_weight(dim) # svd matrices (size :dim*dim)
params['bx_nl'] = np.zeros((dim,)).astype(theano.config.floatX) # dim
params['Wc'] = norm_weight(dim, dim*2) # dim * 2dim
params['Wcx'] = norm_weight(dim, dim) # svd matrices (size :dim*dim)
params['W_comb_att'] = norm_weight(dim, dim) # svd matrices (size :dim*dim)
params['Wc_att'] = norm_weight(dim) # svd matrices (size :dim*dim)
params['b_att'] = np.zeros((dim,)).astype(theano.config.floatX) # dim
params['U_att'] = norm_weight(dim, 1) # dim*1
params['c_tt'] = np.zeros((1,)).astype(theano.config.floatX)
tparams = init_tparams(params)
def _slice(_x, n, dim): # slice() for tensor,get [:,n*dim:(n+1)*dim] of tensor
return _x[:, :, n * dim:(n + 1) * dim]
return _x[:, n * dim:(n + 1) * dim]
ctx_mean = (cc_ * context_mask[:, :, None]).sum(0) / context_mask.sum(0)[:, None]
init_state = tensor.tanh(tensor.dot(ctx_mean,tparams['ff_state_W'])+tparams['ff_state_b']) # Initialization of h_ in _step_slice of scan()
state_belowx = tensor.dot(emb, tparams['Wx']) +tparams['bx'] # state_belowx = emb*tparams['Wx']+tparams['bx']
state_below_ = tensor.dot(emb, tparams['W']) + tparams['b'] # state_below_ = emb*tparams['W']+tparams['b']
def _step_slice(m, m_, x_, xx_, h_, ctx_, alpha_,cc_,context_mask1,
Wc_att, b_att,U, Wc, W_comb_att, U_att, c_tt, Ux, Wcx,
preact1 = tensor.dot(h_, U)
preact1 += x_
preact1 = tensor.nnet.sigmoid(preact1)
r1 = _slice(preact1, 0, dim)
u1 = _slice(preact1, 1, dim)
preactx1 = tensor.dot(h_, Ux)
preactx1 *= r1
preactx1 += xx_
h1 = tensor.tanh(preactx1)
h1 = u1 * h_ + (1. - u1) * h1
h1 = m_[:, None] * h1 + (1. - m_)[:, None] * h_ # h1 is source hidden state ,batchsize*dim
srclen = (context_mask1.sum(0,keepdims=True)- 1) # context_mask.sum(0,keepdims=True)-1 Corresponding to source sentence length of this batch
pctx__ = tensor.tanh(tensor.dot(h1, W_comb_att))
pctx__ = tensor.dot(pctx__, U_att) + c_tt # batchsize*1
pstate_ = srclen * tensor.nnet.sigmoid(pctx__.T) + 1 # +1ïŒeos is considered
srcPositions = tensor.floor(pstate_) # batchsize*1
srcPositions = tensor.cast(srcPositions, dtype='int32') # srcPosition is index, so shoud cast to int,like 3.6-->3.0
unmaskedId = tensor.flatnonzero(m) # per timesteps ,take per row element of the source mask matrix as m ,(batchsize-x)
srcPositions = srcPositions[:,unmaskedId] # (batchsize-x)*1
srclen = srclen[:,unmaskedId] # (batchsize-x)*1
startAttnIds = srcPositions - D
indicesAll = startAttnIds.repeat(numPositions, axis=0)
indicesAll += tensor.mgrid[0:numPositions,0:unmaskedId.shape[0]][0] # (batchsize-x)*numPositions
indicesAll = indicesAll.flatten() # 1*(numPositions*(batchsize-x))
# Delete the source sentence index position centered on pstate_ and the window size D, which exceeds 0 or the maximum length of the sentence
includeIds = (indicesAll <= srclen.repeat(numPositions,axis=0).flatten()) & (indicesAll >= 0)
indicesAll = indicesAll[includeIds] # dimensional reductionïŒ1*((numPosition*batchsize-x)-y)
indicesSub = tensor.arange(0,numPositions).repeat(unmaskedId.shape[0]) # Scale-out numPositions times,1*(numPosition*batchsize-x)
indicesSub = indicesSub[includeIds] # 1*((numPosition*batchsize-x)-y)
unmaskedIds = tensor.tile(unmaskedId,numPositions) # Scale-out numPositions times,ïŒ1*(numPosition*batchsize-x)
unmaskedIds = unmaskedIds[includeIds] # 1*((numPosition*batchsize-x)-y)
srcVecsSub = tensor.zeros([numPositions*n_samples,dim]) # 15*3
linearIdSub = indicesSub*n_samples+ unmaskedIds # 1*((numPosition*batchsize-x)-y)
linearIdAll = indicesAll*n_samples+ unmaskedIds # 1*((numPosition*batchsize-x)-y)
cc_ = tensor.reshape(cc_,[nsteps_src * n_samples,dim]) # 35*3
srcVecsSub = tensor.set_subtensor(srcVecsSub[linearIdSub, :], cc_[linearIdAll, :]) # numPositions*n_samples*dim
srcVecsSub = srcVecsSub.reshape([numPositions, n_samples, dim])
e_ij = tensor.dot(srcVecsSub,Wc_att)+b_att # equivalent to eq:W_a*hs+b_a,numPositions * batchSize*dim
e_ij = (h1*e_ij).sum(2) # equivalent to eq:h_t' *(W_a*hs+b_a),numPositions * batchSize
scaleX = (indicesAll - pstate_.repeat(numPositions, axis=0).flatten()[unmaskedIds])/(D/2) # Select the number with the index [unmaskedId] from the vector of size size*numPosition,scaleX.shape == unmaskedId.shape
distWeights = tensor.zeros([numPositions,n_samples])
distWeights = tensor.set_subtensor(distWeights[indicesSub,unmaskedIds],scaleX) # numPositions*batchSize
alpha = e_ij * tensor.exp(-0.5*tensor.square(distWeights))
alpha = alpha - alpha.max(0) # subtract max elements
alpha = tensor.exp(alpha) # numPositions * batchSize
context_mask_ = tensor.zeros([numPositions, n_samples])
context_mask_ = tensor.set_subtensor(context_mask_[indicesSub, unmaskedIds], 1.)
if context_mask_: # context_mask =x_maskïŒnsteps_src * batchSizeïŒbe truncated to numPositions * batchSize
alpha = alpha * context_mask_
alpha_sum = alpha.sum(0,keepdims=True) # âeij
alpha_sum = tensor.switch(alpha_sum,alpha_sum,1.) # if alpua_sum = 0 ->1
alpha = alpha / alpha_sum # (numPositions * batchSize),eij/âeij
# current context,(numPositions*batchSize*dim).sum(0) -->batchSize*dim
ctx_ = (srcVecsSub * alpha[:, :, None]).sum(0)
preact2 = tensor.dot(h1, U_nl) + b_nl
preact2 += tensor.dot(ctx_, Wc)
preact2 = tensor.nnet.sigmoid(preact2)
r2 = _slice(preact2, 0, dim)
u2 = _slice(preact2, 1, dim)
preactx2 = tensor.dot(h1, Ux_nl) + bx_nl
preactx2 *= r2
preactx2 += tensor.dot(ctx_, Wcx)
h2 = tensor.tanh(preactx2)
h2 = u2 * h1 + (1. - u2) * h2
h2 = m_[:, None] * h2 + (1. - m_)[:, None] * h1
return h2, ctx_, alpha.T
seqs = [context_mask, mask, state_below_, state_belowx]
shared_vars = [tparams['Wc_att'],
tparams['b_att'],
tparams['U'],
tparams['Wc'],
tparams['W_comb_att'],
tparams['U_att'],
tparams['c_tt'],
tparams['Ux'],
tparams['Wcx'],
tparams['U_nl'],
tparams['Ux_nl'],
tparams['b_nl'],
tparams['bx_nl']]
rval, updates = theano.scan(_step_slice,
sequences=seqs,
outputs_info=[init_state,
tensor.alloc(0., n_samples,
cc_.shape[2]),
tensor.alloc(0., n_samples,
cc_.shape[0])],
non_sequences=[cc_,context_mask1]+shared_vars,
name='layers',
n_steps=nsteps_trg,
profile=profile,
strict=True)
proj_h = rval[0]
ctxs = rval[1]
# 3 feedforward neural networks,used to indicate the contribution of proj_h, emb, ctxs to cost
logit_lstm = tensor.dot(proj_h,tparams['ff_logit_lstm_W'])+tparams['ff_logit_lstm_b']
logit_prev = tensor.dot(emb,tparams['ff_logit_prev_W'])+tparams['ff_logit_prev_b']
logit_ctx = tensor.dot(ctxs,tparams['ff_logit_ctx_W'])+tparams['ff_logit_ctx_b']
logit = tensor.tanh(logit_lstm+logit_prev+logit_ctx)
# a feedforward neural network layer + softmax
logit = tensor.dot(logit,tparams['ff_logit_W'])+tparams['ff_logit_b']
logit_shp = logit.shape
probs = tensor.nnet.softmax(logit.reshape([logit_shp[0]*logit_shp[1],
logit_shp[2]]))
# cost
cost = -tensor.log(probs.flatten())
cost = cost.reshape([mask.shape[0], mask.shape[1]])
cost = (cost * mask).sum(0)
# make mean(), tensor->scalar
cost = cost.mean()
print('Computing gradient...', end=" ")
grads = tensor.grad(cost, wrt=itemlist(tparams),disconnected_inputs = 'ignore')
print('Done')
print(grads[1].eval())