deep learning - Image Captioning using TensorFlow not giving good results -
I am trying to build an image captioning model on the MS COCO dataset as part of CS231n assignment 3. Using the NumPy code, I was able to achieve moderate results, but when I tried to implement the whole code in TensorFlow, the captions generated on the validation set were random. More precisely, I am able to achieve a BLEU score of 0.26 on the validation set using the NumPy code, while the TensorFlow code is not able to go beyond 0.19. I wanted to know if I have made a mistake in building the model. Please help; I have spent 3 days on this without luck.
def build_model(
        hidden_size=hidden_dims,      # dimensions of lstm gates
        num_steps=16,                 # maximum number of words in caption
        image_size=image_dim,         # dimensions of image features
        vocab_size=vocab_size,
        embedding_size=word2vec_dim,
        lr=lr,
        batch_size=batch_size):
    """Build the TensorFlow 1.x graph for an LSTM image-captioning model.

    The image feature vector is linearly projected to ``hidden_size`` and used
    as the initial hidden state of an LSTM that reads embedded caption tokens;
    a softmax layer over the LSTM outputs predicts the next token at each step.

    Args:
        hidden_size: Size of the LSTM state and of the projected image vector.
        num_steps: Number of time steps in ``captions_in`` / ``captions_out``.
        image_size: Length of each image feature vector.
        vocab_size: Number of rows in the embedding table and softmax classes.
        embedding_size: Width of each word embedding.
        lr: Initial learning rate fed to the exponential decay schedule.
        batch_size: Fixed batch dimension of ``image_features``, ``c0``, ``h0``.

    Returns:
        dict with the graph handles: ``image_features``, ``captions_in``,
        ``captions_out``, ``c0``, ``h0``, ``final_state``, ``total_loss``,
        ``train_step``, ``preds``, ``cross_entropy``, ``global_step``, ``saver``.
    """
    reset_graph()

    image_features = tf.placeholder(
        tf.float32, [batch_size, image_size], name='image_features')
    captions_in = tf.placeholder(
        tf.int32, [None, num_steps], name='captions_in')
    captions_out = tf.placeholder(
        tf.int32, [None, num_steps], name='captions_out')

    # initial value of c gate; defaults to zeros but can be fed explicitly
    c0 = tf.placeholder_with_default(
        tf.zeros([batch_size, hidden_size]),
        [batch_size, hidden_size],
        name=None)

    # NOTE(review): the extent of each variable_scope below is reconstructed;
    # the pasted source had lost its indentation. Only the get_variable calls
    # are known to belong to the scopes.
    with tf.variable_scope('image_embeddings'):
        w_proj = tf.get_variable(
            'w_proj', [image_size, hidden_size],
            initializer=tf.random_normal_initializer(stddev=0.01))
        # Biases start at zero instead of the default Glorot-uniform draw.
        b_proj = tf.get_variable(
            'b_proj', [hidden_size], initializer=tf.zeros_initializer())
        image_embeddings = tf.matmul(image_features, w_proj) + b_proj

    # initial value of hidden state; defaults to the projected image features
    h0 = tf.placeholder_with_default(
        image_embeddings, shape=[batch_size, hidden_size], name=None)

    with tf.variable_scope('embeddings'):
        embeddings = tf.get_variable(
            'embeddings', [vocab_size, embedding_size],
            initializer=tf.random_normal_initializer(stddev=0.01))
        rnn_inputs = tf.nn.embedding_lookup(embeddings, captions_in)

    cell = tf.contrib.rnn.LSTMCell(hidden_size, state_is_tuple=True)
    # LSTMStateTuple takes (c, h) in that order.
    init_state = tf.nn.rnn_cell.LSTMStateTuple(c0, h0)
    rnn_outputs, final_state = tf.nn.dynamic_rnn(
        cell, rnn_inputs, initial_state=init_state)

    with tf.variable_scope('softmax'):
        w = tf.get_variable(
            'w', [hidden_size, vocab_size],
            initializer=tf.random_normal_initializer(stddev=0.01))
        b = tf.get_variable(
            'b', [vocab_size], initializer=tf.zeros_initializer())

    # Flatten (batch, time) into one axis so a single matmul scores every step.
    rnn_outputs = tf.reshape(rnn_outputs, [-1, hidden_size])
    logits_reshaped = tf.matmul(rnn_outputs, w) + b
    captions_out_reshaped = tf.reshape(captions_out, [-1])

    # NOTE(review): `null` is kept exactly as written in the source; it is
    # presumably the module-level id of the padding token -- confirm its name.
    mask = tf.not_equal(captions_out_reshaped, null)
    predictions = tf.nn.softmax(logits_reshaped)
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits_reshaped, labels=captions_out_reshaped)

    # Average over non-padding tokens only. A plain reduce_mean would divide by
    # every position, so the loss would shrink with the amount of padding in
    # the batch instead of reflecting per-token error.
    masked_cross_entropy = tf.where(
        mask, cross_entropy, tf.zeros_like(cross_entropy))
    num_valid_tokens = tf.reduce_sum(tf.cast(mask, tf.float32))
    # max(..., 1) keeps an all-padding batch from dividing by zero.
    total_loss = (tf.reduce_sum(masked_cross_entropy)
                  / tf.maximum(num_valid_tokens, 1.0))

    global_step = tf.Variable(0, trainable=False)
    learning_rate = tf.train.exponential_decay(
        lr, global_step, 200, 0.995, staircase=True)
    # Passing global_step makes the optimizer increment it on every update,
    # which is what drives the decay schedule above.
    train_step = tf.train.AdamOptimizer(learning_rate).minimize(
        total_loss, global_step=global_step)

    return dict(
        image_features=image_features,
        captions_in=captions_in,
        captions_out=captions_out,
        c0=c0,
        h0=h0,
        final_state=final_state,
        total_loss=total_loss,
        train_step=train_step,
        preds=predictions,
        cross_entropy=cross_entropy,
        global_step=global_step,
        saver=tf.train.Saver())
Comments
Post a Comment