deep learning - Image Captioning using TensorFlow not giving good results -

July 15, 2011

I am trying to build an image captioning model on the MS COCO dataset as part of CS231n Assignment 3. Using NumPy code, I was able to achieve moderate results, but when I tried to implement the whole code in TensorFlow, the captions generated on the validation set are random. More precisely, I was able to achieve a BLEU score of 0.26 on the validation set using the NumPy code, while the TensorFlow code is not able to go beyond 0.19. I wanted to know if I have made a mistake in building the model. Please help; I have spent 3 days on this without any luck.

def build_model(
    hidden_size=hidden_dims,
    num_steps=16,
    image_size=image_dim,
    vocab_size=vocab_size,
    embedding_size=word2vec_dim,
    lr=lr,
    batch_size=batch_size,
):
    """Build the TensorFlow 1.x graph for an LSTM image-captioning model.

    Image features are linearly projected to ``hidden_size`` and used as the
    default initial hidden state of a single LSTM layer.  The LSTM consumes
    embedded caption tokens and its outputs are projected to vocabulary
    logits, which are trained with a masked sparse softmax cross-entropy.

    The defaults are read from module-level names when the function is
    defined.  The function also relies on ``tf``, ``reset_graph`` and
    ``null`` being available in the enclosing module.

    Args:
        hidden_size: Number of units in the LSTM cell.
        num_steps: Maximum number of words in a caption.
        image_size: Dimensionality of the input image feature vectors.
        vocab_size: Number of distinct tokens in the vocabulary.
        embedding_size: Dimensionality of the word embeddings.
        lr: Initial learning rate for the exponential decay schedule.
        batch_size: Fixed batch size used for the image features and for the
            initial LSTM state placeholders.

    Returns:
        A dict of graph handles with the keys ``image_features``,
        ``captions_in``, ``captions_out``, ``c0``, ``h0``, ``final_state``,
        ``total_loss``, ``train_step``, ``preds``, ``cross_entropy``,
        ``global_step`` and ``saver``.
    """
    reset_graph()

    image_features = tf.placeholder(
        tf.float32, [batch_size, image_size], name='image_features')
    captions_in = tf.placeholder(
        tf.int32, [None, num_steps], name='captions_in')
    captions_out = tf.placeholder(
        tf.int32, [None, num_steps], name='captions_out')

    # Initial LSTM cell state: zeros unless a value is fed explicitly.
    c0 = tf.placeholder_with_default(
        tf.zeros([batch_size, hidden_size]),
        [batch_size, hidden_size],
        name=None)

    with tf.variable_scope('image_embeddings'):
        w_proj = tf.get_variable(
            'w_proj', [image_size, hidden_size],
            initializer=tf.random_normal_initializer(stddev=0.01))
        b_proj = tf.get_variable('b_proj', [hidden_size])

    image_embeddings = tf.matmul(image_features, w_proj) + b_proj

    # Initial LSTM hidden state: the projected image features unless a value
    # is fed explicitly.
    h0 = tf.placeholder_with_default(
        image_embeddings, shape=[batch_size, hidden_size], name=None)

    with tf.variable_scope('embeddings'):
        embeddings = tf.get_variable(
            'embeddings', [vocab_size, embedding_size],
            initializer=tf.random_normal_initializer(stddev=0.01))
    rnn_inputs = tf.nn.embedding_lookup(embeddings, captions_in)

    cell = tf.contrib.rnn.LSTMCell(hidden_size, state_is_tuple=True)
    init_state = tf.nn.rnn_cell.LSTMStateTuple(c0, h0)
    rnn_outputs, final_state = tf.nn.dynamic_rnn(
        cell, rnn_inputs, initial_state=init_state)

    with tf.variable_scope('softmax'):
        w = tf.get_variable(
            'w', [hidden_size, vocab_size],
            initializer=tf.random_normal_initializer(stddev=0.01))
        b = tf.get_variable('b', [vocab_size])

    # Flatten the time dimension so one matmul scores every step, then restore
    # it before adding the bias (broadcast over the last axis).
    rnn_outputs = tf.reshape(rnn_outputs, [-1, hidden_size])
    logits = tf.reshape(
        tf.matmul(rnn_outputs, w), [-1, num_steps, vocab_size]) + b
    logits_reshaped = tf.reshape(logits, [-1, vocab_size])

    captions_out_reshaped = tf.reshape(captions_out, [-1])
    # `null` is defined outside this function; presumably the vocabulary index
    # of the padding token -- TODO confirm against the caller.
    mask = tf.not_equal(captions_out_reshaped, null)
    predictions = tf.nn.softmax(logits_reshaped)

    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits_reshaped, labels=captions_out_reshaped)
    # NOTE(review): masked positions contribute zero, but the mean is still
    # taken over every position, masked ones included.  The loss therefore
    # scales with the amount of padding in the batch; confirm this matches
    # the reference implementation being compared against.
    total_loss = tf.reduce_mean(
        tf.where(mask, cross_entropy, tf.zeros_like(cross_entropy)))

    global_step = tf.Variable(0, trainable=False)
    learning_rate = tf.train.exponential_decay(
        lr, global_step, 200, 0.995, staircase=True)
    # Passing global_step makes the optimizer increment it on every update.
    train_step = tf.train.AdamOptimizer(learning_rate).minimize(
        total_loss, global_step)

    return dict(
        image_features=image_features,
        captions_in=captions_in,
        captions_out=captions_out,
        c0=c0,
        h0=h0,
        final_state=final_state,
        total_loss=total_loss,
        train_step=train_step,
        preds=predictions,
        cross_entropy=cross_entropy,
        global_step=global_step,
        saver=tf.train.Saver())

Search This Blog

Force Net

deep learning - Image Captioning using TensorFlow not giving good results -

Comments

Post a Comment

Popular posts from this blog

ubuntu - PHP script to find files of certain extensions in a directory, returns populated array when run in browser, but empty array when run from terminal -

php - How can i create a user dashboard -

javascript - How to detect toggling of the fullscreen-toolbar in jQuery Mobile? -