1

I am working on a video captioning project, and for each video I have two feature arrays stored as NumPy files. One has shape (15, 2048) and was extracted by a 3D CNN; the other has shape (15, 1536) and was extracted by an Inception network (Inception-ResNet-V2, judging by the directory name in the code below). In both cases 15 is the number of selected frames.

The problem is that the system performs better when only one of the feature arrays is used; when the two (Inception and 3D CNN features) are combined, the BLEU score gets worse.

I used the following code to combine the NumPy feature files. The combined array has shape (15, 1024):

import torch
import os
import torch.nn as nn
import numpy as np

# Per-frame feature widths of the two input streams: 2048 for the 3D-CNN
# features, 1536 for the 2D (Inception) features.
feature3d_dim = 2048
feature2d_dim = 1536

# Output width of each projection head; the hidden layer is twice as wide.
hidden_dim = 512


def _make_projection(input_dim):
    """Build one projection head: input_dim -> 2 * hidden_dim -> hidden_dim.

    The layer stack is Linear, BatchNorm1d, in-place ReLU, Dropout(0.5),
    Linear. The returned module is in training mode (the nn.Module default).
    """
    wide_dim = 2 * hidden_dim
    layers = [
        nn.Linear(input_dim, wide_dim),
        nn.BatchNorm1d(wide_dim),
        nn.ReLU(inplace=True),
        nn.Dropout(p=0.5),
        nn.Linear(wide_dim, hidden_dim),
    ]
    return nn.Sequential(*layers)


# Built in the same order as before (3D first, then 2D) so the random
# initialisation of the weights is drawn in the same sequence.
feature3d_proj = _make_projection(feature3d_dim)
feature2d_proj = _make_projection(feature2d_dim)



# Offline feature extraction: project the 3D and 2D per-frame features of
# every video to hidden_dim each, concatenate them, and save the result.
#
# The heads must be in eval mode here. In training mode (the nn.Module
# default) Dropout(0.5) randomly zeroes half of the hidden activations of
# every saved feature, and BatchNorm1d normalises with the statistics of the
# 15 frames of the current video instead of its running statistics.
#
# NOTE(review): nothing in this script trains these heads or loads weights
# into them, so even in eval mode they are randomly initialised projections.
# Presumably they should be trained jointly with the captioning model, or a
# checkpoint should be loaded before this point — confirm.
feature3d_proj.eval()
feature2d_proj.eval()

# np.save does not create missing directories.
os.makedirs('feat_15_1024_2d3d', exist_ok=True)

video_list = os.listdir('feat_15_c3d')

# No gradients are needed for extraction; this avoids building autograd graphs.
with torch.no_grad():
    for video in video_list:

        path_3d = os.path.join('feat_15_c3d', video)
        path_2d = os.path.join('feat_15_incepresnetV2', video)

        f_3d = np.load(path_3d)  # ==> (15, 2048)
        # .float() matches the float32 Linear weights even if the file
        # was stored as float64.
        f_3d = torch.from_numpy(f_3d).float()
        proj_3d = feature3d_proj(f_3d)  # ==> (15, 512) if hidden_dim = 512

        f_2d = np.load(path_2d)  # ==> (15, 1536)
        f_2d = torch.from_numpy(f_2d).float()
        proj_2d = feature2d_proj(f_2d)  # ==> (15, 512)

        # Concatenate along the feature axis, keeping the frame axis intact.
        combine_vectors = torch.cat([proj_3d, proj_2d], dim=-1)  # ==> (15, 1024)
        combine_vectors = combine_vectors.cpu().numpy()  # convert to numpy

        outfile = os.path.join('feat_15_1024_2d3d', video)

        np.save(outfile, combine_vectors)
A_B_Y
  • 332
  • 2
  • 8

0 Answers0