cap_net_w_pretrained_vgg.py
import torch
import torch.nn as nn
from torch.autograd import Variable
import word_embedding
import pdb
import numpy as np
#load vgg16 with pretrained weights
import torchvision.models as models
our_vgg = models.vgg16(pretrained=True)
#cuda variables
use_gpu = torch.cuda.is_available()
# Input dimensions of VGG16 input image
VGG_IMG_DIM = 224
# Recurrent hidden size must match the last hidden layer of VGG16
RNN_HIDDEN_SIZE = 4096
# Dimension of word embeddings
WORDVEC_SIZE = 300
# Assume a limited vocabulary of this many words
VOCABULARY_SIZE = 6000
# Replace the stock VGG16 classifier so that its output can be fed directly
# into the RNN as the initial hidden state; the final layer must therefore
# produce RNN_HIDDEN_SIZE features.
our_vgg.classifier = nn.Sequential(
    nn.Linear(512 * 7 * 7, 4096),
    nn.ReLU(True),
    nn.Dropout(),
    nn.Linear(4096, RNN_HIDDEN_SIZE),
    nn.ReLU(True),
    nn.Dropout(),
    nn.Linear(RNN_HIDDEN_SIZE, RNN_HIDDEN_SIZE),
)
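# Illustrative sanity-check sketch (not part of the original file): the
# modified classifier should map a single 224x224 RGB image to a
# (1, RNN_HIDDEN_SIZE) feature vector, which is exactly what the RNNCell
# below expects as its hidden state. The random input is only a placeholder;
# uncomment to run the check.
# _dummy_img = Variable(torch.randn(1, 3, VGG_IMG_DIM, VGG_IMG_DIM))
# assert our_vgg(_dummy_img).size() == (1, RNN_HIDDEN_SIZE)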
class CaptionNet(nn.Module):
    def __init__(self, pretrained_model):
        super(CaptionNet, self).__init__()
        # Pretrained VGG network; its classifier output seeds the RNN hidden state
        self.pretrained = pretrained_model
        # Recurrent layer
        self.rnn_cell = nn.RNNCell(
            input_size=WORDVEC_SIZE,
            hidden_size=RNN_HIDDEN_SIZE,
            nonlinearity='relu',
        )
        # Linear layer to convert the hidden state into scores over the vocabulary
        self.hidden_to_vocab = nn.Linear(RNN_HIDDEN_SIZE, VOCABULARY_SIZE)
    def forward(self, img):
        """Forward pass through the network.

        Input: image tensor of shape (1, 3, VGG_IMG_DIM, VGG_IMG_DIM)
        Output: list of generated words
        """
        # CNN features serve as the initial RNN hidden state: (1, RNN_HIDDEN_SIZE)
        hidden = self.pretrained(img)
        # First input is a zero vector with a batch dimension of 1
        next_input = Variable(torch.zeros(1, WORDVEC_SIZE))
        if use_gpu:
            next_input = next_input.cuda()
        # For now, just generate 10 words (should actually generate until an end token)
        words = []
        for _ in range(10):
            hidden = self.rnn_cell(next_input, hidden)
            # Unnormalized scores over the vocabulary: (1, VOCABULARY_SIZE)
            word_class = self.hidden_to_vocab(hidden)
            _, word_ix = torch.max(word_class, 1)
            word_ix = int(word_ix)
            cur_word = word_embedding.get_word_from_index(word_ix)
            words.append(cur_word)
            # Feed the embedding of the predicted word back in as the next input
            # (assumes get_word_embedding returns a 1-D tensor of length WORDVEC_SIZE)
            next_input = Variable(word_embedding.get_word_embedding(cur_word)).view(1, -1)
            if use_gpu:
                next_input = next_input.cuda()
        return words
    def forward_perplexity(self, img, words):
        """Given an image and a ground-truth caption, compute the
        negative log-likelihood perplexity of the caption under the model."""
        # TODO: not yet implemented
        return 0
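

# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original model: one possible way to
# score a ground-truth caption with teacher forcing, so that exp() of the
# returned mean NLL gives a per-word perplexity. It assumes `word_embedding`
# also exposes a `get_index_from_word` lookup (a hypothetical helper, the
# inverse of the `get_word_from_index` used above).
def caption_nll_sketch(net, img, caption_words):
    """Average negative log-likelihood of `caption_words` given `img`."""
    hidden = net.pretrained(img)
    next_input = Variable(torch.zeros(1, WORDVEC_SIZE))
    if use_gpu:
        next_input = next_input.cuda()
    log_softmax = nn.LogSoftmax(dim=1)
    total_nll = 0.0
    for word in caption_words:
        hidden = net.rnn_cell(next_input, hidden)
        log_probs = log_softmax(net.hidden_to_vocab(hidden))
        word_ix = word_embedding.get_index_from_word(word)  # hypothetical helper
        total_nll = total_nll - log_probs[0, word_ix]
        # Teacher forcing: feed the ground-truth word back in as the next input
        next_input = Variable(word_embedding.get_word_embedding(word)).view(1, -1)
        if use_gpu:
            next_input = next_input.cuda()
    return total_nll / len(caption_words)


# Minimal usage example with a random placeholder image; a real pipeline would
# load, resize, and normalize an image to VGG_IMG_DIM x VGG_IMG_DIM first.
if __name__ == '__main__':
    net = CaptionNet(our_vgg)
    img = Variable(torch.randn(1, 3, VGG_IMG_DIM, VGG_IMG_DIM))
    if use_gpu:
        net = net.cuda()
        img = img.cuda()
    print(net(img))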