Good afternoon!
I’m building a multiple-input model with 2 types of inputs:
Images (torch.Size([1, 3, 224, 224]))
and landmark features (torch.Size([1, 96])).
Here’s the model itself:
class MixedNetwork(nn.Module):
    """Two-branch model: a ResNet-50 image branch plus an MLP landmark branch.

    The two branch outputs are flattened, concatenated along dim 1 and passed
    through a small fully connected head that ends in a sigmoid.

    NOTE: this is the version from the question. The first layer of the head
    is declared with 1000 input features while the concatenated vector is
    wider than that, which is what triggers the shape error quoted in the
    post (1x3048 vs. 1000x512).
    """

    def __init__(self):
        super().__init__()

        # Image branch: every child module of ResNet-50 except the last one
        # (its final fully connected layer).
        backbone_layers = list(models.resnet50().children())
        self.image_features = nn.Sequential(*backbone_layers[:-1])

        # Landmark branch: 96 -> 192 -> 1000, bias-free, with ReLU + dropout
        # after each linear layer.
        landmark_layers = [
            nn.Linear(in_features=96, out_features=192, bias=False),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.25),
            nn.Linear(in_features=192, out_features=1000, bias=False),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.25),
        ]
        self.landmark_features = nn.Sequential(*landmark_layers)

        # Head applied to the concatenated features: 1000 -> 512 -> 32 -> 1.
        head_layers = [
            nn.Linear(1000, 512),
            nn.ReLU(),
            nn.Linear(512, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
        ]
        self.combined_features = nn.Sequential(*head_layers)

    def forward(self, image, landmarks):
        """Run both branches, concatenate their outputs and apply the head.

        Args:
            image: image batch fed to the ResNet-50 branch.
            landmarks: landmark feature batch fed to the MLP branch.

        Returns:
            Sigmoid of the head output.
        """
        image_out = self.image_features(image)
        print(image_out.shape)  # debug: shape of the image branch output
        landmark_out = self.landmark_features(landmarks)

        # Collapse everything after the batch dimension before concatenating.
        flat_image = image_out.view(image_out.size(0), -1)
        flat_landmarks = landmark_out.view(landmark_out.size(0), -1)
        fused = torch.cat((flat_image, flat_landmarks), dim=1)

        return F.sigmoid(self.combined_features(fused))
I’m getting confused when it comes to defining input-output features for Linear layers and combined layers. The last FC layer of resnet50 is Linear(in_features=2048, out_features=1000). Does it mean that the last output of the self.landmark_features layers also has to be 1000, and that the first linear layer of self.combined_features should also take 1000 input features?
Is it correct to assume that if the landmark input size is [1, 96] then the in_features for the first layer of self.landmark_features has to be 96?
With the current dimensions I’m getting the error message: RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x3048 and 1000x512) (why 3048 and not 2048?)
CodePudding user response:
This code turns the image tensor of shape (1, 3, 224, 224) into a feature tensor of shape (1, 2048):
# Image branch: ResNet-50 with its last child (the final FC layer) removed.
a = self.image_features(image)
# Keep the batch dimension and flatten the rest, giving (1, 2048) here.
a = a.view(a.size(0), -1)
And this code turns the landmark features of shape (1, 96) into a tensor of shape (1, 1000):
# Landmark branch: the MLP maps the 96 landmark features to 1000 features.
b = self.landmark_features(landmarks)
# Keep the batch dimension and flatten the rest, giving (1, 1000) here.
# The result is stored back in `b` so that `a` still holds the image features.
b = b.view(b.size(0), -1)
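torch.cat then concatenates the a and b vectors along dim=1, so the x passed to self.combined_features has shape (1, 2048 + 1000) = (1, 3048). That is why the error reports 3048 rather than 2048, and why the first Linear layer of self.combined_features must accept 3048 input features.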
So, the code should be changed to the following:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.models as models
import torch.nn.functional as F
class MixedNetwork(nn.Module):
    """Two-branch model fusing ResNet-50 image features with landmark features.

    The image branch is ResNet-50 without its final fully connected layer; the
    landmark branch is a small bias-free MLP. Both outputs are flattened,
    concatenated along dim 1 and fed to a fully connected head whose single
    output is squashed with a sigmoid.

    Args:
        landmark_in_features: number of features per landmark sample. Must
            match the second dimension of the ``landmarks`` tensor passed to
            ``forward`` (96 by default).
        landmark_out_features: width of the landmark branch output
            (1000 by default).
    """

    def __init__(self, landmark_in_features=96, landmark_out_features=1000):
        super().__init__()

        backbone = models.resnet50()
        # The removed fc layer's in_features equals the width of the pooled
        # feature vector the remaining layers produce (2048 for ResNet-50).
        image_out_features = backbone.fc.in_features
        # Drop the last child (the fc classifier) and keep everything else.
        self.image_features = nn.Sequential(*list(backbone.children())[:-1])

        self.landmark_features = nn.Sequential(
            nn.Linear(in_features=landmark_in_features, out_features=192, bias=False),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.25),
            nn.Linear(in_features=192, out_features=landmark_out_features, bias=False),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.25),
        )

        # The head consumes the concatenation of both branches, so its input
        # width is the sum of the two branch widths (2048 + 1000 = 3048 with
        # the defaults) instead of a hard-coded number.
        combined_in_features = image_out_features + landmark_out_features
        self.combined_features = nn.Sequential(
            nn.Linear(combined_in_features, 512),
            nn.ReLU(),
            nn.Linear(512, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
        )

    def forward(self, image, landmarks):
        """Compute the fused prediction for a batch.

        Args:
            image: image batch for the ResNet-50 branch, e.g. (N, 3, 224, 224).
            landmarks: landmark batch of shape (N, landmark_in_features).

        Returns:
            Tensor of shape (N, 1) with values in (0, 1).
        """
        a = self.image_features(image)
        b = self.landmark_features(landmarks)
        # Flatten everything after the batch dimension; torch.flatten also
        # handles non-contiguous inputs, unlike view.
        a = torch.flatten(a, start_dim=1)
        b = torch.flatten(b, start_dim=1)
        x = torch.cat((a, b), dim=1)
        x = self.combined_features(x)
        return torch.sigmoid(x)
# Smoke test: push one random sample through the model to check the shapes.
model = MixedNetwork()
# Inference mode: disables Dropout (and uses BatchNorm running statistics in
# the ResNet backbone), so the result depends only on the input and weights.
model.eval()
batch_size = 1
# random input
image = torch.randn(batch_size, 3, 224, 224)
land = torch.randn(batch_size, 96)
# No gradients are needed for a shape check, so skip building the autograd graph.
with torch.no_grad():
    output = model(image, land)
print(output)