Issue
I have moved my model and my inputs to the same device, but I still get the following runtime error:
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat1 in method wrapper_addmm)
Here is my code. First, my model implementation:
import torch
import torch.nn.functional as F

class Net(torch.nn.Module):
    def __init__(self, n_hiddens, n_feature=2, n_output=1):
        super().__init__()
        self.hiddens = []
        n_hidden_in = n_feature
        for n_hidden in n_hiddens:
            self.hiddens.append(torch.nn.Linear(n_hidden_in, n_hidden))  # hidden layer
            n_hidden_in = n_hidden
        self.predict = torch.nn.Linear(n_hidden, n_output)  # output layer

    def forward(self, x):
        for hidden in self.hiddens:
            x = F.relu(hidden(x))  # activation function for hidden layer
        x = self.predict(x)  # linear output
        return x
Then I define my dataloaders. Here, X and y are NumPy arrays:
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split

# Split training/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train_tensor = torch.from_numpy(X_train)
y_train_tensor = torch.from_numpy(y_train)
X_test_tensor = torch.from_numpy(X_test)
y_test_tensor = torch.from_numpy(y_test)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)  # create your dataset
train_dataloader = DataLoader(train_dataset, batch_size=1000)  # create your dataloader

test_dataset = TensorDataset(X_test_tensor, y_test_tensor)  # create your dataset
test_dataloader = DataLoader(test_dataset, batch_size=1000)  # create your dataloader
Here I train my model. The error occurs at the line outputs = regressor(inputs):
NUM_EPOCHS = 2000
BATCH_SIZE = 1000

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(f"Device used : {device}")

# 1 hidden layer
total_num_nodes = 256
regressor = Net(n_hiddens=[total_num_nodes]).to(device)
optimizer = torch.optim.SGD(regressor.parameters(), lr=0.2, momentum=0.1, nesterov=True)
loss_func = torch.nn.MSELoss()  # this is for regression mean squared loss

for epoch in range(NUM_EPOCHS):
    running_loss = 0.0
    for i, data in enumerate(train_dataloader, 0):
        inputs, values = data
        inputs = inputs.float().to(device)
        values = values.float().to(device)

        optimizer.zero_grad()  # clear gradients for next train

        print(f"Input device is : cuda:{inputs.get_device()}")
        print(f"Target value device is : cuda:{values.get_device()}")
        print(f"Is model on cuda ? : {next(regressor.parameters()).is_cuda}")

        outputs = regressor(inputs)  # <-- This is where I have the error
        loss = loss_func(outputs, values)
        loss.backward()  # backpropagation, compute gradients
        optimizer.step()  # apply gradients
Here is the output of my print statements:
Device used : cuda:0
Input device is : cuda:0
Target value device is : cuda:0
Is model on cuda ? : True
This should mean that my model and my tensors are all on the same device, so why do I still get this error?
The error log is:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-6-5234b830bebc> in <module>()
24 print(f"Target value device is : cuda:{values.get_device()}")
25 print(f"Is model on cuda ? : {next(regressor.parameters()).is_cuda}")
---> 26 outputs = regressor(inputs)
27 loss = loss_func(outputs, values)
28 loss.backward() # backpropagation, compute gradients
4 frames
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1100 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1101 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1102 return forward_call(*input, **kwargs)
1103 # Do not call functions when jit is used
1104 full_backward_hooks, non_full_backward_hooks = [], []
<ipython-input-4-56c54b30b771> in forward(self, x)
16 def forward(self, x):
17 for hidden in self.hiddens :
---> 18 x = F.relu(hidden(x)) # activation function for hidden layer
19 x = self.predict(x) # linear output
20 return x
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1100 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1101 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1102 return forward_call(*input, **kwargs)
1103 # Do not call functions when jit is used
1104 full_backward_hooks, non_full_backward_hooks = [], []
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/linear.py in forward(self, input)
101
102 def forward(self, input: Tensor) -> Tensor:
--> 103 return F.linear(input, self.weight, self.bias)
104
105 def extra_repr(self) -> str:
/usr/local/lib/python3.7/dist-packages/torch/nn/functional.py in linear(input, weight, bias)
1846 if has_torch_function_variadic(input, weight, bias):
1847 return handle_torch_function(linear, (input, weight, bias), input, weight, bias=bias)
-> 1848 return torch._C._nn.linear(input, weight, bias)
1849
1850
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat1 in method wrapper_addmm)
Thank you very much.
Solution
TL;DR: use nn.ModuleList instead of a plain Python list to store the hidden layers in Net.
All your hidden layers are stored in a plain Python list, self.hiddens, inside Net. When you move your model to the GPU with .to(device), PyTorch has no way to tell that the elements of this plain list should also be moved to the same device, because they are never registered as submodules.
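You can see this directly: with the Net and device from the question (assuming CUDA is available), the hidden layers do not appear in regressor.parameters() and stay on the CPU even after .to(device). A quick check:

regressor = Net(n_hiddens=[256]).to(device)

# Only predict's weight and bias are registered; the layers in the plain list are invisible to PyTorch.
print(len(list(regressor.parameters())))     # 2
print(regressor.predict.weight.device)       # cuda:0 -- moved by .to(device)
print(regressor.hiddens[0].weight.device)    # cpu    -- never moved, hence the device mismatch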
However, if you make self.hiddens an nn.ModuleList(), PyTorch knows to treat every element of this special list as an nn.Module and recursively moves them to the same device as Net.
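For reference, here is a sketch of the Net from the question with the plain list swapped for an nn.ModuleList; it is a one-line change and the rest of the training code can stay as it is:

import torch
import torch.nn.functional as F

class Net(torch.nn.Module):
    def __init__(self, n_hiddens, n_feature=2, n_output=1):
        super().__init__()
        self.hiddens = torch.nn.ModuleList()  # registered container instead of a plain list
        n_hidden_in = n_feature
        for n_hidden in n_hiddens:
            self.hiddens.append(torch.nn.Linear(n_hidden_in, n_hidden))  # hidden layer
            n_hidden_in = n_hidden
        self.predict = torch.nn.Linear(n_hidden, n_output)  # output layer

    def forward(self, x):
        for hidden in self.hiddens:
            x = F.relu(hidden(x))  # activation function for hidden layer
        x = self.predict(x)  # linear output
        return x

Now .to(device) also moves every hidden layer, so all tensors end up on cuda:0 and the forward pass no longer mixes devices.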
See these answers 1, 2, 3 for more details.
Answered By - Shai