Issue
I am coding a DQN from scratch and have therefore written my own loss function. When I call backward() on the loss, I get the following error - RuntimeError: grad can be implicitly created only for scalar outputs
Here's my code -
import numpy as np
import gym
import matplotlib.pyplot as plt
import os
import torch
import random
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from collections import deque
import sys
env = gym.make("CliffWalking-v0")
# In[103]:
#Hyperparameters
episodes = 5000
eps = 1.0
learning_rate = 0.1
discount_factor = 0.99
tot_rewards = []
decay_val = 0.001
mem_size = 50000
batch_size = 2
gamma = 0.99
# In[104]:
class NeuralNetwork(nn.Module):
    def __init__(self, state_size, action_size):
        super(NeuralNetwork, self).__init__()
        self.state_size = state_size
        self.action_size = action_size
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(1, 30),
            nn.ReLU(),
            nn.Linear(30, 30),
            nn.ReLU(),
            nn.Linear(30, action_size)
        )

    def forward(self, x):
        x = self.linear_relu_stack(x)
        return x
# In[105]:
model = NeuralNetwork(env.observation_space.n, env.action_space.n)
opt = torch.optim.Adam(params=model.parameters(), lr=learning_rate)
loss = nn.MSELoss()
replay_buffer = deque(maxlen=mem_size)
# In[106]:
state = torch.tensor(env.reset(), dtype=torch.float32)
state = state.unsqueeze(dim=0)
print(state.shape)
out = model(state)
# In[111]:
def compute_td_loss(batch_size):
    state, next_state, reward, done, action = zip(*random.sample(replay_buffer, batch_size))
    state = torch.from_numpy(np.array(state).reshape(-1, 1)).unsqueeze(dim=0).type(torch.float32)
    next_state = torch.from_numpy(np.array(next_state).reshape(-1, 1)).unsqueeze(dim=0).type(torch.float32)
    reward = torch.from_numpy(np.array(reward))
    done = torch.from_numpy(np.array(done))
    action = torch.from_numpy(np.array(action)).type(torch.int64)
    q_values = model(state)
    next_q_values = model(next_state)
    q_vals = q_values.squeeze().gather(dim=-1, index=action.reshape(-1, 1)).reshape(1, -1)
    max_next_q_values = torch.max(next_q_values, 2)[0].detach()
    print("q_vals = ", q_vals)
    print("max_next_q_values = ", max_next_q_values)
    loss = 0.5*(reward + gamma*max_next_q_values - q_vals)**2
    print("reward = ", reward)
    print("loss = ", loss)
    opt.zero_grad()
    loss.backward()
    opt.step()
    return loss
# In[112]:
for i in range(episodes):
    state = env.reset()
    done = False
    steps = 0
    eps_rew = 0
    while not done and steps < 50:
        if np.random.uniform(0, 1) < eps:
            action = env.action_space.sample()
        else:
            state = torch.tensor(state, dtype=torch.float32)
            state = state.unsqueeze(dim=0)
            action = np.argmax(model(state).detach().numpy())
        next_state, reward, done, info = env.step(action)
        replay_buffer.append((state, next_state, reward, done, action))
        if len(replay_buffer) > batch_size:
            loss = compute_td_loss(batch_size)
            sys.exit()
        eps = eps/(1 + 0.001)
        eps_rew += reward
        if done:
            break
        state = next_state
    tot_rewards.append(eps_rew)
Here's the error that I get -
RuntimeError Traceback (most recent call last)
<ipython-input-112-015fd74c95d9> in <module>
14 replay_buffer.append((state, next_state, reward, done, action))
15 if len(replay_buffer)>batch_size:
---> 16 loss = compute_td_loss(batch_size)
17 sys.exit()
18 eps = eps/(1 + 0.001)
<ipython-input-111-3e1e02c32b4f> in compute_td_loss(batch_size)
16 print("loss = ", loss)
17 opt.zero_grad()
---> 18 loss.backward()
19 opt.step()
20 return loss
c:\users\thoma\anaconda3\envs\custom_atari_env\lib\site-packages\torch\_tensor.py in backward(self, gradient, retain_graph, create_graph, inputs)
253 create_graph=create_graph,
254 inputs=inputs)
--> 255 torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
256
257 def register_hook(self, hook):
c:\users\thoma\anaconda3\envs\custom_atari_env\lib\site-packages\torch\autograd\__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)
141
142 grad_tensors_ = _tensor_or_tensors_to_tuple(grad_tensors, len(tensors))
--> 143 grad_tensors_ = _make_grads(tensors, grad_tensors_)
144 if retain_graph is None:
145 retain_graph = create_graph
c:\users\thoma\anaconda3\envs\custom_atari_env\lib\site-packages\torch\autograd\__init__.py in _make_grads(outputs, grads)
48 if out.requires_grad:
49 if out.numel() != 1:
---> 50 raise RuntimeError("grad can be implicitly created only for scalar outputs")
51 new_grads.append(torch.ones_like(out, memory_format=torch.preserve_format))
52 else:
RuntimeError: grad can be implicitly created only for scalar outputs
Solution
Given that your batch_size = 2, your loss is a tensor with batch_size elements (shape 1 x batch_size) rather than a scalar. Since what you are likely trying to do is compute the gradient of the expected Q loss, you can use a Monte Carlo estimator: instead of computing an expectation, take the mean over a finite sample (here, your batch). Consequently, what you are missing is taking the mean of your loss before calling backward().
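What that change could look like inside compute_td_loss, as a minimal sketch reusing the tensors already computed there (reward, gamma, max_next_q_values, q_vals, opt); the td_errors name is just an illustrative intermediate, not something from your code:

# reduce the per-sample TD errors to a single scalar before backpropagating
td_errors = reward + gamma * max_next_q_values - q_vals   # shape (1, batch_size)
loss = 0.5 * (td_errors ** 2).mean()                      # mean over the batch -> 0-dim tensor
opt.zero_grad()
loss.backward()   # works now: backward() on a scalar needs no explicit gradient argument
opt.step()

Equivalently, the nn.MSELoss() instance you create (but never use) averages over all elements by default (reduction='mean'), so it would also give you a scalar loss.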
Answered By - lejlot