I want to compare the losses of a NumPy implementation and a PyTorch implementation of the same network: two fully connected layers with a ReLU hidden activation, a softmax output layer, and a mean squared error loss.
Below is my code for NumPy and PyTorch:
import numpy as np
from copy import deepcopy
np.random.seed(99)
# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 16, 100, 10, 2
# Create random input and output data
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)
xx = np.random.randn(D_in, H)
yy = np.random.randn(H, D_out)
w1 = deepcopy(xx)
w2 = deepcopy(yy)
def softmax(x):
    # subtract the max for numerical stability; the softmax is normalized along axis=0
    func = np.exp(x - np.max(x))
    return func / func.sum(axis=0)
learning_rate = 1e-4
for t in range(500):
    # Forward pass: compute predicted y
    h = np.dot(x, w1)
    u = np.maximum(h, 0)
    z = np.dot(u, w2)
    y_pred = softmax(z)
    # Compute and print loss
    loss = np.square(y_pred - y).sum()
    if t % 50 == 0:
        print("Epoch [{:3d}/{:3d}] Loss: {:.10f}".format(t, 500, loss))
    # Backprop: compute gradients of the loss with respect to w1 and w2
    grad_y_pred = 2.0 * (y_pred - y)
    grad_z = y_pred * (1 - y_pred) * grad_y_pred
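    # NOTE: the line above only uses the element-wise term y_pred * (1 - y_pred),
    # i.e. the diagonal of the softmax Jacobian; I'm not sure whether ignoring
    # the off-diagonal terms is what makes the two runs diverge.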
    grad_w2 = np.dot(u.T, grad_z)
    grad_u = grad_z.dot(w2.T)
    grad_h = grad_u.copy()
    grad_h[h < 0] = 0
    grad_w1 = x.T.dot(grad_h)
    # Update weights
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2
print()
# ~~~~~~~~~~~~~ PyTorch implementation ~~~~~~~~~~~~~~
import torch
import torch.nn as nn
m = nn.Softmax(dim=0)
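# dim=0 normalizes down each column (across the batch dimension),
# matching the axis=0 normalization in the NumPy softmax above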
x_ = torch.from_numpy(x).type(torch.float64)
y_ = torch.from_numpy(y).type(torch.float64)
w1_ = torch.from_numpy(deepcopy(xx)).type(torch.float64).requires_grad_(True)
w2_ = torch.from_numpy(deepcopy(yy)).type(torch.float64).requires_grad_(True)
for t in range(500):
    h_ = x_.mm(w1_)
    u_ = h_.clamp(min=0)
    z = u_.mm(w2_)
    y_pred = m(z)
    loss_ = (y_pred - y_).pow(2).sum()
    if t % 50 == 0:
        print("Epoch [{:3d}/{:3d}] Loss: {:.10f}".format(t, 500, loss_.item()))
    loss_.backward()
    with torch.no_grad():
        w1_ -= learning_rate * w1_.grad
        w2_ -= learning_rate * w2_.grad
        w1_.grad.zero_()
        w2_.grad.zero_()
Here is the output:
Output for NumPy implementation:
Epoch [ 0/500] Loss: 29.7599750339
Epoch [ 50/500] Loss: 29.7568945381
Epoch [100/500] Loss: 29.7530540477
Epoch [150/500] Loss: 29.7481408013
Epoch [200/500] Loss: 29.7416476795
Epoch [250/500] Loss: 29.7326987989
Epoch [300/500] Loss: 29.7196529397
Epoch [350/500] Loss: 29.6990826814
Epoch [400/500] Loss: 29.6626811106
Epoch [450/500] Loss: 29.5857445210
Output for PyTorch implementation:
Epoch [ 0/500] Loss: 29.7599750339
Epoch [ 50/500] Loss: 29.7555138011
Epoch [100/500] Loss: 29.7493562321
Epoch [150/500] Loss: 29.7403440862
Epoch [200/500] Loss: 29.7260022571
Epoch [250/500] Loss: 29.7000586454
Epoch [300/500] Loss: 29.6418893830
Epoch [350/500] Loss: 29.4537482662
Epoch [400/500] Loss: 28.9717000115
Epoch [450/500] Loss: 28.8537886818
But I am not getting similar output: both losses start at the same value and then diverge more and more as training goes on. Am I doing something wrong in the backpropagation of my NumPy implementation? Please help. Thanks in advance.
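For reference, my grad_z line only uses the element-wise term y_pred * (1 - y_pred), i.e. the diagonal of the softmax Jacobian. If the full Jacobian-vector product is needed, I think the backward step would look roughly like the sketch below (softmax_backward is just a name I made up, and I have not verified that this is what causes the difference):

def softmax_backward(y_pred, grad_y_pred):
    # full softmax Jacobian-vector product: s * (g - sum(s * g)),
    # summed over the axis the softmax was normalized over (axis=0 here)
    dot = (y_pred * grad_y_pred).sum(axis=0, keepdims=True)
    return y_pred * (grad_y_pred - dot)

# grad_z = softmax_backward(y_pred, grad_y_pred)  # instead of the element-wise line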