In [None]:
#hide
from fastai.gen_doc.nbdoc import *

# A neural net from the foundations

## A neural net layer from scratch

### Modeling a neuron

### Matrix multiplication from scratch

In [None]:
import torch
from torch import tensor

In [None]:
def matmul(a,b):
    """Multiply two 2-D tensors using three explicit Python loops.

    `a` is (rows_a, inner) and `b` is (inner, cols_b). Returns a new
    (rows_a, cols_b) tensor created with `torch.zeros`.
    Raises AssertionError if the inner dimensions do not match.
    """
    rows_a, inner = a.shape
    rows_b, cols_b = b.shape
    assert inner==rows_b
    result = torch.zeros(rows_a, cols_b)
    for row in range(rows_a):
        for col in range(cols_b):
            # accumulate one scalar product at a time into the output cell
            for k in range(inner):
                result[row,col] += a[row,k] * b[k,col]
    return result

In [None]:
# Random operands for timing matmul; inner dimensions agree (28*28 == 784).
m1 = torch.randn(5,28*28)
m2 = torch.randn(784,10)

In [None]:
# Single timed run of the triple-loop matmul.
%time t1=matmul(m1, m2)

CPU times: user 1.15 s, sys: 4.09 ms, total: 1.15 s
Wall time: 1.15 s


In [None]:
# Same product with the built-in `@` operator, for comparison.
%timeit -n 20 t2=m1@m2

14 µs ± 8.95 µs per loop (mean ± std. dev. of 7 runs, 20 loops each)


In [None]:
# `+` on two vectors of equal length adds them element by element.
a = tensor([10., 6, -4])
b = tensor([2., 8, 7])
a + b

tensor([12., 14., 3.])

In [None]:
# Comparison is elementwise too and yields a boolean tensor.
a < b

tensor([False, True, True])

In [None]:
# `.all()` reduces each boolean tensor to a single 0-d boolean tensor.
(a < b).all(), (a==b).all()

(tensor(False), tensor(False))

In [None]:
# `.item()` turns the 0-d mean tensor into a plain Python number.
(a + b).mean().item()

9.666666984558105

In [None]:
# `*` between same-shaped matrices is elementwise (squares here), not a matrix product.
m = tensor([[1., 2, 3], [4,5,6], [7,8,9]])
m*m

tensor([[ 1., 4., 9.],
 [16., 25., 36.],
 [49., 64., 81.]])

In [None]:
# Intentional failure: a (3,3) and a (2,3) tensor cannot be multiplied elementwise
# (the recorded output is the resulting RuntimeError).
n = tensor([[1., 2, 3], [4,5,6]])
m*n

RuntimeError: The size of tensor a (3) must match the size of tensor b (2) at non-singleton dimension 0

In [None]:
def matmul(a,b):
    """Multiply two 2-D tensors, replacing the innermost loop by an elementwise product.

    Each output cell is the sum of row `i` of `a` times column `j` of `b`.
    Raises AssertionError if the inner dimensions do not match.
    """
    n_rows, n_inner = a.shape
    b_rows, n_cols = b.shape
    assert n_inner==b_rows
    out = torch.zeros(n_rows, n_cols)
    for i in range(n_rows):
        for j in range(n_cols):
            products = a[i] * b[:,j]
            out[i,j] = products.sum()
    return out

In [None]:
# Time the matmul whose innermost loop was replaced by an elementwise product and sum.
%timeit -n 20 t3 = matmul(m1,m2)

1.7 ms ± 88.1 µs per loop (mean ± std. dev. of 7 runs, 20 loops each)


### Broadcasting

#### Broadcasting with a scalar

In [None]:
# The scalar 0 is compared against every element of `a`.
a = tensor([10., 6, -4])
a > 0

tensor([ True, True, False])

In [None]:
# Scalars broadcast through arithmetic as well: subtract 5 from, then divide by 2.73, every element.
m = tensor([[1., 2, 3], [4,5,6], [7,8,9]])
(m - 5) / 2.73

tensor([[-1.4652, -1.0989, -0.7326],
 [-0.3663, 0.0000, 0.3663],
 [ 0.7326, 1.0989, 1.4652]])

#### Broadcasting a vector to a matrix

In [None]:
# A length-3 vector and a 3x3 matrix, used in the following broadcasting examples.
c = tensor([10.,20,30])
m = tensor([[1., 2, 3], [4,5,6], [7,8,9]])
m.shape,c.shape

(torch.Size([3, 3]), torch.Size([3]))

In [None]:
# `c` is added to each row of `m`.
m + c

tensor([[11., 22., 33.],
 [14., 25., 36.],
 [17., 28., 39.]])

In [None]:
# Show `c` expanded to the shape of `m`: the vector repeated along the rows.
c.expand_as(m)

tensor([[10., 20., 30.],
 [10., 20., 30.],
 [10., 20., 30.]])

In [None]:
# The expanded tensor's storage still holds only the three original values.
t = c.expand_as(m)
t.storage()

 10.0
 20.0
 30.0
[torch.FloatStorage of size 3]

In [None]:
# Stride 0 on the first axis: every row of the 3x3 view reads the same three values.
t.stride(), t.shape

((0, 1), torch.Size([3, 3]))

In [None]:
# Swapping the operands gives the same result as `m + c`.
c + m

tensor([[11., 22., 33.],
 [14., 25., 36.],
 [17., 28., 39.]])

In [None]:
# A length-3 vector also broadcasts against a 2x3 matrix (trailing dimensions match).
c = tensor([10.,20,30])
m = tensor([[1., 2, 3], [4,5,6]])
c+m

tensor([[11., 22., 33.],
 [14., 25., 36.]])

In [None]:
# Intentional failure: a length-2 vector does not match the trailing dimension (3) of `m`
# (the recorded output is the resulting RuntimeError).
c = tensor([10.,20])
m = tensor([[1., 2, 3], [4,5,6]])
c+m

RuntimeError: The size of tensor a (2) must match the size of tensor b (3) at non-singleton dimension 1

In [None]:
# `unsqueeze(1)` adds a trailing unit axis, turning `c` into a (3, 1) column.
c = tensor([10.,20,30])
m = tensor([[1., 2, 3], [4,5,6], [7,8,9]])
c = c.unsqueeze(1)
m.shape,c.shape

(torch.Size([3, 3]), torch.Size([3, 1]))

In [None]:
# With `c` as a column, row i of `m` is offset by c[i].
c+m

tensor([[11., 12., 13.],
 [24., 25., 26.],
 [37., 38., 39.]])

In [None]:
# Expanding the column still leaves only three values in storage.
t = c.expand_as(m)
t.storage()

 10.0
 20.0
 30.0
[torch.FloatStorage of size 3]

In [None]:
# Now the stride is 0 on the second axis: every column reads the same three values.
t.stride(), t.shape

((1, 0), torch.Size([3, 3]))

In [None]:
# `unsqueeze(0)` inserts a leading unit axis, `unsqueeze(1)` a trailing one.
c = tensor([10.,20,30])
c.shape, c.unsqueeze(0).shape,c.unsqueeze(1).shape

(torch.Size([3]), torch.Size([1, 3]), torch.Size([3, 1]))

In [None]:
# Indexing with `None` gives the same shapes as the `unsqueeze` calls.
c.shape, c[None,:].shape,c[:,None].shape

(torch.Size([3]), torch.Size([1, 3]), torch.Size([3, 1]))

In [None]:
# Trailing `:` can be omitted, and `...` stands for all preceding axes.
c[None].shape,c[...,None].shape

(torch.Size([1, 3]), torch.Size([3, 1]))

In [None]:
def matmul(a,b):
    """Multiply two 2-D tensors with a single Python loop over the rows of `a`.

    Row `i` of `a` is reshaped to a column and broadcast against all of `b`;
    summing over dim 0 then yields the whole output row at once. This replaces
    the per-cell `(a[i,:] * b[:,j]).sum()` of the two-loop version.
    Raises AssertionError if the inner dimensions do not match.
    """
    n_rows, n_inner = a.shape
    b_rows, n_cols = b.shape
    assert n_inner==b_rows
    out = torch.zeros(n_rows, n_cols)
    for i in range(n_rows):
        as_column = a[i].unsqueeze(-1)      # (n_inner,) -> (n_inner, 1)
        out[i] = (as_column * b).sum(dim=0)
    return out

In [None]:
# Time the single-loop, broadcasting matmul.
%timeit -n 20 t4 = matmul(m1,m2)

357 µs ± 7.2 µs per loop (mean ± std. dev. of 7 runs, 20 loops each)


#### Broadcasting Rules

### Einstein summation

In [None]:
def matmul(a,b):
    """Matrix product written as an Einstein summation over the shared index `k`."""
    product = torch.einsum('ik,kj->ij', a, b)
    return product

In [None]:
# Time the einsum-based matmul.
%timeit -n 20 t5 = matmul(m1,m2)

68.7 µs ± 4.06 µs per loop (mean ± std. dev. of 7 runs, 20 loops each)


## The forward and backward passes

### Defining and initializing a layer

In [None]:
def lin(x, w, b):
    """Linear layer: matrix-multiply `x` by `w`, then add `b`."""
    projected = x @ w
    return projected + b

In [None]:
# Random data: `x` is 200 rows of 100 values, `y` is one value per row.
x = torch.randn(200, 100)
y = torch.randn(200)

In [None]:
# Parameters for two linear layers (100 -> 50 -> 1); weights are unscaled standard normal, biases zero.
w1 = torch.randn(100,50)
b1 = torch.zeros(50)
w2 = torch.randn(50,1)
b2 = torch.zeros(1)

In [None]:
# Output of the first linear layer: one 50-vector per input row.
l1 = lin(x, w1, b1)
l1.shape

torch.Size([200, 50])

In [None]:
# With unscaled weights the recorded standard deviation of the activations is about 10.
l1.mean(), l1.std()

(tensor(0.0019), tensor(10.1058))

In [None]:
# 50 successive multiplications by unscaled random matrices; the recorded output is all nan.
x = torch.randn(200, 100)
for i in range(50): x = x @ torch.randn(100,100)
x[0:5,0:5]

tensor([[nan, nan, nan, nan, nan],
 [nan, nan, nan, nan, nan],
 [nan, nan, nan, nan, nan],
 [nan, nan, nan, nan, nan],
 [nan, nan, nan, nan, nan]])

In [None]:
# Same experiment with the weights scaled by 0.01; the recorded output has collapsed to zeros.
x = torch.randn(200, 100)
for i in range(50): x = x @ (torch.randn(100,100) * 0.01)
x[0:5,0:5]

tensor([[0., 0., 0., 0., 0.],
 [0., 0., 0., 0., 0.],
 [0., 0., 0., 0., 0.],
 [0., 0., 0., 0., 0.],
 [0., 0., 0., 0., 0.]])

In [None]:
# Scale 0.1 (which equals 1/sqrt(100)); the recorded values stay at a usable magnitude.
x = torch.randn(200, 100)
for i in range(50): x = x @ (torch.randn(100,100) * 0.1)
x[0:5,0:5]

tensor([[ 0.7554, 0.6167, -0.1757, -1.5662, 0.5644],
 [-0.1987, 0.6292, 0.3283, -1.1538, 0.5416],
 [ 0.6106, 0.2556, -0.0618, -0.9463, 0.4445],
 [ 0.4484, 0.7144, 0.1164, -0.8626, 0.4413],
 [ 0.3463, 0.5930, 0.3375, -0.9486, 0.5643]])

In [None]:
# Standard deviation after the 50 scaled layers.
x.std()

tensor(0.7042)

In [None]:
# Fresh random `x` and `y`, since `x` was overwritten by the experiments above.
x = torch.randn(200, 100)
y = torch.randn(200)

In [None]:
# Re-create the parameters with each weight matrix divided by sqrt of its number of rows (100, then 50).
from math import sqrt
w1 = torch.randn(100,50) / sqrt(100)
b1 = torch.zeros(50)
w2 = torch.randn(50,1) / sqrt(50)
b2 = torch.zeros(1)

In [None]:
# With the scaled weights the recorded mean is near 0 and the std near 1.
l1 = lin(x, w1, b1)
l1.mean(),l1.std()

(tensor(-0.0050), tensor(1.0000))

In [None]:
def relu(x):
    """Rectified linear unit: replace every negative entry of `x` with zero."""
    clipped = x.clamp_min(0.)
    return clipped

In [None]:
# After relu the recorded mean is no longer near 0 and the std is below 1.
l2 = relu(l1)
l2.mean(),l2.std()

(tensor(0.3961), tensor(0.5783))

In [None]:
# 50 linear+relu layers with scale 0.1; the recorded activations have shrunk to ~1e-8 or zero.
x = torch.randn(200, 100)
for i in range(50): x = relu(x @ (torch.randn(100,100) * 0.1))
x[0:5,0:5]

tensor([[0.0000e+00, 1.9689e-08, 4.2820e-08, 0.0000e+00, 0.0000e+00],
 [0.0000e+00, 1.6701e-08, 4.3501e-08, 0.0000e+00, 0.0000e+00],
 [0.0000e+00, 1.0976e-08, 3.0411e-08, 0.0000e+00, 0.0000e+00],
 [0.0000e+00, 1.8457e-08, 4.9469e-08, 0.0000e+00, 0.0000e+00],
 [0.0000e+00, 1.9949e-08, 4.1643e-08, 0.0000e+00, 0.0000e+00]])

In [None]:
# Same stack with scale sqrt(2/100); the recorded output keeps non-vanishing activations.
x = torch.randn(200, 100)
for i in range(50): x = relu(x @ (torch.randn(100,100) * sqrt(2/100)))
x[0:5,0:5]

tensor([[0.2871, 0.0000, 0.0000, 0.0000, 0.0026],
 [0.4546, 0.0000, 0.0000, 0.0000, 0.0015],
 [0.6178, 0.0000, 0.0000, 0.0180, 0.0079],
 [0.3333, 0.0000, 0.0000, 0.0545, 0.0000],
 [0.1940, 0.0000, 0.0000, 0.0000, 0.0096]])

In [None]:
# Fresh random `x` and `y` again, since `x` was overwritten by the experiments above.
x = torch.randn(200, 100)
y = torch.randn(200)

In [None]:
# Final parameters: weights scaled by sqrt(2 / number of rows), biases zero. Relies on `sqrt` imported earlier.
w1 = torch.randn(100,50) * sqrt(2 / 100)
b1 = torch.zeros(50)
w2 = torch.randn(50,1) * sqrt(2 / 50)
b2 = torch.zeros(1)

In [None]:
# Statistics of the first layer's activations after relu with the sqrt(2/n) weights.
l1 = lin(x, w1, b1)
l2 = relu(l1)
l2.mean(), l2.std()

(tensor(0.5661), tensor(0.8339))

In [None]:
def model(x):
    """Two-layer network: linear -> relu -> linear.

    Uses the notebook-level parameters `w1`, `b1`, `w2`, `b2`.
    """
    hidden = relu(lin(x, w1, b1))
    return lin(hidden, w2, b2)

In [None]:
# The model output keeps a trailing unit axis: shape (200, 1).
out = model(x)
out.shape

torch.Size([200, 1])

In [None]:
def mse(output, targ):
    """Mean squared error between `output` (last axis squeezed away) and `targ`."""
    err = output.squeeze(-1) - targ
    return err.pow(2).mean()

In [None]:
# Loss of the randomly initialised model on the random targets.
loss = mse(out, y)

### Gradients and backward pass

In [None]:
def mse_grad(inp, targ):
    """Store on `inp.g` the gradient of the mean squared error with respect to `inp`.

    The result gets a trailing unit axis and is divided by `inp.shape[0]`.
    """
    n = inp.shape[0]
    err = inp.squeeze() - targ
    inp.g = 2. * err.unsqueeze(-1) / n

In [None]:
def relu_grad(inp, out):
    """Store on `inp.g` the gradient flowing back through a relu.

    `out.g` is passed through where `inp` was positive and zeroed elsewhere.
    """
    positive_mask = (inp>0).float()
    inp.g = positive_mask * out.g

In [None]:
def lin_grad(inp, out, w, b):
    """Backward pass of `inp @ w + b`, given the upstream gradient in `out.g`.

    Writes the gradients to `inp.g`, `w.g` and `b.g`.
    """
    upstream = out.g
    inp.g = upstream @ w.t()      # gradient with respect to the input
    w.g = inp.t() @ upstream      # gradient with respect to the weights
    b.g = upstream.sum(0)         # bias gradient: sum over the first axis

### Sidebar: SymPy

In [None]:
# Symbolic differentiation with SymPy: derivative of sx**2 with respect to sx.
from sympy import symbols,diff
sx,sy = symbols('sx sy')
diff(sx**2, sx)

2*sx

### End sidebar

In [None]:
def forward_and_backward(inp, targ):
    """Run one forward pass and one backward pass of the two-layer net.

    Uses the notebook-level `w1`, `b1`, `w2`, `b2`. Returns nothing; the
    gradients are left on the `.g` attribute of the input, the intermediate
    activations and the parameters by the `*_grad` helpers.
    """
    # forward pass
    pre_act = inp @ w1 + b1
    act = relu(pre_act)
    out = act @ w2 + b2
    # the loss value is computed here but is not needed by the backward pass
    loss = mse(out, targ)

    # backward pass: visit the layers in reverse order
    mse_grad(out, targ)
    lin_grad(act, out, w2, b2)
    relu_grad(pre_act, act)
    lin_grad(inp, pre_act, w1, b1)

### Refactor the model

In [None]:
class Relu():
    """Relu layer that remembers its last input and output for the backward pass."""

    def __call__(self, inp):
        """Clamp `inp` at zero, caching both `inp` and the result on the instance."""
        self.inp, self.out = inp, inp.clamp_min(0.)
        return self.out

    def backward(self):
        """Set `self.inp.g` from `self.out.g`, zeroed where the input was not positive."""
        mask = (self.inp>0).float()
        self.inp.g = mask * self.out.g

In [None]:
class Lin():
    """Linear layer `inp @ w + b` that caches its input and output for backward."""

    def __init__(self, w, b):
        self.w = w
        self.b = b

    def __call__(self, inp):
        """Apply the layer, storing `inp` and the result on the instance."""
        self.inp = inp
        self.out = inp@self.w + self.b
        return self.out

    def backward(self):
        """Propagate `self.out.g` to `self.inp.g`, `self.w.g` and `self.b.g`."""
        upstream = self.out.g
        self.inp.g = upstream @ self.w.t()
        self.w.g = self.inp.t() @ upstream
        self.b.g = upstream.sum(0)

In [None]:
class Mse():
    """Mean squared error loss that caches its inputs for the backward pass."""

    def __call__(self, inp, targ):
        """Return the mean of the squared differences between squeezed `inp` and `targ`."""
        self.inp, self.targ = inp, targ
        err = inp.squeeze() - targ
        self.out = err.pow(2).mean()
        return self.out

    def backward(self):
        """Set `self.inp.g` to the gradient of the loss with respect to the input."""
        err = (self.inp.squeeze()-self.targ).unsqueeze(-1)
        n = self.targ.shape[0]
        self.inp.g = 2.*err/n

In [None]:
class Model():
    """Two-layer network (Lin -> Relu -> Lin) with a mean squared error loss."""

    def __init__(self, w1, b1, w2, b2):
        self.layers = [Lin(w1,b1), Relu(), Lin(w2,b2)]
        self.loss = Mse()

    def __call__(self, x, targ):
        """Feed `x` through every layer in order and return the loss against `targ`."""
        for layer in self.layers:
            x = layer(x)
        return self.loss(x, targ)

    def backward(self):
        """Backpropagate from the loss through the layers in reverse order."""
        self.loss.backward()
        for layer in self.layers[::-1]:
            layer.backward()

In [None]:
# NOTE: rebinds `model`, which earlier in the notebook named the plain function `model(x)`.
model = Model(w1, b1, w2, b2)

In [None]:
# Forward pass: calling the Model instance returns the loss.
loss = model(x, y)

In [None]:
# Backward pass: uses the inputs and outputs cached by the forward call above.
model.backward()

### Going to PyTorch

In [None]:
class LayerFunction():
    """Base class for layers that cache their call arguments and output.

    Subclasses implement `forward(*args)` and `bwd(out, *args)`. Calling the
    instance runs `forward`, keeping the arguments in `self.args` and the
    result in `self.out`; `backward()` then hands both to `bwd`.

    Raises:
        NotImplementedError: if `forward` or `bwd` has not been overridden.
    """
    def __call__(self, *args):
        self.args = args
        self.out = self.forward(*args)
        return self.out

    # The stubs accept the same arguments this class passes to them, so an
    # un-overridden method reports 'not implemented' instead of failing with a
    # TypeError about the argument count. NotImplementedError is a subclass of
    # Exception, so code catching Exception keeps working.
    def forward(self, *args): raise NotImplementedError('not implemented')
    def bwd(self, out, *args): raise NotImplementedError('not implemented')
    def backward(self): self.bwd(self.out, *self.args)

In [None]:
class Relu(LayerFunction):
    """Relu expressed as a `LayerFunction`: only `forward` and `bwd` are defined here."""

    def forward(self, inp):
        return inp.clamp_min(0.)

    def bwd(self, out, inp):
        # pass the upstream gradient through only where the input was positive
        mask = (inp>0).float()
        inp.g = mask * out.g

In [None]:
class Lin(LayerFunction):
    """Linear layer (`inp @ w + b`) expressed as a `LayerFunction`.

    `w` and `b` are stored on the instance; `bwd` writes the gradients to
    `inp.g`, `self.w.g` and `self.b.g`.
    """
    def __init__(self, w, b): self.w,self.b = w,b

    def forward(self, inp): return inp@self.w + self.b

    def bwd(self, out, inp):
        """Propagate the upstream gradient `out.g` to the input, weights and bias."""
        inp.g = out.g @ self.w.t()
        # Use the `inp`/`out` handed in by `LayerFunction.backward`: the base class
        # stores `self.args` and `self.out` but never sets `self.inp`.
        self.w.g = inp.t() @ out.g
        self.b.g = out.g.sum(0)

In [None]:
class Mse(LayerFunction):
    """Mean squared error expressed as a `LayerFunction`."""

    def forward(self, inp, targ):
        err = inp.squeeze() - targ
        return err.pow(2).mean()

    def bwd(self, out, inp, targ):
        # `out` (the loss value) is not needed to compute the gradient
        err = (inp.squeeze()-targ).unsqueeze(-1)
        inp.g = 2*err / targ.shape[0]
In [None]:
from torch.autograd import Function

class MyRelu(Function):
    """Relu written as a custom `torch.autograd.Function`."""

    @staticmethod
    def forward(ctx, i):
        """Clamp `i` at zero and save `i` on `ctx` for the backward pass."""
        ctx.save_for_backward(i)
        return i.clamp_min(0.)

    @staticmethod
    def backward(ctx, grad_output):
        """Return `grad_output`, zeroed where the saved input was not positive."""
        (saved_inp,) = ctx.saved_tensors
        mask = (saved_inp>0).float()
        return grad_output * mask

In [None]:
import torch.nn as nn

class LinearLayer(nn.Module):
    """Linear layer as an `nn.Module`.

    The weight is stored as (n_out, n_in), drawn from a standard normal and
    scaled by sqrt(2/n_in); the bias starts at zero. Both are `nn.Parameter`s.
    """

    def __init__(self, n_in, n_out):
        super().__init__()
        scale = sqrt(2/n_in)
        self.weight = nn.Parameter(torch.randn(n_out, n_in) * scale)
        self.bias = nn.Parameter(torch.zeros(n_out))

    def forward(self, x):
        """Return `x` times the transposed weight, plus the bias."""
        return x @ self.weight.t() + self.bias

In [None]:
# The two registered parameters are the (2, 10) weight and the (2,) bias.
# NOTE: this rebinds `lin`, shadowing the earlier `lin(x, w, b)` function.
lin = LinearLayer(10,2)
p1,p2 = lin.parameters()
p1.shape,p2.shape

(torch.Size([2, 10]), torch.Size([2]))

In [None]:
class Model(nn.Module):
    """Two-layer network built from PyTorch layers; the forward pass returns the loss."""

    def __init__(self, n_in, nh, n_out):
        super().__init__()
        hidden = nn.Linear(n_in,nh)
        head = nn.Linear(nh,n_out)
        self.layers = nn.Sequential(hidden, nn.ReLU(), head)
        self.loss = mse

    def forward(self, x, targ):
        """Run `x` through the layers, squeeze the predictions, and return the loss against `targ`."""
        preds = self.layers(x).squeeze()
        return self.loss(preds, targ)

In [None]:
class Model(Module):
    """Same two-layer network, derived from `Module` and without a `super().__init__()` call.

    NOTE(review): `Module` is not defined in the visible cells. Presumably it is
    fastai's `Module`, which does not need the explicit `super().__init__()`;
    confirm that it is actually in scope here.
    """

    def __init__(self, n_in, nh, n_out):
        hidden = nn.Linear(n_in,nh)
        head = nn.Linear(nh,n_out)
        self.layers = nn.Sequential(hidden, nn.ReLU(), head)
        self.loss = mse

    def forward(self, x, targ):
        """Run `x` through the layers, squeeze the predictions, and return the loss against `targ`."""
        preds = self.layers(x).squeeze()
        return self.loss(preds, targ)

## Things to remember

## Questionnaire

### Further research