nnet.py

   1 # based on https://www.kdnuggets.com/2019/08/numpy-neural-networks-computational-graphs.html
   2 import numpy as np
   3
   4 # use a constant seed to keep things reproducible
   5 rg = np.random.default_rng(1)
   6
   7
   8 class LinearLayer:
   9     '''
  10     ini_type: initialization type for weight parameters: plain, xavier, or he
  11     '''
  12     def __init__(self, input_shape, n_out, ini_type="plain"):
  13         self.m = input_shape[1]  # number of examples in training data
  14
  15         # initialize weights
  16         n_in = input_shape[0]
  17         if ini_type == 'plain':
  18             self.W = rg.standard_normal(size=(n_out, n_in)) * 0.01  # set weights 'W' to small random gaussian
  19         elif ini_type == 'xavier':
  20             self.W = rg.standard_normal(size=(n_out, n_in)) / (np.sqrt(n_in))  # set variance of W to 1/n
  21         elif ini_type == 'he':
  22             # Good when ReLU used in hidden layers
  23             # Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification
  24             # Kaiming He et al. (https://arxiv.org/abs/1502.01852)
  25             # http: // cs231n.github.io / neural - networks - 2 /  # init
  26             self.W = rg.standard_normal(size=(n_out, n_in)) * np.sqrt(2/n_in)  # set variance of W to 2/n
  27
  28         self.b = np.zeros((n_out, 1))
  29         self.Z = np.zeros((self.W.shape[0], input_shape[1]))
  30
  31     def forward(self, A_prev):
  32         self.A_prev = A_prev
  33         self.Z = self.W @ self.A_prev + self.b
  34         return self.Z
  35
  36     def backward(self, upstream_grad):
  37         # derivative of Cost w.r.t W
  38         self.dW = upstream_grad @ self.A_prev.T
  39         # derivative of Cost w.r.t b, sum across rows
  40         self.db = np.sum(upstream_grad, axis=1, keepdims=True)
  41         # derivative of Cost w.r.t A_prev
  42         self.dA_prev = self.W.T @ upstream_grad
  43         return self.dA_prev
  44
  45     def update_params(self, learning_rate=0.1):
  46         self.W -= learning_rate * self.dW
  47         self.b -= learning_rate * self.db
  48
  49
  50 class SigmoidLayer:
  51     def __init__(self, shape):
  52         self.A = np.zeros(shape)
  53
  54     def forward(self, Z):
  55         self.A = 1 / (1 + np.exp(-Z))  # compute activations
  56         return self.A
  57
  58     def backward(self, upstream_grad):
  59         # couple upstream gradient with local gradient, the result will be sent back to the Linear layer
  60         self.dZ = upstream_grad * self.A * (1 - self.A)
  61         return self.dZ
  62
  63     def update_params(self, learning_rate=0.1):
  64         pass
  65
  66
  67 def label_vectors(labels, n):
  68     y = np.zeros((n, labels.size))
  69     for i, l in enumerate(labels):
  70         y[l][i] = 1.0
  71     return y
  72
  73
  74 def forward(layers, X):
  75     assert X.shape[1] == layers[0].m, f'input length {X.shape[1]} does not match first layer width {layers[0].m}'
  76     cur = X
  77     for layer in layers:
  78         cur = layer.forward(cur)
  79     return cur
  80
  81
  82 def classify(y):
  83     # the recognized digit is the index of the highest-valued output neuron
  84     return np.argmax(y, axis=0), np.max(y, axis=0)
  85
  86
  87 def accuracy(layers, X, labels):
  88     '''Count percentage of test inputs which are being recognized correctly'''
  89
  90     assert X.shape[1] == layers[0].m, f'input length {X.shape[1]} does not match first layer width {layers[0].m}'
  91     assert layers[0].m == labels.size, f'first layer width {layers[0].m} does not match number of labels {labels.size}'
  92     output = forward(layers, X)
  93     classes = classify(output)[0]
  94     return 100 * (np.sum(classes == labels) / classes.size)
  95
  96
  97 def cost_sqe(Y, output):
  98     '''
  99     This function computes and returns the Cost and its derivative.
 100     The is function uses the Squared Error Cost function -> (1/2m)*sum(Y - output)^2
 101     Args:
 102         Y: label vectors of data
 103         output: Predictions(activations) from a last layer, the output layer
 104     Returns:
 105         cost: The Squared Error Cost result
 106         dOutput: gradient of Cost w.r.t the output
 107     '''
 108     m = Y.shape[1]
 109
 110     cost = (1 / (2 * m)) * np.sum(np.square(Y - output))
 111     cost = np.squeeze(cost)  # remove extraneous dimensions to give just a scalar
 112
 113     dOutput = -1 / m * (Y - output)  # derivative of the squared error cost function
 114     return cost, dOutput
 115
 116
 117 def train(layers, X, Y, learning_rate=0.1, cost_fn=cost_sqe):
 118     output = forward(layers, X)
 119     cost, dOutput = cost_fn(Y, output)
 120
 121     cur = dOutput
 122     for layer in reversed(layers):
 123         cur = layer.backward(cur)
 124         layer.update_params(learning_rate)
 125
 126     return cost