# based on https://www.kdnuggets.com/2019/08/numpy-neural-networks-computational-graphs.html
import numpy as np

# use a constant seed to keep things reproducible
rg = np.random.default_rng(1)


class Linear:
    '''A fully connected layer computing W @ A_prev + b.

    input_shape: shape (n_in, m) of the activations fed into this layer
    n_out: number of neurons in this layer
    ini_type: initialization type for weight parameters: plain, xavier, or he
    '''
    def __init__(self, input_shape, n_out, ini_type="plain"):
        n_in = input_shape[0]
        if ini_type == 'plain':
            self.W = rg.standard_normal(size=(n_out, n_in)) * 0.01  # set weights 'W' to small random gaussian
        elif ini_type == 'xavier':
            self.W = rg.standard_normal(size=(n_out, n_in)) / np.sqrt(n_in)  # set variance of W to 1/n
        elif ini_type == 'he':
            # Good when ReLU used in hidden layers
            # Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification
            # Kaiming He et al. (https://arxiv.org/abs/1502.01852)
            # http://cs231n.github.io/neural-networks-2/#init
            self.W = rg.standard_normal(size=(n_out, n_in)) * np.sqrt(2 / n_in)  # set variance of W to 2/n
        else:
            raise ValueError(f"unknown ini_type '{ini_type}'")
        self.b = np.zeros((n_out, 1))  # biases start at zero
        self.shape = (self.W.shape[0], input_shape[1])  # output shape: (n_out, m)

    def forward(self, A_prev):
        self.A_prev = A_prev  # cache the input activations for use in backward
        return self.W @ self.A_prev + self.b

    def backward(self, upstream_grad, learning_rate=0.1):
        # derivative of Cost w.r.t W
        dW = upstream_grad @ self.A_prev.T
        # derivative of Cost w.r.t b, sum across rows
        db = np.sum(upstream_grad, axis=1, keepdims=True)
        # derivative of Cost w.r.t A_prev
        dA_prev = self.W.T @ upstream_grad
        # update parameters with a plain gradient-descent step
        self.W -= learning_rate * dW
        self.b -= learning_rate * db
        return dA_prev
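
# For example (illustrative only, not from the original source): with inputs X
# of shape (784, m), Linear(X.shape, 10) builds W of shape (10, 784) and b of
# shape (10, 1), so forward(X) returns a (10, m) matrix of pre-activations.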


class Sigmoid:
    '''Element-wise sigmoid activation layer'''
    def __init__(self, shape):
        self.shape = shape  # expected shape of the incoming pre-activations

    def forward(self, Z):
        assert Z.shape == self.shape
        self.A = 1 / (1 + np.exp(-Z))  # compute activations
        return self.A

    def backward(self, upstream_grad, learning_rate=0.1):
        # couple upstream gradient with local gradient; the result will be sent back to the Linear layer
        # sigmoid'(Z) = A * (1 - A)
        return upstream_grad * self.A * (1 - self.A)


class ReLU:
    '''Element-wise rectified linear activation layer'''
    def __init__(self, shape):
        self.shape = shape  # expected shape of the incoming pre-activations

    def forward(self, Z):
        assert Z.shape == self.shape
        self.A = np.maximum(Z, 0)  # compute activations
        return self.A

    def backward(self, upstream_grad, learning_rate=0.1):
        # couple upstream gradient with local gradient; the result will be sent back to the Linear layer
        # ReLU'(Z) is 1 where Z > 0 and 0 elsewhere; since A == 0 exactly where
        # Z <= 0, heaviside's second argument must be 0 (not 1), or inactive
        # units would wrongly pass gradient through
        return upstream_grad * np.heaviside(self.A, 0)


def label_vectors(labels, n):
    '''Turn an array of integer labels into one-hot column vectors of length n'''
    y = np.zeros((n, labels.size))
    for i, l in enumerate(labels):
        y[l][i] = 1.0
    return y
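
# For instance, label_vectors(np.array([2, 0]), 3) yields
# [[0., 1.],
#  [0., 0.],
#  [1., 0.]] -- one one-hot column per input label.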


def forward(layers, X):
    '''Run the columns of X through all layers and return the final activations'''
    assert X.shape[1] == layers[0].shape[1], f'number of inputs {X.shape[1]} does not match the batch size {layers[0].shape[1]} the first layer was built for'
    cur = X
    for layer in layers:
        cur = layer.forward(cur)
    return cur


def classify(y):
    # the recognized digit is the index of the highest-valued output neuron
    return np.argmax(y, axis=0), np.max(y, axis=0)
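
# For example, classify(np.array([[0.1, 0.8], [0.9, 0.2]])) returns
# (array([1, 0]), array([0.9, 0.8])): the winning row index and its score per column.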


def accuracy(layers, X, labels):
    '''Count percentage of test inputs which are recognized correctly'''
    assert X.shape[1] == layers[0].shape[1], f'number of inputs {X.shape[1]} does not match the batch size {layers[0].shape[1]} the first layer was built for'
    assert layers[0].shape[1] == labels.size, f'batch size {layers[0].shape[1]} does not match number of labels {labels.size}'
    output = forward(layers, X)
    classes = classify(output)[0]
    return 100 * (np.sum(classes == labels) / classes.size)


def cost_sqe(Y, output):
    '''
    This function computes and returns the Cost and its derivative.
    This function uses the squared error cost function -> (1/(2*m)) * sum((Y - output)^2)
    Args:
        Y: label vectors of data
        output: predictions (activations) from the last layer, the output layer
    Returns:
        cost: the squared error cost result
        dOutput: gradient of Cost w.r.t the output
    '''
    m = Y.shape[1]  # number of samples (columns)
    cost = (1 / (2 * m)) * np.sum(np.square(Y - output))
    cost = np.squeeze(cost)  # remove extraneous dimensions to give just a scalar

    dOutput = -1 / m * (Y - output)  # derivative of the squared error cost function
    return cost, dOutput
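
# A quick finite-difference sanity check of cost_sqe's gradient (a sketch, not
# part of the original code; the helper name and tolerances are illustrative):
def _check_cost_grad(eps=1e-6):
    Y = rg.standard_normal(size=(3, 4))
    output = rg.standard_normal(size=(3, 4))
    _, dOutput = cost_sqe(Y, output)
    numeric = np.zeros_like(output)
    for idx in np.ndindex(*output.shape):
        bumped = output.copy()
        bumped[idx] += eps
        # forward difference; a centered difference would be more accurate
        numeric[idx] = (cost_sqe(Y, bumped)[0] - cost_sqe(Y, output)[0]) / eps
    assert np.allclose(numeric, dOutput, atol=1e-4)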


def train(layers, X, Y, learning_rate=0.1, cost_fn=cost_sqe):
    '''Run one full-batch gradient-descent step over all layers and return the cost'''
    output = forward(layers, X)
    cost, dOutput = cost_fn(Y, output)
    # backpropagate: each layer turns the upstream gradient into the gradient
    # w.r.t its own input, updating its parameters along the way
    cur = dOutput
    for layer in reversed(layers):
        cur = layer.backward(cur, learning_rate)
    return cost
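

# A minimal usage sketch (not from the original source): train a tiny
# Linear->ReLU->Linear->Sigmoid network on random data just to show how the
# pieces fit together; the sizes, epoch count, and learning rate are arbitrary.
if __name__ == '__main__':
    n_in, n_hidden, n_classes, m = 8, 16, 3, 32
    X = rg.standard_normal(size=(n_in, m))
    labels = rg.integers(0, n_classes, size=m)
    Y = label_vectors(labels, n_classes)

    l1 = Linear(X.shape, n_hidden, ini_type='he')
    l2 = Linear(l1.shape, n_classes, ini_type='xavier')
    layers = [l1, ReLU(l1.shape), l2, Sigmoid(l2.shape)]

    for epoch in range(200):
        cost = train(layers, X, Y, learning_rate=0.5)
    print(f'cost {cost:.4f}, training accuracy {accuracy(layers, X, labels):.1f}%')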