--- /dev/null
+# based on https://www.kdnuggets.com/2019/08/numpy-neural-networks-computational-graphs.html
+import numpy as np
+
+# use a constant seed to keep things reproducible
+rg = np.random.default_rng(1)
+
+
+class LinearLayer:
+    '''
+    input_shape: (n_in, m) -- number of inputs per example and number of examples per batch
+    n_out: number of output units
+    ini_type: initialization type for weight parameters: plain, xavier, or he
+    '''
+ def __init__(self, input_shape, n_out, ini_type="plain"):
+        self.m = input_shape[1]  # number of examples per batch (fixed when the layer is created)
+
+ # initialize weights
+ n_in = input_shape[0]
+ if ini_type == 'plain':
+ self.W = rg.standard_normal(size=(n_out, n_in)) * 0.01 # set weights 'W' to small random gaussian
+ elif ini_type == 'xavier':
+ self.W = rg.standard_normal(size=(n_out, n_in)) / (np.sqrt(n_in)) # set variance of W to 1/n
+ elif ini_type == 'he':
+ # Good when ReLU used in hidden layers
+ # Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification
+ # Kaiming He et al. (https://arxiv.org/abs/1502.01852)
+            # http://cs231n.github.io/neural-networks-2/#init
+            self.W = rg.standard_normal(size=(n_out, n_in)) * np.sqrt(2/n_in)  # set variance of W to 2/n
+        else:
+            raise ValueError(f'unknown ini_type: {ini_type}')
+
+ self.b = np.zeros((n_out, 1))
+        self.Z = np.zeros((self.W.shape[0], input_shape[1]))  # pre-allocated so the next layer can be sized from Z.shape
+
+ def forward(self, A_prev):
+        self.A_prev = A_prev  # cache the input for the backward pass
+ self.Z = self.W @ self.A_prev + self.b
+ return self.Z
+
+ def backward(self, upstream_grad):
+ # derivative of Cost w.r.t W
+ self.dW = upstream_grad @ self.A_prev.T
+ # derivative of Cost w.r.t b, sum across rows
+ self.db = np.sum(upstream_grad, axis=1, keepdims=True)
+ # derivative of Cost w.r.t A_prev
+ self.dA_prev = self.W.T @ upstream_grad
+ return self.dA_prev
+
+ def update_params(self, learning_rate=0.1):
+ self.W -= learning_rate * self.dW
+ self.b -= learning_rate * self.db
+
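+# A minimal usage sketch of LinearLayer (illustrative only; the shapes here are
+# assumptions, not the ones used by the training script):
+#   layer = LinearLayer(input_shape=(4, 8), n_out=3, ini_type='he')
+#   Z = layer.forward(rg.standard_normal(size=(4, 8)))  # Z has shape (3, 8)
+#   layer.backward(np.ones_like(Z))         # fills layer.dW, layer.db; returns dA_prev of shape (4, 8)
+#   layer.update_params(learning_rate=0.1)  # one gradient-descent step on W and b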
+
+class SigmoidLayer:
+ def __init__(self, shape):
+ self.A = np.zeros(shape)
+
+ def forward(self, Z):
+ self.A = 1 / (1 + np.exp(-Z)) # compute activations
+ return self.A
+
+ def backward(self, upstream_grad):
+        # chain the upstream gradient with the local sigmoid gradient A * (1 - A); the result is passed back to the preceding linear layer
+ self.dZ = upstream_grad * self.A * (1 - self.A)
+ return self.dZ
+
+ def update_params(self, learning_rate=0.1):
+ pass
+
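+
+# Illustrative addition (not used by the training script): a ReLU activation layer
+# with the same forward/backward/update_params interface as SigmoidLayer, sketched
+# here because the 'he' initializer above is intended for ReLU hidden layers.
+class ReLULayer:
+    def __init__(self, shape):
+        self.A = np.zeros(shape)
+
+    def forward(self, Z):
+        self.A = np.maximum(Z, 0)  # compute activations
+        return self.A
+
+    def backward(self, upstream_grad):
+        # local gradient is 1 where the activation is positive, 0 elsewhere
+        return upstream_grad * (self.A > 0)
+
+    def update_params(self, learning_rate=0.1):
+        pass
+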
+
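+# label_vectors builds one one-hot column per label; e.g. label_vectors(np.array([2, 0]), 3)
+# returns the (3, 2) matrix
+# [[0., 1.],
+#  [0., 0.],
+#  [1., 0.]]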
+def label_vectors(labels, n):
+ y = np.zeros((n, labels.size))
+ for i, l in enumerate(labels):
+ y[l][i] = 1.0
+ return y
+
+
+def forward(layers, X):
+    '''Run a forward pass through all layers and return the output activations'''
+    assert X.shape[1] == layers[0].m, f'number of examples {X.shape[1]} does not match the first layer batch size {layers[0].m}'
+ cur = X
+ for layer in layers:
+ cur = layer.forward(cur)
+ return cur
+
+
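+# classify works column-wise; e.g. classify(np.array([[0.1, 0.8],
+#                                                     [0.9, 0.2]]))
+# returns (array([1, 0]), array([0.9, 0.8])): the first column's highest activation
+# is at index 1 (value 0.9), the second column's at index 0 (value 0.8).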
+def classify(y):
+ # the recognized digit is the index of the highest-valued output neuron
+ return np.argmax(y, axis=0), np.max(y, axis=0)
+
+
+def accuracy(layers, X, labels):
+    '''Return the percentage of inputs in X that are recognized correctly'''
+
+    assert X.shape[1] == layers[0].m, f'number of examples {X.shape[1]} does not match the first layer batch size {layers[0].m}'
+    assert layers[0].m == labels.size, f'first layer batch size {layers[0].m} does not match number of labels {labels.size}'
+ output = forward(layers, X)
+ classes = classify(output)[0]
+ return 100 * (np.sum(classes == labels) / classes.size)
+
+
+def cost_sqe(Y, output):
+ '''
+    Compute and return the cost and its derivative.
+    Uses the squared error cost function: (1/(2*m)) * sum((Y - output)^2)
+    Args:
+        Y: label vectors of the data
+        output: predictions (activations) from the last layer, the output layer
+    Returns:
+        cost: the squared error cost
+        dOutput: gradient of the cost w.r.t. the output
+ '''
+ m = Y.shape[1]
+
+ cost = (1 / (2 * m)) * np.sum(np.square(Y - output))
+ cost = np.squeeze(cost) # remove extraneous dimensions to give just a scalar
+
+ dOutput = -1 / m * (Y - output) # derivative of the squared error cost function
+ return cost, dOutput
+
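+# Quick numeric check of cost_sqe (illustrative): for Y = np.array([[1.0]]) and
+# output = np.array([[0.75]]), m = 1, so cost = 0.5 * 0.25**2 = 0.03125 and dOutput = [[-0.25]].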
+
+def train(layers, X, Y, learning_rate=0.1, cost_fn=cost_sqe):
+    '''Run one forward pass, backward pass, and parameter update for a batch; return the cost'''
+ output = forward(layers, X)
+ cost, dOutput = cost_fn(Y, output)
+
+ cur = dOutput
+ for layer in reversed(layers):
+ cur = layer.backward(cur)
+ layer.update_params(learning_rate)
+
+ return cost
#!/usr/bin/python3
-import numpy as np
-
import mnist
-
-# use a constant seed to keep things reproducible
-rg = np.random.default_rng(1)
-
-# transfer functions
-
-# https://en.wikipedia.org/wiki/Sigmoid_function
-# classic, differentiable, apparently worse for training
-def sigmoid(x):
- return 1 / (1 + np.exp(-x))
-
-
-def sigmoid_prime(x):
- return sigmoid(x) * (1 - sigmoid(x))
-
-
-# https://en.wikipedia.org/wiki/Rectifier_(neural_networks)
-# mostly preferred these days, not differentiable at 0, but slope can be defined arbitrarily as 0 or 1 at 0
-def reLU(x):
- return np.maximum(x, 0)
-
-
-def reLU_prime(x):
- return np.heaviside(x, 1)
-
+import nnet
train_images, train_labels, rows, cols = mnist.load('train-images-idx3-ubyte', 'train-labels-idx1-ubyte')
test_images, test_labels, rows2, cols2 = mnist.load('t10k-images-idx3-ubyte', 't10k-labels-idx1-ubyte')
assert rows == rows2
assert cols == cols2
+num_train = train_images.shape[1]
+nnet_batch = 10000  # number of examples per forward/backward pass; the layer shapes below are fixed to this batch size
# neural network structure: two hidden layers, one output layer
-SIZES = (rows * cols, 20, 16, 10)
-NUM_LAYERS = len(SIZES)
-
-# initialize weight matrices and bias vectors with random numbers
-weights = []
-biases = []
-for i in range(1, NUM_LAYERS):
- weights.append(rg.normal(size=(SIZES[i], SIZES[i-1])))
- biases.append(rg.normal(scale=10, size=SIZES[i]))
-
-
-def feed_forward(x, transfer=sigmoid):
- '''Compute all z and output vectors for given input vector'''
-
- a_s = [x]
- z_s = []
- for w, b in zip(weights, biases):
- x = w @ x + b
- z_s.append(x)
- a_s.append(transfer(x))
- return (z_s, a_s)
-
-
-def classify(y):
- # the recognized digit is the index of the highest-valued output neuron
- return np.argmax(y), np.max(y)
-
-
-def cost_grad(x, target_y, transfer=sigmoid, transfer_prime=sigmoid_prime):
- '''Return (∂C/∂w, ∂C/∂b) for a particular input and desired output vector'''
-
- # forward pass, remember all z vectors and activations for every layer
- z_s, a_s = feed_forward(x, transfer)
-
- # backward pass
- deltas = [None] * len(weights) # delta = dC/dz error for each layer
- # insert the last layer error
- deltas[-1] = transfer_prime(z_s[-1]) * 2 * (a_s[-1] - target_y)
- for i in reversed(range(len(deltas) - 1)):
- deltas[i] = (weights[i + 1].T @ deltas[i + 1]) * transfer_prime(z_s[i])
-
- dw = [d @ a_s[i+1] for i, d in enumerate(deltas)]
- db = deltas
- return dw, db
-
-
-def label_vector(label):
- x = np.zeros(10)
- x[label] = 1.0
- return x
-
-
-def backpropagate(image_batch, label_batch, eta):
- '''Update NN with gradient descent and backpropagation to a batch of inputs
-
- eta is the learning rate.
- '''
- global weights, biases
-
- num_images = image_batch.shape[1]
- for i in range(num_images):
- y = label_vector(label_batch[i])
- dws, dbs = cost_grad(image_batch[:, i], y)
- weights = [w + eta * dw for w, dw in zip(weights, dws)]
- biases = [b + eta * db for b, db in zip(biases, dbs)]
-
-
-def train(images, labels, eta, batch_size=100):
- '''Do backpropagation for smaller batches
-
- This greatly speeds up the learning process, at the expense of finding a more erratic path to the local minimum.
- '''
- num_images = images.shape[1]
- offset = 0
- while offset < num_images:
- images_batch = images[:, offset:offset + batch_size]
- labels_batch = labels[offset:offset + batch_size]
- backpropagate(images_batch, labels_batch, eta)
- offset += batch_size
-
-
-def test():
- """Count percentage of test inputs which are being recognized correctly"""
-
- good = 0
- num_images = test_images.shape[1]
- for i in range(num_images):
- # the recognized digit is the index of the highest-valued output neuron
- y = classify(feed_forward(test_images[:, i])[1][-1])[0]
- good += int(y == test_labels[i])
- return 100 * (good / num_images)
-
-
-res = feed_forward(test_images[:, 0])
-print(f'output vector of first image: {res[1][-1]}')
-digit, conf = classify(res[1][-1])
+# (input) --> [Linear->Sigmoid] -> [Linear->Sigmoid] -> [Linear->Sigmoid] --> (output)
+# handle 10,000 vectors at a time
+Z1 = nnet.LinearLayer(input_shape=(rows * cols, nnet_batch), n_out=20)
+A1 = nnet.SigmoidLayer(Z1.Z.shape)
+Z2 = nnet.LinearLayer(input_shape=A1.A.shape, n_out=16)
+A2 = nnet.SigmoidLayer(Z2.Z.shape)
+ZO = nnet.LinearLayer(input_shape=A2.A.shape, n_out=10)
+AO = nnet.SigmoidLayer(ZO.Z.shape)
+net = (Z1, A1, Z2, A2, ZO, AO)
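+# Resulting shapes (assuming the usual 28x28 MNIST images, i.e. rows * cols == 784):
+# Z1.W is (20, 784), Z2.W is (16, 20), ZO.W is (10, 16); each forward pass maps a
+# (784, 10000) batch of image vectors to a (10, 10000) batch of output vectors.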
+
+res = nnet.forward(net, test_images[:, 0:nnet_batch])
+print(f'output vector of first image: {res[:, 0]}')
+digit, conf = nnet.classify(res[:, 0])
print(f'classification of first image: {digit} with confidence {conf}; real label {test_labels[0]}')
-print(f'correctly recognized images after initialization: {test()}%')
-
-for i in range(1):
- print(f"round #{i} of learning...")
- train(test_images, test_labels, 1)
-
-print(f'correctly recognized images: {test()}%')
+print(f'correctly recognized images after initialization: {nnet.accuracy(net, test_images, test_labels)}%')
+
+train_y = nnet.label_vectors(train_labels, 10)
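+# train in batches of nnet_batch images; each round sweeps the whole training set once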
+for i in range(100):
+ for batch in range(0, num_train, nnet_batch):
+ cost = nnet.train(net, train_images[:, batch:(batch + nnet_batch)], train_y[:, batch:(batch + nnet_batch)])
+ print(f'cost after training round {i}: {cost}')
+print(f'correctly recognized images after training: {nnet.accuracy(net, test_images, test_labels)}%')