From 1de3cdb5ecba32a8a3b0a02bbf71e883383a689d Mon Sep 17 00:00:00 2001
From: Martin Pitt
Date: Sat, 29 Aug 2020 21:29:39 +0200
Subject: [PATCH] Process many images in parallel

Provide one object per NN layer and implement their functionality
separately, like in
https://www.kdnuggets.com/2019/08/numpy-neural-networks-computational-graphs.html

Each layer now takes not just a single image vector, but a whole batch of
10,000 of them, which massively speeds up the computation -- much less time
is spent iterating in Python.
---
 README.md |  18 +++++++
 nnet.py   | 126 ++++++++++++++++++++++++++++++++++++++++++++
 train.py  | 154 +++++++++--------------------------------------------
 3 files changed, 168 insertions(+), 130 deletions(-)
 create mode 100644 nnet.py

diff --git a/README.md b/README.md
index 760002f..3af0686 100644
--- a/README.md
+++ b/README.md
@@ -6,6 +6,7 @@ Basics:
  - [Neuron](https://en.wikipedia.org/wiki/Artificial_neuron)
  - [Perceptron](https://en.wikipedia.org/wiki/Perceptron)
  - [Backpropagation](https://en.wikipedia.org/wiki/Backpropagation)
+ - [Understanding & Creating Neural Networks with Computational Graphs from Scratch](https://www.kdnuggets.com/2019/08/numpy-neural-networks-computational-graphs.html)
  - [3Blue1Brown video series](https://www.youtube.com/playlist?list=PLZHQObOWTQDNU6R1_67000Dx_ZCJB-3pi)
 
 Too high-level for first-time learning, but apparently very abstract and powerful for real-life:
@@ -67,3 +68,20 @@ real 0m37.927s
 user 1m19.103s
 sys 1m10.169s
 ```
+
+ - This is way too slow. I found an [interesting approach](https://www.kdnuggets.com/2019/08/numpy-neural-networks-computational-graphs.html) that harnesses the power of numpy by doing the computations for lots of images in parallel, instead of spending a lot of time in Python iterating over tens of thousands of examples. Now the accuracy computation takes only negligible time instead of 6 seconds, and each round of training takes less than a second:
+```
+$ time ./train.py
+output vector of first image: [0.50863223 0.50183558 0.50357349 0.50056673 0.50285531 0.5043152
+ 0.51588292 0.49403 0.5030618 0.51006963]
+classification of first image: 6 with confidence 0.5158829224337754; real label 7
+correctly recognized images after initialization: 9.58%
+cost after training round 0: 1.0462266880961681
+[...]
+cost after training round 99: 0.4499245817840479
+correctly recognized images after training: 11.35%
+
+real 1m51.520s
+user 4m23.863s
+sys 2m31.686s
+```
diff --git a/nnet.py b/nnet.py
new file mode 100644
index 0000000..72d6783
--- /dev/null
+++ b/nnet.py
@@ -0,0 +1,126 @@
+# based on https://www.kdnuggets.com/2019/08/numpy-neural-networks-computational-graphs.html
+import numpy as np
+
+# use a constant seed to keep things reproducible
+rg = np.random.default_rng(1)
+
+
+class LinearLayer:
+    '''
+    ini_type: initialization type for weight parameters: plain, xavier, or he
+    '''
+    def __init__(self, input_shape, n_out, ini_type="plain"):
+        self.m = input_shape[1]  # number of examples in training data
+
+        # initialize weights
+        n_in = input_shape[0]
+        if ini_type == 'plain':
+            self.W = rg.standard_normal(size=(n_out, n_in)) * 0.01  # set weights 'W' to small random gaussian
+        elif ini_type == 'xavier':
+            self.W = rg.standard_normal(size=(n_out, n_in)) / (np.sqrt(n_in))  # set variance of W to 1/n
+        elif ini_type == 'he':
+            # Good when ReLU used in hidden layers
+            # Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification
+            # Kaiming He et al. (https://arxiv.org/abs/1502.01852)
+            # http://cs231n.github.io/neural-networks-2/#init
+            self.W = rg.standard_normal(size=(n_out, n_in)) * np.sqrt(2/n_in)  # set variance of W to 2/n
+
+        self.b = np.zeros((n_out, 1))
+        self.Z = np.zeros((self.W.shape[0], input_shape[1]))
+
+    def forward(self, A_prev):
+        self.A_prev = A_prev
+        self.Z = self.W @ self.A_prev + self.b
+        return self.Z
+
+    def backward(self, upstream_grad):
+        # derivative of Cost w.r.t W
+        self.dW = upstream_grad @ self.A_prev.T
+        # derivative of Cost w.r.t b, sum across rows
+        self.db = np.sum(upstream_grad, axis=1, keepdims=True)
+        # derivative of Cost w.r.t A_prev
+        self.dA_prev = self.W.T @ upstream_grad
+        return self.dA_prev
+
+    def update_params(self, learning_rate=0.1):
+        self.W -= learning_rate * self.dW
+        self.b -= learning_rate * self.db
+
+
+class SigmoidLayer:
+    def __init__(self, shape):
+        self.A = np.zeros(shape)
+
+    def forward(self, Z):
+        self.A = 1 / (1 + np.exp(-Z))  # compute activations
+        return self.A
+
+    def backward(self, upstream_grad):
+        # couple upstream gradient with local gradient, the result will be sent back to the Linear layer
+        self.dZ = upstream_grad * self.A * (1 - self.A)
+        return self.dZ
+
+    def update_params(self, learning_rate=0.1):
+        pass
+
+
+def label_vectors(labels, n):
+    y = np.zeros((n, labels.size))
+    for i, l in enumerate(labels):
+        y[l][i] = 1.0
+    return y
+
+
+def forward(layers, X):
+    assert X.shape[1] == layers[0].m, f'input length {X.shape[1]} does not match first layer width {layers[0].m}'
+    cur = X
+    for layer in layers:
+        cur = layer.forward(cur)
+    return cur
+
+
+def classify(y):
+    # the recognized digit is the index of the highest-valued output neuron
+    return np.argmax(y, axis=0), np.max(y, axis=0)
+
+
+def accuracy(layers, X, labels):
+    '''Count percentage of test inputs which are being recognized correctly'''
+
+    assert X.shape[1] == layers[0].m, f'input length {X.shape[1]} does not match first layer width {layers[0].m}'
+    assert layers[0].m == labels.size, f'first layer width {layers[0].m} does not match number of labels {labels.size}'
+    output = forward(layers, X)
+    classes = classify(output)[0]
+    return 100 * (np.sum(classes == labels) / classes.size)
+
+
+def cost_sqe(Y, output):
+    '''
+    This function computes and returns the Cost and its derivative.
+    It uses the Squared Error Cost function -> (1/2m)*sum(Y - output)^2
+    Args:
+        Y: label vectors of data
+        output: Predictions (activations) from the last layer, the output layer
+    Returns:
+        cost: The Squared Error Cost result
+        dOutput: gradient of Cost w.r.t the output
+    '''
+    m = Y.shape[1]
+
+    cost = (1 / (2 * m)) * np.sum(np.square(Y - output))
+    cost = np.squeeze(cost)  # remove extraneous dimensions to give just a scalar
+
+    dOutput = -1 / m * (Y - output)  # derivative of the squared error cost function
+    return cost, dOutput
+
+
+def train(layers, X, Y, learning_rate=0.1, cost_fn=cost_sqe):
+    output = forward(layers, X)
+    cost, dOutput = cost_fn(Y, output)
+
+    cur = dOutput
+    for layer in reversed(layers):
+        cur = layer.backward(cur)
+        layer.update_params(learning_rate)
+
+    return cost
diff --git a/train.py b/train.py
index b29effa..012e2c7 100755
--- a/train.py
+++ b/train.py
@@ -1,141 +1,35 @@
 #!/usr/bin/python3
-import numpy as np
-
-
 import mnist
-
-# use a constant seed to keep things reproducible
-rg = np.random.default_rng(1)
-
-# transfer functions
-
-# https://en.wikipedia.org/wiki/Sigmoid_function
-# classic, differentiable, apparently worse for training
-def sigmoid(x):
-    return 1 / (1 + np.exp(-x))
-
-
-def sigmoid_prime(x):
-    return sigmoid(x) * (1 - sigmoid(x))
-
-
-# https://en.wikipedia.org/wiki/Rectifier_(neural_networks)
-# mostly preferred these days, not differentiable at 0, but slope can be defined arbitrarily as 0 or 1 at 0
-def reLU(x):
-    return np.maximum(x, 0)
-
-
-def reLU_prime(x):
-    return np.heaviside(x, 1)
-
+import nnet
 
 train_images, train_labels, rows, cols = mnist.load('train-images-idx3-ubyte', 'train-labels-idx1-ubyte')
 test_images, test_labels, rows2, cols2 = mnist.load('t10k-images-idx3-ubyte', 't10k-labels-idx1-ubyte')
 assert rows == rows2
 assert cols == cols2
 
+num_train = train_images.shape[1]
+nnet_batch = 10000
+
 # neural network structure: two hidden layers, one output layer
-SIZES = (rows * cols, 20, 16, 10)
-NUM_LAYERS = len(SIZES)
-
-# initialize weight matrices and bias vectors with random numbers
-weights = []
-biases = []
-for i in range(1, NUM_LAYERS):
-    weights.append(rg.normal(size=(SIZES[i], SIZES[i-1])))
-    biases.append(rg.normal(scale=10, size=SIZES[i]))
-
-
-def feed_forward(x, transfer=sigmoid):
-    '''Compute all z and output vectors for given input vector'''
-
-    a_s = [x]
-    z_s = []
-    for w, b in zip(weights, biases):
-        x = w @ x + b
-        z_s.append(x)
-        a_s.append(transfer(x))
-    return (z_s, a_s)
-
-
-def classify(y):
-    # the recognized digit is the index of the highest-valued output neuron
-    return np.argmax(y), np.max(y)
-
-
-def cost_grad(x, target_y, transfer=sigmoid, transfer_prime=sigmoid_prime):
-    '''Return (∂C/∂w, ∂C/∂b) for a particular input and desired output vector'''
-
-    # forward pass, remember all z vectors and activations for every layer
-    z_s, a_s = feed_forward(x, transfer)
-
-    # backward pass
-    deltas = [None] * len(weights)  # delta = dC/dz error for each layer
-    # insert the last layer error
-    deltas[-1] = transfer_prime(z_s[-1]) * 2 * (a_s[-1] - target_y)
-    for i in reversed(range(len(deltas) - 1)):
-        deltas[i] = (weights[i + 1].T @ deltas[i + 1]) * transfer_prime(z_s[i])
-
-    dw = [d @ a_s[i+1] for i, d in enumerate(deltas)]
-    db = deltas
-    return dw, db
-
-
-def label_vector(label):
-    x = np.zeros(10)
-    x[label] = 1.0
-    return x
-
-
-def backpropagate(image_batch, label_batch, eta):
-    '''Update NN with gradient descent and backpropagation to a batch of inputs
-
-    eta is the learning rate.
-    '''
-    global weights, biases
-
-    num_images = image_batch.shape[1]
-    for i in range(num_images):
-        y = label_vector(label_batch[i])
-        dws, dbs = cost_grad(image_batch[:, i], y)
-        weights = [w + eta * dw for w, dw in zip(weights, dws)]
-        biases = [b + eta * db for b, db in zip(biases, dbs)]
-
-
-def train(images, labels, eta, batch_size=100):
-    '''Do backpropagation for smaller batches
-
-    This greatly speeds up the learning process, at the expense of finding a more erratic path to the local minimum.
-    '''
-    num_images = images.shape[1]
-    offset = 0
-    while offset < num_images:
-        images_batch = images[:, offset:offset + batch_size]
-        labels_batch = labels[offset:offset + batch_size]
-        backpropagate(images_batch, labels_batch, eta)
-        offset += batch_size
-
-
-def test():
-    """Count percentage of test inputs which are being recognized correctly"""
-
-    good = 0
-    num_images = test_images.shape[1]
-    for i in range(num_images):
-        # the recognized digit is the index of the highest-valued output neuron
-        y = classify(feed_forward(test_images[:, i])[1][-1])[0]
-        good += int(y == test_labels[i])
-    return 100 * (good / num_images)
-
-
-res = feed_forward(test_images[:, 0])
-print(f'output vector of first image: {res[1][-1]}')
-digit, conf = classify(res[1][-1])
+# (input)--> [Linear->Sigmoid] -> [Linear->Sigmoid] -> [Linear->Sigmoid] -->(output)
+# handle 10,000 vectors at a time
+Z1 = nnet.LinearLayer(input_shape=(rows * cols, nnet_batch), n_out=20)
+A1 = nnet.SigmoidLayer(Z1.Z.shape)
+Z2 = nnet.LinearLayer(input_shape=A1.A.shape, n_out=16)
+A2 = nnet.SigmoidLayer(Z2.Z.shape)
+ZO = nnet.LinearLayer(input_shape=A2.A.shape, n_out=10)
+AO = nnet.SigmoidLayer(ZO.Z.shape)
+net = (Z1, A1, Z2, A2, ZO, AO)
+
+res = nnet.forward(net, train_images[:, 0:10000])
+print(f'output vector of first image: {res[:, 0]}')
+digit, conf = nnet.classify(res[:, 0])
 print(f'classification of first image: {digit} with confidence {conf}; real label {test_labels[0]}')
-print(f'correctly recognized images after initialization: {test()}%')
-
-for i in range(1):
-    print(f"round #{i} of learning...")
-    train(test_images, test_labels, 1)
-
-print(f'correctly recognized images: {test()}%')
+print(f'correctly recognized images after initialization: {nnet.accuracy(net, test_images, test_labels)}%')
+
+train_y = nnet.label_vectors(train_labels, 10)
+for i in range(100):
+    for batch in range(0, num_train, nnet_batch):
+        cost = nnet.train(net, train_images[:, batch:(batch + nnet_batch)], train_y[:, batch:(batch + nnet_batch)])
+    print(f'cost after training round {i}: {cost}')
+print(f'correctly recognized images after training: {nnet.accuracy(net, test_images, test_labels)}%')
-- 
2.39.2