From 1de3cdb5ecba32a8a3b0a02bbf71e883383a689d Mon Sep 17 00:00:00 2001
From: Martin Pitt
Date: Sat, 29 Aug 2020 21:29:39 +0200
Subject: [PATCH] Process many images in parallel

Provide one object per NN layer and implement their functionality
separately, like in
https://www.kdnuggets.com/2019/08/numpy-neural-networks-computational-graphs.html

Each layer now takes not just a single image vector, but a whole batch of
10,000 of them, which massively speeds up the computation -- much less time
is spent iterating in Python.
---
 README.md |  18 +++++++
 nnet.py   | 126 ++++++++++++++++++++++++++++++++++++++++++++
 train.py  | 154 +++++++++--------------------------------------------
 3 files changed, 168 insertions(+), 130 deletions(-)
 create mode 100644 nnet.py

diff --git a/README.md b/README.md
index 760002f..3af0686 100644
--- a/README.md
+++ b/README.md
@@ -6,6 +6,7 @@ Basics:
  - [Neuron](https://en.wikipedia.org/wiki/Artificial_neuron)
  - [Perceptron](https://en.wikipedia.org/wiki/Perceptron)
  - [Backpropagation](https://en.wikipedia.org/wiki/Backpropagation)
+ - [Understanding & Creating Neural Networks with Computational Graphs from Scratch](https://www.kdnuggets.com/2019/08/numpy-neural-networks-computational-graphs.html)
  - [3Blue1Brown video series](https://www.youtube.com/playlist?list=PLZHQObOWTQDNU6R1_67000Dx_ZCJB-3pi)
 
 Too high-level for first-time learning, but apparently very abstract and powerful for real-life:
@@ -67,3 +68,20 @@ real 0m37.927s
 user 1m19.103s
 sys 1m10.169s
 ```
+
+ - This is way too slow. I found an [interesting approach](https://www.kdnuggets.com/2019/08/numpy-neural-networks-computational-graphs.html) that harnesses the power of numpy by doing the computations for lots of images in parallel, instead of spending a lot of time in Python iterating over tens of thousands of examples. Now the accuracy computation takes only negligible time instead of 6 seconds, and each round of training takes less than a second:
+```
+$ time ./train.py
+output vector of first image: [0.50863223 0.50183558 0.50357349 0.50056673 0.50285531 0.5043152
+ 0.51588292 0.49403 0.5030618 0.51006963]
+classification of first image: 6 with confidence 0.5158829224337754; real label 7
+correctly recognized images after initialization: 9.58%
+cost after training round 0: 1.0462266880961681
+[...]
+cost after training round 99: 0.4499245817840479
+correctly recognized images after training: 11.35%
+
+real 1m51.520s
+user 4m23.863s
+sys 2m31.686s
+```
diff --git a/nnet.py b/nnet.py
new file mode 100644
index 0000000..72d6783
--- /dev/null
+++ b/nnet.py
@@ -0,0 +1,126 @@
+# based on https://www.kdnuggets.com/2019/08/numpy-neural-networks-computational-graphs.html
+import numpy as np
+
+# use a constant seed to keep things reproducible
+rg = np.random.default_rng(1)
+
+
+class LinearLayer:
+    '''
+    ini_type: initialization type for weight parameters: plain, xavier, or he
+    '''
+    def __init__(self, input_shape, n_out, ini_type="plain"):
+        self.m = input_shape[1]  # number of examples in training data
+
+        # initialize weights
+        n_in = input_shape[0]
+        if ini_type == 'plain':
+            self.W = rg.standard_normal(size=(n_out, n_in)) * 0.01  # set weights 'W' to small random gaussian
+        elif ini_type == 'xavier':
+            self.W = rg.standard_normal(size=(n_out, n_in)) / (np.sqrt(n_in))  # set variance of W to 1/n
+        elif ini_type == 'he':
+            # Good when ReLU used in hidden layers
+            # Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification
+            # Kaiming He et al. (https://arxiv.org/abs/1502.01852)
+            # http://cs231n.github.io/neural-networks-2/#init
+            self.W = rg.standard_normal(size=(n_out, n_in)) * np.sqrt(2/n_in)  # set variance of W to 2/n
+
+        self.b = np.zeros((n_out, 1))
+        self.Z = np.zeros((self.W.shape[0], input_shape[1]))
+
+    def forward(self, A_prev):
+        self.A_prev = A_prev
+        self.Z = self.W @ self.A_prev + self.b
+        return self.Z
+
+    def backward(self, upstream_grad):
+        # derivative of Cost w.r.t W
+        self.dW = upstream_grad @ self.A_prev.T
+        # derivative of Cost w.r.t b, sum across rows
+        self.db = np.sum(upstream_grad, axis=1, keepdims=True)
+        # derivative of Cost w.r.t A_prev
+        self.dA_prev = self.W.T @ upstream_grad
+        return self.dA_prev
+
+    def update_params(self, learning_rate=0.1):
+        self.W -= learning_rate * self.dW
+        self.b -= learning_rate * self.db
+
+
+class SigmoidLayer:
+    def __init__(self, shape):
+        self.A = np.zeros(shape)
+
+    def forward(self, Z):
+        self.A = 1 / (1 + np.exp(-Z))  # compute activations
+        return self.A
+
+    def backward(self, upstream_grad):
+        # couple upstream gradient with local gradient, the result will be sent back to the Linear layer
+        self.dZ = upstream_grad * self.A * (1 - self.A)
+        return self.dZ
+
+    def update_params(self, learning_rate=0.1):
+        pass
+
+
+def label_vectors(labels, n):
+    y = np.zeros((n, labels.size))
+    for i, l in enumerate(labels):
+        y[l][i] = 1.0
+    return y
+
+
+def forward(layers, X):
+    assert X.shape[1] == layers[0].m, f'input length {X.shape[1]} does not match first layer width {layers[0].m}'
+    cur = X
+    for layer in layers:
+        cur = layer.forward(cur)
+    return cur
+
+
+def classify(y):
+    # the recognized digit is the index of the highest-valued output neuron
+    return np.argmax(y, axis=0), np.max(y, axis=0)
+
+
+def accuracy(layers, X, labels):
+    '''Count percentage of test inputs which are being recognized correctly'''
+
+    assert X.shape[1] == layers[0].m, f'input length {X.shape[1]} does not match first layer width {layers[0].m}'
+    assert layers[0].m == labels.size, f'first layer width {layers[0].m} does not match number of labels {labels.size}'
+    output = forward(layers, X)
+    classes = classify(output)[0]
+    return 100 * (np.sum(classes == labels) / classes.size)
+
+
+def cost_sqe(Y, output):
+    '''
+    This function computes and returns the Cost and its derivative.
+    It uses the Squared Error Cost function -> (1/2m)*sum(Y - output)^2
+    Args:
+        Y: label vectors of data
+        output: Predictions (activations) from the last layer, the output layer
+    Returns:
+        cost: The Squared Error Cost result
+        dOutput: gradient of Cost w.r.t the output
+    '''
+    m = Y.shape[1]
+
+    cost = (1 / (2 * m)) * np.sum(np.square(Y - output))
+    cost = np.squeeze(cost)  # remove extraneous dimensions to give just a scalar
+
+    dOutput = -1 / m * (Y - output)  # derivative of the squared error cost function
+    return cost, dOutput
+
+
+def train(layers, X, Y, learning_rate=0.1, cost_fn=cost_sqe):
+    output = forward(layers, X)
+    cost, dOutput = cost_fn(Y, output)
+
+    cur = dOutput
+    for layer in reversed(layers):
+        cur = layer.backward(cur)
+        layer.update_params(learning_rate)
+
+    return cost
diff --git a/train.py b/train.py
index b29effa..012e2c7 100755
--- a/train.py
+++ b/train.py
@@ -1,141 +1,35 @@
 #!/usr/bin/python3
-import numpy as np
-
-
 import mnist
-
-# use a constant seed to keep things reproducible
-rg = np.random.default_rng(1)
-
-# transfer functions
-
-# https://en.wikipedia.org/wiki/Sigmoid_function
-# classic, differentiable, apparently worse for training
-def sigmoid(x):
-    return 1 / (1 + np.exp(-x))
-
-
-def sigmoid_prime(x):
-    return sigmoid(x) * (1 - sigmoid(x))
-
-
-# https://en.wikipedia.org/wiki/Rectifier_(neural_networks)
-# mostly preferred these days, not differentiable at 0, but slope can be defined arbitrarily as 0 or 1 at 0
-def reLU(x):
-    return np.maximum(x, 0)
-
-
-def reLU_prime(x):
-    return np.heaviside(x, 1)
-
+import nnet
 
 train_images, train_labels, rows, cols = mnist.load('train-images-idx3-ubyte', 'train-labels-idx1-ubyte')
 test_images, test_labels, rows2, cols2 = mnist.load('t10k-images-idx3-ubyte', 't10k-labels-idx1-ubyte')
 assert rows == rows2
 assert cols == cols2
 
+num_train = train_images.shape[1]
+nnet_batch = 10000
+
 # neural network structure: two hidden layers, one output layer
-SIZES = (rows * cols, 20, 16, 10)
-NUM_LAYERS = len(SIZES)
-
-# initialize weight matrices and bias vectors with random numbers
-weights = []
-biases = []
-for i in range(1, NUM_LAYERS):
-    weights.append(rg.normal(size=(SIZES[i], SIZES[i-1])))
-    biases.append(rg.normal(scale=10, size=SIZES[i]))
-
-
-def feed_forward(x, transfer=sigmoid):
-    '''Compute all z and output vectors for given input vector'''
-
-    a_s = [x]
-    z_s = []
-    for w, b in zip(weights, biases):
-        x = w @ x + b
-        z_s.append(x)
-        a_s.append(transfer(x))
-    return (z_s, a_s)
-
-
-def classify(y):
-    # the recognized digit is the index of the highest-valued output neuron
-    return np.argmax(y), np.max(y)
-
-
-def cost_grad(x, target_y, transfer=sigmoid, transfer_prime=sigmoid_prime):
-    '''Return (∂C/∂w, ∂C/∂b) for a particular input and desired output vector'''
-
-    # forward pass, remember all z vectors and activations for every layer
-    z_s, a_s = feed_forward(x, transfer)
-
-    # backward pass
-    deltas = [None] * len(weights)  # delta = dC/dz error for each layer
-    # insert the last layer error
-    deltas[-1] = transfer_prime(z_s[-1]) * 2 * (a_s[-1] - target_y)
-    for i in reversed(range(len(deltas) - 1)):
-        deltas[i] = (weights[i + 1].T @ deltas[i + 1]) * transfer_prime(z_s[i])
-
-    dw = [d @ a_s[i+1] for i, d in enumerate(deltas)]
-    db = deltas
-    return dw, db
-
-
-def label_vector(label):
-    x = np.zeros(10)
-    x[label] = 1.0
-    return x
-
-
-def backpropagate(image_batch, label_batch, eta):
-    '''Update NN with gradient descent and backpropagation to a batch of inputs
-
-    eta is the learning rate.
-    '''
-    global weights, biases
-
-    num_images = image_batch.shape[1]
-    for i in range(num_images):
-        y = label_vector(label_batch[i])
-        dws, dbs = cost_grad(image_batch[:, i], y)
-        weights = [w + eta * dw for w, dw in zip(weights, dws)]
-        biases = [b + eta * db for b, db in zip(biases, dbs)]
-
-
-def train(images, labels, eta, batch_size=100):
-    '''Do backpropagation for smaller batches
-
-    This greatly speeds up the learning process, at the expense of finding a more erratic path to the local minimum.
-    '''
-    num_images = images.shape[1]
-    offset = 0
-    while offset < num_images:
-        images_batch = images[:, offset:offset + batch_size]
-        labels_batch = labels[offset:offset + batch_size]
-        backpropagate(images_batch, labels_batch, eta)
-        offset += batch_size
-
-
-def test():
-    """Count percentage of test inputs which are being recognized correctly"""
-
-    good = 0
-    num_images = test_images.shape[1]
-    for i in range(num_images):
-        # the recognized digit is the index of the highest-valued output neuron
-        y = classify(feed_forward(test_images[:, i])[1][-1])[0]
-        good += int(y == test_labels[i])
-    return 100 * (good / num_images)
-
-
-res = feed_forward(test_images[:, 0])
-print(f'output vector of first image: {res[1][-1]}')
-digit, conf = classify(res[1][-1])
+# (input)--> [Linear->Sigmoid] -> [Linear->Sigmoid] -> [Linear->Sigmoid] -->(output)
+# handle 10,000 vectors at a time
+Z1 = nnet.LinearLayer(input_shape=(rows * cols, nnet_batch), n_out=20)
+A1 = nnet.SigmoidLayer(Z1.Z.shape)
+Z2 = nnet.LinearLayer(input_shape=A1.A.shape, n_out=16)
+A2 = nnet.SigmoidLayer(Z2.Z.shape)
+ZO = nnet.LinearLayer(input_shape=A2.A.shape, n_out=10)
+AO = nnet.SigmoidLayer(ZO.Z.shape)
+net = (Z1, A1, Z2, A2, ZO, AO)
+
+res = nnet.forward(net, train_images[:, 0:10000])
+print(f'output vector of first image: {res[:, 0]}')
+digit, conf = nnet.classify(res[:, 0])
 print(f'classification of first image: {digit} with confidence {conf}; real label {test_labels[0]}')
-print(f'correctly recognized images after initialization: {test()}%')
-
-for i in range(1):
-    print(f"round #{i} of learning...")
-    train(test_images, test_labels, 1)
-
-print(f'correctly recognized images: {test()}%')
+print(f'correctly recognized images after initialization: {nnet.accuracy(net, test_images, test_labels)}%')
+
+train_y = nnet.label_vectors(train_labels, 10)
+for i in range(100):
+    for batch in range(0, num_train, nnet_batch):
+        cost = nnet.train(net, train_images[:, batch:(batch + nnet_batch)], train_y[:, batch:(batch + nnet_batch)])
+    print(f'cost after training round {i}: {cost}')
+print(f'correctly recognized images after training: {nnet.accuracy(net, test_images, test_labels)}%')
-- 
2.39.2