Process many images in parallel
diff --git a/nnet.py b/nnet.py
new file mode 100644 (file)
index 0000000..72d6783
--- /dev/null
+++ b/nnet.py
@@ -0,0 +1,126 @@
+# based on https://www.kdnuggets.com/2019/08/numpy-neural-networks-computational-graphs.html
+import numpy as np
+
+# use a constant seed to keep things reproducible
+rg = np.random.default_rng(1)
+
+
+class LinearLayer:
+    '''
+    input_shape: (n_in, m): number of inputs per example and number of examples in the training batch
+    n_out: number of neurons in this layer
+    ini_type: initialization type for weight parameters: plain, xavier, or he
+    '''
+    def __init__(self, input_shape, n_out, ini_type="plain"):
+        self.m = input_shape[1]  # number of examples in training data
+
+        # initialize weights
+        n_in = input_shape[0]
+        if ini_type == 'plain':
+            self.W = rg.standard_normal(size=(n_out, n_in)) * 0.01  # set weights 'W' to small random Gaussian values
+        elif ini_type == 'xavier':
+            self.W = rg.standard_normal(size=(n_out, n_in)) / (np.sqrt(n_in))  # set variance of W to 1/n
+        elif ini_type == 'he':
+            # Good when ReLU is used in the hidden layers
+            # "Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification",
+            # Kaiming He et al. (https://arxiv.org/abs/1502.01852)
+            # http://cs231n.github.io/neural-networks-2/#init
+            self.W = rg.standard_normal(size=(n_out, n_in)) * np.sqrt(2/n_in)  # set variance of W to 2/n
+        else:
+            raise ValueError(f"unknown ini_type '{ini_type}'")
+
+        self.b = np.zeros((n_out, 1))
+        self.Z = np.zeros((self.W.shape[0], input_shape[1]))
+
+    def forward(self, A_prev):
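+        # cache the input; backward() needs it to compute dW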
+        self.A_prev = A_prev
+        self.Z = self.W @ self.A_prev + self.b
+        return self.Z
+
+    def backward(self, upstream_grad):
+        # derivative of Cost w.r.t W
+        self.dW = upstream_grad @ self.A_prev.T
+        # derivative of Cost w.r.t b, sum across rows
+        self.db = np.sum(upstream_grad, axis=1, keepdims=True)
+        # derivative of Cost w.r.t A_prev
+        self.dA_prev = self.W.T @ upstream_grad
+        return self.dA_prev
+
+    def update_params(self, learning_rate=0.1):
+        self.W -= learning_rate * self.dW
+        self.b -= learning_rate * self.db
+
+
+class SigmoidLayer:
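+    '''Element-wise sigmoid activation: A = 1 / (1 + exp(-Z))'''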
+    def __init__(self, shape):
+        self.A = np.zeros(shape)
+
+    def forward(self, Z):
+        self.A = 1 / (1 + np.exp(-Z))  # compute activations
+        return self.A
+
+    def backward(self, upstream_grad):
+        # combine the upstream gradient with the local gradient A * (1 - A); the result is passed back to the preceding linear layer
+        self.dZ = upstream_grad * self.A * (1 - self.A)
+        return self.dZ
+
+    def update_params(self, learning_rate=0.1):
+        pass
+
+
+def label_vectors(labels, n):
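+    '''Convert an array of integer labels into one-hot column vectors of shape (n, len(labels))'''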
+    y = np.zeros((n, labels.size))
+    for i, l in enumerate(labels):
+        y[l][i] = 1.0
+    return y
+
+
+def forward(layers, X):
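+    '''Run a forward pass through all layers and return the output of the last layer'''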
+    assert X.shape[1] == layers[0].m, f'number of input examples {X.shape[1]} does not match first layer batch size {layers[0].m}'
+    cur = X
+    for layer in layers:
+        cur = layer.forward(cur)
+    return cur
+
+
+def classify(y):
+    # the recognized digit is the index of the highest-valued output neuron
+    return np.argmax(y, axis=0), np.max(y, axis=0)
+
+
+def accuracy(layers, X, labels):
+    '''Return the percentage of test inputs that are recognized correctly'''
+
+    assert X.shape[1] == layers[0].m, f'number of input examples {X.shape[1]} does not match first layer batch size {layers[0].m}'
+    assert layers[0].m == labels.size, f'first layer batch size {layers[0].m} does not match number of labels {labels.size}'
+    output = forward(layers, X)
+    classes = classify(output)[0]
+    return 100 * (np.sum(classes == labels) / classes.size)
+
+
+def cost_sqe(Y, output):
+    '''
+    This function computes and returns the Cost and its derivative.
+    This function uses the squared error cost function: (1/(2m)) * sum((Y - output)^2)
+    Args:
+        Y: label vectors of data
+        output: predictions (activations) from the last layer, i.e. the output layer
+    Returns:
+        cost: The Squared Error Cost result
+        dOutput: gradient of Cost w.r.t the output
+    '''
+    m = Y.shape[1]
+
+    cost = (1 / (2 * m)) * np.sum(np.square(Y - output))
+    cost = np.squeeze(cost)  # remove extraneous dimensions to give just a scalar
+
+    dOutput = -1 / m * (Y - output)  # derivative of the squared error cost function
+    return cost, dOutput
+
+
+def train(layers, X, Y, learning_rate=0.1, cost_fn=cost_sqe):
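+    '''Run one training step: forward pass, cost computation, backpropagation, and parameter update; returns the cost'''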
+    output = forward(layers, X)
+    cost, dOutput = cost_fn(Y, output)
+
+    cur = dOutput
+    for layer in reversed(layers):
+        cur = layer.backward(cur)
+        layer.update_params(learning_rate)
+
+    return cost
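
The layers above compose into a complete network by stacking LinearLayer and SigmoidLayer objects and passing batches with one example per column. Below is a minimal usage sketch, not part of this commit: it assumes the file is importable as `nnet`, uses random stand-in data instead of real images, and picks hypothetical layer sizes.

import numpy as np
import nnet

n_in, n_hidden, n_out, m = 784, 30, 10, 100   # hypothetical sizes: pixels, hidden neurons, digit classes, batch size

rng = np.random.default_rng(0)
X = rng.random((n_in, m))                      # stand-in input batch, one example per column
labels = rng.integers(0, n_out, size=m)        # stand-in integer class labels
Y = nnet.label_vectors(labels, n_out)          # one-hot label columns

lin1 = nnet.LinearLayer(X.shape, n_hidden, ini_type='he')
act1 = nnet.SigmoidLayer(lin1.Z.shape)
lin2 = nnet.LinearLayer(lin1.Z.shape, n_out, ini_type='he')
act2 = nnet.SigmoidLayer(lin2.Z.shape)
layers = [lin1, act1, lin2, act2]

for i in range(1000):
    cost = nnet.train(layers, X, Y, learning_rate=0.1)

print(f'cost {cost:.4f}, training accuracy {nnet.accuracy(layers, X, labels):.1f}%')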