train.py

   1 #!/usr/bin/python3
   2
   3 import numpy as np
   4
   5 import mnist
   6
   7 # use a constant seed to keep things reproducible
   8 rg = np.random.default_rng(1)
   9
  10 # transfer functions
  11
  12 # https://en.wikipedia.org/wiki/Sigmoid_function
  13 # classic, differentiable, apparently worse for training
  14 def sigmoid(x):
  15     return 1 / (1 + np.exp(-x))
  16
  17
  18 def sigmoid_prime(x):
  19     return sigmoid(x) * (1 - sigmoid(x))
  20
  21
  22 # https://en.wikipedia.org/wiki/Rectifier_(neural_networks)
  23 # mostly preferred these days, not differentiable at 0, but slope can be defined arbitrarily as 0 or 1 at 0
  24 def reLU(x):
  25     return np.maximum(x, 0)
  26
  27
  28 def reLU_prime(x):
  29     return np.heaviside(x, 1)
  30
  31
  32 train_images, train_labels, rows, cols = mnist.load('train-images-idx3-ubyte', 'train-labels-idx1-ubyte')
  33 test_images, test_labels, rows2, cols2 = mnist.load('t10k-images-idx3-ubyte', 't10k-labels-idx1-ubyte')
  34 assert rows == rows2
  35 assert cols == cols2
  36
  37 # neural network structure: two hidden layers, one output layer
  38 SIZES = (rows * cols, 20, 16, 10)
  39 NUM_LAYERS = len(SIZES)
  40
  41 # initialize weight matrices and bias vectors with random numbers
  42 weights = []
  43 biases = []
  44 for i in range(1, NUM_LAYERS):
  45     weights.append(rg.normal(size=(SIZES[i], SIZES[i-1])))
  46     biases.append(rg.normal(scale=10, size=SIZES[i]))
  47
  48
  49 def feed_forward(x, transfer=sigmoid):
  50     '''Compute all z and output vectors for given input vector'''
  51
  52     a_s = [x]
  53     z_s = []
  54     for w, b in zip(weights, biases):
  55         x = w @ x + b
  56         z_s.append(x)
  57         a_s.append(transfer(x))
  58     return (z_s, a_s)
  59
  60
  61 def classify(y):
  62     # the recognized digit is the index of the highest-valued output neuron
  63     return np.argmax(y), np.max(y)
  64
  65
  66 def cost_grad(x, target_y, transfer=sigmoid, transfer_prime=sigmoid_prime):
  67     '''Return (∂C/∂w, ∂C/∂b) for a particular input and desired output vector'''
  68
  69     # forward pass, remember all z vectors and activations for every layer
  70     z_s, a_s = feed_forward(x, transfer)
  71
  72     # backward pass
  73     deltas = [None] * len(weights)  # delta = dC/dz error for each layer
  74     # insert the last layer error
  75     deltas[-1] = transfer_prime(z_s[-1]) * 2 * (a_s[-1] - target_y)
  76     for i in reversed(range(len(deltas) - 1)):
  77         deltas[i] = (weights[i + 1].T @ deltas[i + 1]) * transfer_prime(z_s[i])
  78
  79     dw = [d @ a_s[i+1] for i, d in enumerate(deltas)]
  80     db = deltas
  81     return dw, db
  82
  83
  84 def label_vector(label):
  85     x = np.zeros(10)
  86     x[label] = 1.0
  87     return x
  88
  89
  90 def backpropagate(image_batch, label_batch, eta):
  91     '''Update NN with gradient descent and backpropagation to a batch of inputs
  92
  93     eta is the learning rate.
  94     '''
  95     global weights, biases
  96
  97     num_images = image_batch.shape[1]
  98     for i in range(num_images):
  99         y = label_vector(label_batch[i])
 100         dws, dbs = cost_grad(image_batch[:, i], y)
 101         weights = [w + eta * dw for w, dw in zip(weights, dws)]
 102         biases = [b + eta * db for b, db in zip(biases, dbs)]
 103
 104
 105 def train(images, labels, eta, batch_size=100):
 106     '''Do backpropagation for smaller batches
 107
 108     This greatly speeds up the learning process, at the expense of finding a more erratic path to the local minimum.
 109     '''
 110     num_images = images.shape[1]
 111     offset = 0
 112     while offset < num_images:
 113         images_batch = images[:, offset:offset + batch_size]
 114         labels_batch = labels[offset:offset + batch_size]
 115         backpropagate(images_batch, labels_batch, eta)
 116         offset += batch_size
 117
 118
 119 def test():
 120     """Count percentage of test inputs which are being recognized correctly"""
 121
 122     good = 0
 123     num_images = test_images.shape[1]
 124     for i in range(num_images):
 125         # the recognized digit is the index of the highest-valued output neuron
 126         y = classify(feed_forward(test_images[:, i])[1][-1])[0]
 127         good += int(y == test_labels[i])
 128     return 100 * (good / num_images)
 129
 130
 131 res = feed_forward(test_images[:, 0])
 132 print(f'output vector of first image: {res[1][-1]}')
 133 digit, conf = classify(res[1][-1])
 134 print(f'classification of first image: {digit} with confidence {conf}; real label {test_labels[0]}')
 135 print(f'correctly recognized images after initialization: {test()}%')
 136
 137 for i in range(1):
 138     print(f"round #{i} of learning...")
 139     train(test_images, test_labels, 1)
 140
 141 print(f'correctly recognized images: {test()}%')