import numpy as np

# assumed local helper module that parses the raw MNIST IDX files
import mnist

# use a constant seed to keep things reproducible
rg = np.random.default_rng(1)

# https://en.wikipedia.org/wiki/Sigmoid_function
# classic and differentiable, but tends to train worse than ReLU
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def sigmoid_prime(x):
    return sigmoid(x) * (1 - sigmoid(x))

# https://en.wikipedia.org/wiki/Rectifier_(neural_networks)
# mostly preferred these days; not differentiable at 0, but the slope there
# can be defined arbitrarily as 0 or 1
def relu(x):
    return np.maximum(x, 0)

def relu_prime(x):
    return np.heaviside(x, 1)
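# quick check of the convention at 0 (slope 1 was chosen here):
#   np.heaviside(np.array([-1.0, 0.0, 2.0]), 1) -> array([0., 1., 1.])
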
train_images, train_labels, rows, cols = mnist.load('train-images-idx3-ubyte', 'train-labels-idx1-ubyte')
test_images, test_labels, rows2, cols2 = mnist.load('t10k-images-idx3-ubyte', 't10k-labels-idx1-ubyte')

# neural network structure: two hidden layers, one output layer
SIZES = (rows * cols, 20, 16, 10)
NUM_LAYERS = len(SIZES)

# initialize weight matrices and bias vectors with random numbers
weights = []
biases = []
for i in range(1, NUM_LAYERS):
    weights.append(rg.normal(size=(SIZES[i], SIZES[i-1])))
    biases.append(rg.normal(scale=10, size=SIZES[i]))
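# resulting parameter shapes for 28x28 MNIST images:
#   weights: (20, 784), (16, 20), (10, 16)
#   biases:  (20,), (16,), (10,)
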
def feed_forward(x, transfer=sigmoid):
    '''Compute all z and output vectors for given input vector'''
    z_s, a_s = [], [x]  # a_s[0] is the input vector itself
    for w, b in zip(weights, biases):
        x = w @ x + b
        z_s.append(x)
        a_s.append(transfer(x))
    return z_s, a_s

def classify(y):
    # the recognized digit is the index of the highest-valued output neuron
    return np.argmax(y), np.max(y)
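# for example, classify(np.array([0.1, 0.7, 0.2])) returns (1, 0.7):
# digit 1, confidence 0.7
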
def cost_grad(x, target_y, transfer=sigmoid, transfer_prime=sigmoid_prime):
    '''Return (∂C/∂w, ∂C/∂b) for a particular input and desired output vector'''
    # forward pass, remember all z vectors and activations for every layer
    z_s, a_s = feed_forward(x, transfer)

    # backward pass
    deltas = [None] * len(weights)  # delta = dC/dz error for each layer
    # insert the last layer error: the quadratic cost C = (a - y)² has
    # dC/da = 2 * (a - y)
    deltas[-1] = transfer_prime(z_s[-1]) * 2 * (a_s[-1] - target_y)
    for i in reversed(range(len(deltas) - 1)):
        deltas[i] = (weights[i + 1].T @ deltas[i + 1]) * transfer_prime(z_s[i])
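    # together these implement the backpropagation equations
    #   delta_L = 2 * (a_L - y) * f'(z_L)              (output layer)
    #   delta_i = (W_{i+1}.T @ delta_{i+1}) * f'(z_i)  (hidden layers)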
    # ∂C/∂w is the outer product of the layer error and the previous layer's
    # activation (a_s[i], since a_s[0] holds the input vector)
    dw = [np.outer(d, a_s[i]) for i, d in enumerate(deltas)]
    db = deltas  # ∂C/∂b is just the layer error
    return dw, db

def label_vector(label):
    '''Return the desired one-hot output vector for a digit label'''
    y = np.zeros(10)
    y[label] = 1
    return y

def backpropagate(image_batch, label_batch, eta):
    '''Update NN with gradient descent and backpropagation for a batch of inputs

    eta is the learning rate.
    '''
    global weights, biases

    num_images = image_batch.shape[1]
    for i in range(num_images):
        y = label_vector(label_batch[i])
        dws, dbs = cost_grad(image_batch[:, i], y)
        # descend along the gradient: subtract it, scaled by the learning rate
        weights = [w - eta * dw for w, dw in zip(weights, dws)]
        biases = [b - eta * db for b, db in zip(biases, dbs)]
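# note that backpropagate() takes one gradient step per image, i.e. plain
# stochastic gradient descent with a batch size of 1
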
106 """Count percentage of test inputs which are being recognized correctly"""
109 num_images = test_images.shape[1]
110 for i in range(num_images):
111 # the recognized digit is the index of the highest-valued output neuron
112 y = classify(feed_forward(test_images[:, i])[1][-1])[0]
113 good += int(y == test_labels[i])
114 return 100 * (good / num_images)
res = feed_forward(test_images[:, 0])
print(f'output vector of first image: {res[1][-1]}')
digit, conf = classify(res[1][-1])
print(f'classification of first image: {digit} with confidence {conf}; real label {test_labels[0]}')
print(f'correctly recognized images after initialization: {test()}%')
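# with the random initial weights, the accuracy just printed should be
# roughly 10%, i.e. chance level for ten digit classes
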
124 print(f"round #{i} of learning...")
125 backpropagate(test_images, test_labels, 1)
print(f'correctly recognized images: {test()}%')