【DSW Gallery】Using Numpy to implement convolutional neural network
Using Numpy to implement CNN's forward network and backpropagation
The data set used in this article comes from Kaggle and can be downloaded here
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
# Load the training data from a CSV file located in the current working directory.
data = pd.read_csv('train_numpy.csv')
(2290, 785)
# Turn the DataFrame into a plain ndarray; rows are samples, and the first
# column is used as the label further down.
data = np.array(data)
m, n = data.shape

# Shuffle the rows in place so the dev/train split below is random.
np.random.shuffle(data)

# Dev set: the first 1000 shuffled rows, transposed so each column is a sample.
data_dev = data[:1000].T
Y_dev = data_dev[0]
X_dev = data_dev[1:n] / 255.  # scale features by 1/255

# Training set: every remaining row, laid out the same way.
data_train = data[1000:].T
Y_train = data_train[0]
X_train = data_train[1:n] / 255.
_, m_train = X_train.shape
In this example, we define a simple neural network with only two layers (both layers are fully connected; no convolution layers are used in the code below)
1. The activation function is ReLU, because it effectively mitigates the vanishing-gradient problem
2. Finally, Softmax converts the output of each output neuron into a number in [0, 1], so that the outputs for one sample sum to 1
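Forward pass
$$Z^{[1]} = W^{[1]} X + b^{[1]}, \qquad A^{[1]} = \mathrm{ReLU}(Z^{[1]})$$
$$Z^{[2]} = W^{[2]} A^{[1]} + b^{[2]}, \qquad A^{[2]} = \mathrm{softmax}(Z^{[2]})$$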
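Backward pass (with $Y$ one-hot encoded and $m$ the number of samples)
$$dZ^{[2]} = A^{[2]} - Y, \qquad dW^{[2]} = \frac{1}{m}\, dZ^{[2]} A^{[1]T}, \qquad db^{[2]} = \frac{1}{m} \sum dZ^{[2]}$$
$$dZ^{[1]} = W^{[2]T} dZ^{[2]} \odot \mathrm{ReLU}'(Z^{[1]}), \qquad dW^{[1]} = \frac{1}{m}\, dZ^{[1]} X^{T}, \qquad db^{[1]} = \frac{1}{m} \sum dZ^{[1]}$$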
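Formulas for the parameter update (with learning rate $\alpha$)
$$W^{[l]} := W^{[l]} - \alpha\, dW^{[l]}, \qquad b^{[l]} := b^{[l]} - \alpha\, db^{[l]}, \qquad l \in \{1, 2\}$$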
Following the formulas above, define the parameter initialization and the activation functions
def init_params(n_input=784, n_hidden=10, n_output=10):
    """Create randomly initialised weights and biases for the two-layer network.

    Every entry is drawn uniformly from [-0.5, 0.5) using NumPy's global
    random state. The draw order (W1, b1, W2, b2) is fixed, so results are
    reproducible after ``np.random.seed``.

    Args:
        n_input: Number of input features per sample (default 784).
        n_hidden: Number of units in the hidden layer (default 10).
        n_output: Number of output units (default 10).

    Returns:
        Tuple ``(W1, b1, W2, b2)`` with shapes ``(n_hidden, n_input)``,
        ``(n_hidden, 1)``, ``(n_output, n_hidden)`` and ``(n_output, 1)``.
    """
    # np.random.rand samples from [0, 1); shifting by 0.5 centres it on zero.
    W1 = np.random.rand(n_hidden, n_input) - 0.5
    b1 = np.random.rand(n_hidden, 1) - 0.5
    W2 = np.random.rand(n_output, n_hidden) - 0.5
    b2 = np.random.rand(n_output, 1) - 0.5
    return W1, b1, W2, b2
def ReLU(Z):
    """Rectified linear unit: element-wise maximum of ``Z`` and 0."""
    rectified = np.maximum(Z, 0)
    return rectified
def ReLU_deriv(Z):
    """Derivative of ReLU as a boolean mask: True where ``Z`` is strictly positive."""
    is_positive = Z > 0
    return is_positive
def softmax(Z):
    """Apply softmax along axis 0 (one distribution per column).

    Args:
        Z: Array of scores; for a 2-D input each column is normalised
            independently, for a 1-D input the whole vector is normalised.

    Returns:
        Array shaped like ``Z`` whose entries along axis 0 are non-negative
        and sum to 1.
    """
    # Subtracting the per-column maximum leaves the result mathematically
    # unchanged but keeps exp() from overflowing to inf (which would turn
    # the quotient into NaN) when scores are large.
    shifted = Z - np.max(Z, axis=0, keepdims=True)
    exp_Z = np.exp(shifted)
    # np.sum with an explicit axis replaces the Python builtin sum, which
    # only worked by iterating over rows one at a time.
    return exp_Z / np.sum(exp_Z, axis=0, keepdims=True)
Define the function for the forward pass and the function for the backward pass
def forward_prop(W1, b1, W2, b2, X):
    """Run the forward pass of the two-layer network.

    Computes the hidden pre-activation, applies ReLU, computes the output
    pre-activation and applies softmax.

    Args:
        W1, b1: Weights and bias of the first layer.
        W2, b2: Weights and bias of the second layer.
        X: Input matrix (presumably one sample per column; confirm with caller).

    Returns:
        Tuple ``(Z1, A1, Z2, A2)``: pre-activation and activation of each layer.
    """
    hidden_pre = W1.dot(X) + b1
    hidden_act = ReLU(hidden_pre)
    output_pre = W2.dot(hidden_act) + b2
    output_act = softmax(output_pre)
    return hidden_pre, hidden_act, output_pre, output_act
def one_hot(Y, num_classes=None):
    """One-hot encode a 1-D vector of class labels.

    Args:
        Y: 1-D array of labels; values are truncated to integers.
        num_classes: Number of rows in the result. When ``None`` (the
            default) it is inferred as ``int(Y.max()) + 1``. Pass it
            explicitly when the batch may not contain the highest class,
            otherwise the result has fewer rows than the network output.

    Returns:
        Integer array of shape ``(num_classes, Y.size)`` in which column
        ``i`` has a single 1 in row ``Y[i]``.
    """
    labels = np.asarray(Y).astype(int)
    if num_classes is None:
        num_classes = int(labels.max()) + 1
    # Fill sample-major (one row per sample), then transpose so that each
    # column corresponds to one sample.
    encoded = np.zeros((labels.size, num_classes), dtype=int)
    encoded[np.arange(labels.size), labels] = 1
    return encoded.T
def backward_prop(Z1, A1, Z2, A2, W1, W2, X, Y):
    """Compute the gradients of the loss for both layers.

    Args:
        Z1, A1: Hidden-layer pre-activation and activation from the forward pass.
        Z2, A2: Output-layer pre-activation and softmax output from the forward pass.
        W1, W2: Current weights (only ``W2`` is needed for the computation).
        X: Input matrix that was fed to the forward pass.
        Y: 1-D array of class labels, one per sample.

    Returns:
        Tuple ``(dW1, db1, dW2, db2)``. The bias gradients are column
        vectors with one entry per unit of the corresponding layer.
    """
    # Average over the samples actually present in this batch, not over the
    # module-level dataset row count.
    num_samples = Y.size
    one_hot_Y = one_hot(Y)
    dZ2 = A2 - one_hot_Y
    dW2 = dZ2.dot(A1.T) / num_samples
    # Sum across samples only (axis 1) so every unit keeps its own bias gradient.
    db2 = np.sum(dZ2, axis=1, keepdims=True) / num_samples
    dZ1 = W2.T.dot(dZ2) * ReLU_deriv(Z1)
    dW1 = dZ1.dot(X.T) / num_samples
    db1 = np.sum(dZ1, axis=1, keepdims=True) / num_samples
    return dW1, db1, dW2, db2
def update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha):
    """Take one gradient-descent step on every parameter.

    Each parameter ``p`` with gradient ``g`` becomes ``p - alpha * g``.

    Returns:
        Tuple ``(W1, b1, W2, b2)`` holding the updated values.
    """
    params = (W1, b1, W2, b2)
    grads = (dW1, db1, dW2, db2)
    new_W1, new_b1, new_W2, new_b2 = (
        param - alpha * grad for param, grad in zip(params, grads)
    )
    return new_W1, new_b1, new_W2, new_b2
Define the functions for making predictions, computing accuracy, and running gradient descent
def get_predictions(A2):
    """Return, for each column of ``A2``, the row index holding the largest value."""
    predicted = np.argmax(A2, axis=0)
    return predicted
def get_accuracy(predictions, Y):
    """Print both arrays, then return the fraction of predictions equal to ``Y``."""
    print(predictions, Y)
    num_correct = np.sum(predictions == Y)
    return num_correct / Y.size
def gradient_descent(X, Y, alpha, iterations):
    """Train the two-layer network with full-batch gradient descent.

    Args:
        X: Input matrix passed unchanged to the forward and backward passes.
        Y: Labels matching ``X``.
        alpha: Learning rate used for every update.
        iterations: Number of update steps to perform.

    Returns:
        Tuple ``(W1, b1, W2, b2)`` with the parameters after the last step.
    """
    W1, b1, W2, b2 = init_params()
    for step in range(iterations):
        Z1, A1, Z2, A2 = forward_prop(W1, b1, W2, b2, X)
        grads = backward_prop(Z1, A1, Z2, A2, W1, W2, X, Y)
        W1, b1, W2, b2 = update_params(W1, b1, W2, b2, *grads, alpha)
        if step % 10 != 0:
            continue
        # Progress report every 10th step; A2 comes from the forward pass
        # that ran before this step's parameter update.
        print("Iteration: ", step)
        print(get_accuracy(get_predictions(A2), Y))
    return W1, b1, W2, b2
start training
# Train on the training split: learning rate 0.10 for 500 iterations.
W1, b1, W2, b2 = gradient_descent(X_train, Y_train, alpha=0.10, iterations=500)
Iteration: 0
[2 2 2 ... 1 3 2] [9. 7. 2. ... 5. 6. 0.]
Iteration: 10
[1 1 0 ... 1 3 6] [9. 7. 2. ... 5. 6. 0.]
Iteration: 20
[1 1 2 ... 4 0 6] [9. 7. 2. ... 5. 6. 0.]
Iteration: 30
[1 1 6 ... 0 0 6] [9. 7. 2. ... 5. 6. 0.]
Iteration: 40
[1 1 6 ... 0 0 6] [9. 7. 2. ... 5. 6. 0.]
Iteration: 50
[1 1 6 ... 0 0 0] [9. 7. 2. ... 5. 6. 0.]
Iteration: 60
[1 1 6 ... 0 0 0] [9. 7. 2. ... 5. 6. 0.]
Iteration: 70
[1 1 2 ... 0 0 0] [9. 7. 2. ... 5. 6. 0.]
Iteration: 80
[5 7 2 ... 0 0 0] [9. 7. 2. ... 5. 6. 0.]
Iteration: 90
[5 7 2 ... 0 0 0] [9. 7. 2. ... 5. 6. 0.]
Iteration: 100
[5 7 2 ... 0 0 0] [9. 7. 2. ... 5. 6. 0.]
Iteration: 110
[5 7 2 ... 0 6 0] [9. 7. 2. ... 5. 6. 0.]
Iteration: 120
[5 7 2 ... 0 6 0] [9. 7. 2. ... 5. 6. 0.]
Iteration: 130
[5 7 2 ... 0 6 0] [9. 7. 2. ... 5. 6. 0.]
Iteration: 140
[9 7 2 ... 0 6 0] [9. 7. 2. ... 5. 6. 0.]
Iteration: 150
[9 7 2 ... 0 6 0] [9. 7. 2. ... 5. 6. 0.]
Iteration: 160
[9 7 2 ... 0 6 0] [9. 7. 2. ... 5. 6. 0.]
Iteration: 170
[9 7 2 ... 0 6 0] [9. 7. 2. ... 5. 6. 0.]
Iteration: 180
[9 7 2 ... 0 6 0] [9. 7. 2. ... 5. 6. 0.]
Iteration: 190
[9 7 2 ... 0 6 0] [9. 7. 2. ... 5. 6. 0.]
Iteration: 200
[9 7 2 ... 0 6 0] [9. 7. 2. ... 5. 6. 0.]
Iteration: 210
[9 7 2 ... 0 6 0] [9. 7. 2. ... 5. 6. 0.]
Iteration: 220
[9 7 2 ... 0 6 0] [9. 7. 2. ... 5. 6. 0.]
Iteration: 230
[9 7 2 ... 0 6 0] [9. 7. 2. ... 5. 6. 0.]
Iteration: 240
[9 7 2 ... 0 6 0] [9. 7. 2. ... 5. 6. 0.]
Iteration: 250
[9 7 2 ... 5 6 0] [9. 7. 2. ... 5. 6. 0.]
Iteration: 260
[9 7 2 ... 5 6 0] [9. 7. 2. ... 5. 6. 0.]
Iteration: 270
[9 7 2 ... 5 6 0] [9. 7. 2. ... 5. 6. 0.]
Iteration: 280
[9 7 2 ... 5 6 0] [9. 7. 2. ... 5. 6. 0.]
Iteration: 290
[9 7 2 ... 5 6 0] [9. 7. 2. ... 5. 6. 0.]
Iteration: 300
[9 7 2 ... 5 6 0] [9. 7. 2. ... 5. 6. 0.]
Iteration: 310
[9 7 2 ... 5 6 0] [9. 7. 2. ... 5. 6. 0.]
Iteration: 320
[9 7 2 ... 5 6 0] [9. 7. 2. ... 5. 6. 0.]
Iteration: 330
[9 7 2 ... 5 6 0] [9. 7. 2. ... 5. 6. 0.]
Iteration: 340
[9 7 2 ... 5 6 0] [9. 7. 2. ... 5. 6. 0.]
Iteration: 350
[9 7 2 ... 5 6 0] [9. 7. 2. ... 5. 6. 0.]
Iteration: 360
[9 7 2 ... 5 6 0] [9. 7. 2. ... 5. 6. 0.]
Iteration: 370
[9 7 2 ... 5 6 0] [9. 7. 2. ... 5. 6. 0.]
Iteration: 380
[9 7 2 ... 5 6 0] [9. 7. 2. ... 5. 6. 0.]
Iteration: 390
[9 7 2 ... 5 6 0] [9. 7. 2. ... 5. 6. 0.]
Iteration: 400
[9 7 2 ... 5 6 0] [9. 7. 2. ... 5. 6. 0.]
Iteration: 410
[9 7 2 ... 5 6 0] [9. 7. 2. ... 5. 6. 0.]
Iteration: 420
[9 7 2 ... 5 6 0] [9. 7. 2. ... 5. 6. 0.]
Iteration: 430
[9 7 2 ... 5 6 0] [9. 7. 2. ... 5. 6. 0.]
Iteration: 440
[9 7 2 ... 5 6 0] [9. 7. 2. ... 5. 6. 0.]
Iteration: 450
[9 7 2 ... 5 6 0] [9. 7. 2. ... 5. 6. 0.]
Iteration: 460
[9 7 2 ... 5 6 0] [9. 7. 2. ... 5. 6. 0.]
Iteration: 470
[9 7 2 ... 5 6 0] [9. 7. 2. ... 5. 6. 0.]
Iteration: 480
[9 7 2 ... 5 6 0] [9. 7. 2. ... 5. 6. 0.]
Iteration: 490
[9 7 2 ... 5 6 0] [9. 7. 2. ... 5. 6. 0.]
def make_predictions(X, W1, b1, W2, b2):
    """Run a forward pass on ``X`` and return the predicted class for each column."""
    # Only the final activation is needed; the intermediate values are discarded.
    output_act = forward_prop(W1, b1, W2, b2, X)[3]
    return get_predictions(output_act)
def test_prediction(index, W1, b1, W2, b2):
    """Print the prediction and label for one training sample and display its image.

    Reads the sample from the module-level ``X_train`` / ``Y_train`` arrays.

    Args:
        index: Column of ``X_train`` (and entry of ``Y_train``) to inspect.
        W1, b1, W2, b2: Network parameters used for the prediction.
    """
    # Indexing with None keeps the sample as a single-column 2-D array.
    sample = X_train[:, index, None]
    predicted = make_predictions(sample, W1, b1, W2, b2)
    actual = Y_train[index]
    print("Prediction: ", predicted)
    print("Label: ", actual)

    # Reshape the column into a 28x28 grid and multiply by 255 for display.
    image = sample.reshape((28, 28)) * 255
    plt.gray()
    plt.imshow(image, interpolation='nearest')
    plt.show()
Let's pick a few inputs to see whether our model correctly recognizes the digit in each image
# Show the prediction, the label and the image for the first four training samples.
for sample_index in range(4):
    test_prediction(sample_index, W1, b1, W2, b2)
