Algorithm Overview
The backpropagation algorithm is the core method for training neural networks; it was rediscovered and popularized by Rumelhart, Hinton, and Williams in 1986. It uses the chain rule to compute the gradient of the loss function with respect to every weight efficiently, and gradient descent then uses these gradients to update the weights.
💡 Core idea
Forward pass computes the output → compute the loss → backpropagate the gradients → update the weights
flowchart LR
A[Input layer] -->|forward pass| B[Hidden layer]
B -->|forward pass| C[Output layer]
C -->|compute loss| D[Loss function]
D -->|compute gradients| E[Backpropagation]
E -->|∂L/∂W₃| C
E -->|∂L/∂W₂| B
E -->|∂L/∂W₁| A
subgraph One iteration
A
B
C
D
E
end
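To make this loop concrete, here is a minimal numerical sketch of one training iteration for a single sigmoid neuron with binary cross-entropy loss; the input, label, and initial parameter values are arbitrary illustration choices, not taken from the text above.

import numpy as np

# One sigmoid neuron: a = sigmoid(w*x + b), L = binary cross-entropy
x, y = 1.5, 1.0       # illustrative input and target
w, b = 0.2, 0.0       # initial parameters
lr = 0.1              # learning rate

# 1. Forward pass
z = w * x + b
a = 1.0 / (1.0 + np.exp(-z))
# 2. Loss
loss = -(y * np.log(a) + (1 - y) * np.log(1 - a))
# 3. Backward pass: for sigmoid + cross-entropy, dL/dz = a - y
dz = a - y
dw, db = dz * x, dz
# 4. Gradient-descent update
w -= lr * dw
b -= lr * db
print(f"loss={loss:.4f}  dw={dw:+.4f}  updated w={w:.4f}")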
Network structure
Input layer (4) → Hidden layer 1 (6) → Hidden layer 2 (4) → Output layer (3)
Network components
| Component | Description | Mathematical form |
|---|---|---|
| Neuron | Basic computational unit | z = Wx + b, a = σ(z) |
| Weights | Connection-strength parameters | W ∈ ℝ^(m×n) |
| Biases | Shift the activation threshold | b ∈ ℝ^m |
| Activation function | Introduces non-linearity | σ(z) |
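As a quick sketch of how these pieces fit together, the snippet below computes a single layer with n = 4 inputs and m = 3 neurons; the sizes and the choice of ReLU as σ are arbitrary illustrations.

import numpy as np

n, m = 4, 3                        # n inputs, m neurons (illustrative sizes)
W = np.random.randn(m, n) * 0.1    # weights, W ∈ R^(m×n)
b = np.zeros((m, 1))               # biases, b ∈ R^m
x = np.random.randn(n, 1)          # one input column vector

z = W @ x + b                      # z = Wx + b
a = np.maximum(0, z)               # a = σ(z), here σ = ReLU
print(z.shape, a.shape)            # both (3, 1)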
Forward Propagation
Input data flows from the input layer through the hidden layers to the output layer, which produces the prediction.
Forward propagation formulas
z^[l] = W^[l] · a^[l-1] + b^[l]
a^[l] = σ(z^[l])
where l is the layer index and a^[0] = x is the input
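A short NumPy sketch of these formulas for the 4-6-4-3 network shown earlier, using random weights and ReLU as σ; this is an illustration only, and the full implementations follow in the code section.

import numpy as np

layer_sizes = [4, 6, 4, 3]
rng = np.random.default_rng(0)
W = [rng.normal(scale=0.1, size=(layer_sizes[l + 1], layer_sizes[l]))
     for l in range(len(layer_sizes) - 1)]
b = [np.zeros((layer_sizes[l + 1], 1)) for l in range(len(layer_sizes) - 1)]

a = rng.normal(size=(layer_sizes[0], 1))   # a^[0] = x
for l in range(len(W)):
    z = W[l] @ a + b[l]                    # z^[l] = W^[l] · a^[l-1] + b^[l]
    a = np.maximum(0, z)                   # a^[l] = σ(z^[l]), here ReLU
print(a.shape)                             # (3, 1): output of the last layer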
Common activation functions
| Activation | Formula | Characteristics |
|---|---|---|
| Sigmoid | σ(z) = 1 / (1 + e⁻ᶻ) | Output in (0, 1); suited to binary classification |
| ReLU | σ(z) = max(0, z) | Most widely used; cheap to compute; mitigates vanishing gradients |
| Tanh | σ(z) = (eᶻ - e⁻ᶻ) / (eᶻ + e⁻ᶻ) | Output in (-1, 1); zero-centered |
| Softmax | σ(z)ᵢ = eᶻⁱ / Σⱼ eᶻʲ | Output layer for multi-class classification |
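These four activations can be written in a few lines of NumPy; the softmax below subtracts the maximum before exponentiating for numerical stability, a standard trick not mentioned in the table above.

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def relu(z):
    return np.maximum(0, z)

def tanh(z):
    return np.tanh(z)

def softmax(z):
    e = np.exp(z - np.max(z))   # shift by the max for numerical stability
    return e / np.sum(e)

z = np.array([-1.0, 0.0, 2.0])
for f in (sigmoid, relu, tanh, softmax):
    print(f.__name__, f(z))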
Backpropagation
Starting from the output layer, the chain rule is applied layer by layer to compute the gradients of the loss function with respect to the weights and biases.
Gradient computation steps (a compact NumPy sketch of all five steps follows this list)
1. Compute the output-layer error: δ^[L] = ∂L/∂z^[L] = a^[L] - y (cross-entropy loss)
2. Compute the output-layer gradients: ∂L/∂W^[L] = δ^[L] · (a^[L-1])ᵀ / m
3. Propagate the error backwards: δ^[l] = ((W^[l+1])ᵀ · δ^[l+1]) ⊙ σ'(z^[l])
4. Compute the hidden-layer gradients: ∂L/∂W^[l] = δ^[l] · (a^[l-1])ᵀ / m
5. Update the weights: W^[l] = W^[l] - α · ∂L/∂W^[l]
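Written out for a two-layer network (ReLU hidden layer, softmax output, a batch of m samples), the five steps map to one line each; the layer sizes and batch size here are arbitrary illustration choices.

import numpy as np

rng = np.random.default_rng(0)
m = 8                                            # batch size
X = rng.normal(size=(4, m))                      # a^[0]
Y = np.eye(3)[:, rng.integers(0, 3, m)]          # one-hot labels, shape (3, m)
W1, b1 = rng.normal(scale=0.1, size=(6, 4)), np.zeros((6, 1))
W2, b2 = rng.normal(scale=0.1, size=(3, 6)), np.zeros((3, 1))
alpha = 0.1

# forward pass
Z1 = W1 @ X + b1
A1 = np.maximum(0, Z1)
Z2 = W2 @ A1 + b2
E = np.exp(Z2 - Z2.max(axis=0))
A2 = E / E.sum(axis=0)                           # softmax output

d2 = A2 - Y                                                    # 1. output-layer error
dW2, db2 = d2 @ A1.T / m, d2.sum(axis=1, keepdims=True) / m    # 2. output-layer gradients
d1 = (W2.T @ d2) * (Z1 > 0)                                    # 3. propagate the error backwards
dW1, db1 = d1 @ X.T / m, d1.sum(axis=1, keepdims=True) / m     # 4. hidden-layer gradients
W2, b2 = W2 - alpha * dW2, b2 - alpha * db2                    # 5. gradient-descent update
W1, b1 = W1 - alpha * dW1, b1 - alpha * db1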
Detailed gradient derivation
Loss function (cross-entropy):
L = -Σᵢ yᵢ log(aᵢ^[L])
Output-layer error:
δ^[L] = ∂L/∂z^[L] = ∂L/∂a^[L] · ∂a^[L]/∂z^[L]
For a softmax output with the cross-entropy loss above (and a one-hot label, so Σᵢ yᵢ = 1), the Jacobian terms combine and this simplifies to
δ^[L] = a^[L] - y
Hidden-layer error (chain rule):
δ^[l] = ∂L/∂z^[l]
      = (∂z^[l+1]/∂z^[l])ᵀ · ∂L/∂z^[l+1]
      = ((W^[l+1])ᵀ · δ^[l+1]) ⊙ σ'(z^[l])
Weight gradient:
∂L/∂W^[l] = δ^[l] · (a^[l-1])ᵀ / m
Bias gradient:
∂L/∂b^[l] = Σᵢ δᵢ^[l] / m
where m is the number of samples in the batch.
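A standard way to verify this derivation is a numerical gradient check. The sketch below (an illustration, not part of the original text) compares the analytic result δ^[L] = a^[L] − y against a central-difference estimate for one softmax + cross-entropy output layer.

import numpy as np

rng = np.random.default_rng(0)
z = rng.normal(size=(3, 1))              # pre-activations of the output layer
y = np.array([[0.0], [1.0], [0.0]])      # one-hot label

def softmax(v):
    e = np.exp(v - v.max())
    return e / e.sum()

def loss(v):
    return float(-np.sum(y * np.log(softmax(v))))

analytic = softmax(z) - y                # analytic gradient: dL/dz = a - y

eps = 1e-6
numeric = np.zeros_like(z)
for i in range(z.size):
    zp, zm = z.copy(), z.copy()
    zp[i, 0] += eps
    zm[i, 0] -= eps
    numeric[i, 0] = (loss(zp) - loss(zm)) / (2 * eps)

print(np.max(np.abs(analytic - numeric)))   # should be on the order of 1e-9 or smaller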
Code Implementation

JavaScript implementation

// A simple neural-network implementation
class NeuralNetwork {
constructor(layerSizes) {
this.layers = layerSizes.length - 1;
this.weights = [];
this.biases = [];
    // Initialize weights and biases
for (let i = 0; i < this.layers; i++) {
this.weights.push(
Array(layerSizes[i + 1]).fill(0).map(() =>
Array(layerSizes[i]).fill(0).map(() => Math.random() * 0.1)
)
);
this.biases.push(
Array(layerSizes[i + 1]).fill(0).map(() => Math.random() * 0.1)
);
}
}
  // ReLU activation for hidden layers
  relu(x) {
    return Math.max(0, x);
  }
  reluDerivative(x) {
    return x > 0 ? 1 : 0;
  }
  // Softmax for the output layer, so that delta = a - y matches the
  // cross-entropy gradient derived above
  softmax(z) {
    const maxZ = Math.max(...z);
    const exps = z.map(v => Math.exp(v - maxZ));
    const sum = exps.reduce((a, b) => a + b, 0);
    return exps.map(v => v / sum);
  }
  // Forward propagation
  forward(input) {
    this.activations = [input];
    this.zValues = [];
    let current = input;
    for (let i = 0; i < this.layers; i++) {
      // z = W·a + b (including the bias term)
      const z = this.matmul(this.weights[i], current).map((v, j) => v + this.biases[i][j]);
      this.zValues.push(z);
      // ReLU in hidden layers, softmax in the output layer
      current = i < this.layers - 1 ? z.map(v => this.relu(v)) : this.softmax(z);
      this.activations.push(current);
    }
    return current;
  }
  // Backpropagation
backward(target, learningRate = 0.01) {
const m = target.length;
let delta = null;
    // Propagate backwards starting from the output layer
for (let i = this.layers - 1; i >= 0; i--) {
if (i === this.layers - 1) {
        // Output-layer error
delta = this.activations[i + 1].map((a, j) => a - target[j]);
} else {
        // Hidden-layer error
const newDelta = [];
for (let j = 0; j < this.zValues[i].length; j++) {
let sum = 0;
for (let k = 0; k < delta.length; k++) {
sum += this.weights[i + 1][k][j] * delta[k];
}
newDelta.push(sum * this.reluDerivative(this.zValues[i][j]));
}
delta = newDelta;
}
      // Update weights and biases
for (let j = 0; j < this.weights[i].length; j++) {
for (let k = 0; k < this.weights[i][j].length; k++) {
this.weights[i][j][k] -= learningRate * delta[j] * this.activations[i][k] / m;
}
this.biases[i][j] -= learningRate * delta[j] / m;
}
}
}
matmul(matrix, vector) {
return matrix.map(row =>
row.reduce((sum, val, i) => sum + val * vector[i], 0)
);
}
  // Training loop
train(X, y, epochs = 1000, learningRate = 0.01) {
for (let epoch = 0; epoch < epochs; epoch++) {
let totalLoss = 0;
for (let i = 0; i < X.length; i++) {
const output = this.forward(X[i]);
      // Accumulate the cross-entropy loss
for (let j = 0; j < y[i].length; j++) {
if (y[i][j] > 0) {
totalLoss -= y[i][j] * Math.log(Math.max(output[j], 1e-10));
}
}
this.backward(y[i], learningRate);
}
if (epoch % 100 === 0) {
console.log(`Epoch ${epoch}: Loss = ${(totalLoss / X.length).toFixed(4)}`);
}
}
}
}
// Usage example
const nn = new NeuralNetwork([4, 6, 4, 3]);
const X = [[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]];
const y = [[1, 0, 0], [0, 1, 0], [0, 0, 1], [1, 0, 0]];
nn.train(X, y, 1000, 0.1);

Python (NumPy) implementation

import numpy as np
class NeuralNetwork:
def __init__(self, layer_sizes):
self.layers = len(layer_sizes) - 1
self.weights = []
self.biases = []
        # Initialize weights and biases
for i in range(self.layers):
w = np.random.randn(layer_sizes[i + 1], layer_sizes[i]) * 0.1
b = np.random.randn(layer_sizes[i + 1], 1) * 0.1
self.weights.append(w)
self.biases.append(b)
    def relu(self, x):
        return np.maximum(0, x)
    def relu_derivative(self, x):
        return (x > 0).astype(float)
    def softmax(self, x):
        # Column-wise softmax, shifted by the max for numerical stability
        e = np.exp(x - np.max(x, axis=0, keepdims=True))
        return e / np.sum(e, axis=0, keepdims=True)
    def forward(self, X):
        self.activations = [X]
        self.z_values = []
        current = X
        for i in range(self.layers):
            z = np.dot(self.weights[i], current) + self.biases[i]
            self.z_values.append(z)
            # ReLU in hidden layers, softmax in the output layer
            # (so that delta = a - y matches the cross-entropy gradient)
            current = self.relu(z) if i < self.layers - 1 else self.softmax(z)
            self.activations.append(current)
        return current
def backward(self, Y, learning_rate=0.01):
m = Y.shape[1]
delta = None
for i in range(self.layers - 1, -1, -1):
if i == self.layers - 1:
                # Output-layer error (softmax output with cross-entropy loss)
delta = self.activations[i + 1] - Y
else:
                # Hidden-layer error
delta = np.dot(self.weights[i + 1].T, delta) * self.relu_derivative(self.z_values[i])
            # Compute the gradients for layer i
dW = np.dot(delta, self.activations[i].T) / m
db = np.sum(delta, axis=1, keepdims=True) / m
            # Update weights and biases
self.weights[i] -= learning_rate * dW
self.biases[i] -= learning_rate * db
def train(self, X, Y, epochs=1000, learning_rate=0.01):
for epoch in range(epochs):
output = self.forward(X)
self.backward(Y, learning_rate)
            if epoch % 100 == 0:
                # Cross-entropy loss averaged over the batch
                loss = -np.sum(Y * np.log(output + 1e-10)) / Y.shape[1]
                print(f"Epoch {epoch}: Loss = {loss:.4f}")
# Usage example
nn = NeuralNetwork([4, 6, 4, 3])
X = np.array([[1, 0, 0, 0],
[0, 1, 0, 0],
[0, 0, 1, 0],
[0, 0, 0, 1]]).T
Y = np.array([[1, 0, 0, 1],
              [0, 1, 0, 0],
              [0, 0, 1, 0]])  # shape (3, 4): one one-hot column per sample
nn.train(X, Y, epochs=1000, learning_rate=0.1)

Go implementation

package main
import (
"fmt"
"math"
"math/rand"
"time"
)
type NeuralNetwork struct {
layers int
weights [][][]float64
biases [][]float64
activations [][]float64
zValues [][]float64
}
func NewNeuralNetwork(layerSizes []int) *NeuralNetwork {
rand.Seed(time.Now().UnixNano())
nn := &NeuralNetwork{
layers: len(layerSizes) - 1,
}
	// Initialize weights and biases
for i := 0; i < nn.layers; i++ {
w := make([][]float64, layerSizes[i+1])
for j := range w {
w[j] = make([]float64, layerSizes[i])
for k := range w[j] {
w[j][k] = rand.NormFloat64() * 0.1
}
}
nn.weights = append(nn.weights, w)
b := make([]float64, layerSizes[i+1])
for j := range b {
b[j] = rand.NormFloat64() * 0.1
}
nn.biases = append(nn.biases, b)
}
return nn
}
func (nn *NeuralNetwork) relu(x float64) float64 {
if x > 0 {
return x
}
return 0
}
func (nn *NeuralNetwork) reluDerivative(x float64) float64 {
	if x > 0 {
		return 1
	}
	return 0
}
// softmax for the output layer, so that delta = a - y matches the
// cross-entropy gradient
func (nn *NeuralNetwork) softmax(z []float64) []float64 {
	maxZ := z[0]
	for _, v := range z {
		if v > maxZ {
			maxZ = v
		}
	}
	out := make([]float64, len(z))
	sum := 0.0
	for i, v := range z {
		out[i] = math.Exp(v - maxZ)
		sum += out[i]
	}
	for i := range out {
		out[i] /= sum
	}
	return out
}
func (nn *NeuralNetwork) Forward(input []float64) []float64 {
nn.activations = [][]float64{input}
nn.zValues = [][]float64{}
current := input
for i := 0; i < nn.layers; i++ {
z := nn.matmul(nn.weights[i], current)
		// Add the bias term
for j := range z {
z[j] += nn.biases[i][j]
}
nn.zValues = append(nn.zValues, z)
		// ReLU in hidden layers, softmax in the output layer
		if i < nn.layers-1 {
			current = make([]float64, len(z))
			for j, v := range z {
				current[j] = nn.relu(v)
			}
		} else {
			current = nn.softmax(z)
		}
nn.activations = append(nn.activations, current)
}
return current
}
func (nn *NeuralNetwork) matmul(matrix [][]float64, vector []float64) []float64 {
result := make([]float64, len(matrix))
for i, row := range matrix {
sum := 0.0
for j, val := range row {
sum += val * vector[j]
}
result[i] = sum
}
return result
}
func (nn *NeuralNetwork) Backward(target []float64, learningRate float64) {
m := len(target)
var delta []float64
	// Propagate backwards starting from the output layer
for i := nn.layers - 1; i >= 0; i-- {
if i == nn.layers-1 {
			// Output-layer error
delta = make([]float64, len(nn.activations[i+1]))
for j := range delta {
delta[j] = nn.activations[i+1][j] - target[j]
}
} else {
			// Hidden-layer error
newDelta := make([]float64, len(nn.zValues[i]))
for j := range newDelta {
sum := 0.0
for k := range delta {
sum += nn.weights[i+1][k][j] * delta[k]
}
newDelta[j] = sum * nn.reluDerivative(nn.zValues[i][j])
}
delta = newDelta
}
		// Update weights and biases
for j := range nn.weights[i] {
for k := range nn.weights[i][j] {
nn.weights[i][j][k] -= learningRate * delta[j] * nn.activations[i][k] / float64(m)
}
nn.biases[i][j] -= learningRate * delta[j] / float64(m)
}
}
}
func (nn *NeuralNetwork) Train(X [][]float64, Y [][]float64, epochs int, learningRate float64) {
for epoch := 0; epoch < epochs; epoch++ {
totalLoss := 0.0
for i := range X {
output := nn.Forward(X[i])
			// Accumulate the cross-entropy loss
for j := range Y[i] {
if Y[i][j] > 0 {
totalLoss -= Y[i][j] * math.Log(math.Max(output[j], 1e-10))
}
}
nn.Backward(Y[i], learningRate)
}
if epoch%100 == 0 {
			fmt.Printf("Epoch %d: Loss = %.4f\n", epoch, totalLoss/float64(len(X)))
}
}
}
func main() {
nn := NewNeuralNetwork([]int{4, 6, 4, 3})
X := [][]float64{
{1, 0, 0, 0},
{0, 1, 0, 0},
{0, 0, 1, 0},
{0, 0, 0, 1},
}
Y := [][]float64{
{1, 0, 0},
{0, 1, 0},
{0, 0, 1},
{1, 0, 0},
}
nn.Train(X, Y, 1000, 0.1)
}
PyTorch implementation

import torch
import torch.nn as nn
import torch.optim as optim
# The same network expressed with PyTorch modules and autograd
class NeuralNetwork(nn.Module):
def __init__(self, layer_sizes):
super().__init__()
self.layers = nn.ModuleList()
for i in range(len(layer_sizes) - 1):
self.layers.append(nn.Linear(layer_sizes[i], layer_sizes[i + 1]))
            if i < len(layer_sizes) - 2:  # add ReLU after hidden layers only
self.layers.append(nn.ReLU())
def forward(self, x):
for layer in self.layers:
x = layer(x)
return x
# Create the model
model = NeuralNetwork([4, 6, 4, 3])
# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)
# Training data
X = torch.tensor([[1., 0., 0., 0.],
[0., 1., 0., 0.],
[0., 0., 1., 0.],
[0., 0., 0., 1.]])
y = torch.tensor([0, 1, 2, 0])  # class indices
# Training loop
for epoch in range(1000):
    # Forward pass
output = model(X)
loss = criterion(output, y)
    # Backward pass
optimizer.zero_grad()
loss.backward()
optimizer.step()
if epoch % 100 == 0:
print(f"Epoch {epoch}: Loss = {loss.item():.4f}")