🎯 监督学习算法
从标记数据中学习映射关系,预测未知数据的标签或值
📈 线性回归
建立变量间线性关系,预测连续值。最简单但强大的回归方法。
📊 逻辑回归
虽然名字叫回归,实则是分类算法。二分类问题的首选基线模型。
🔷 支持向量机
寻找最大间隔超平面,核技巧处理非线性。小样本表现优异。
🌳 决策树
模拟人类决策过程,可解释性强。易过拟合,常用作集成学习基学习器。
🌲 随机森林
多棵树集体决策,Bagging 代表。抗过拟合,性能稳定。
👥 K 近邻
物以类聚,人以群分。懒惰学习,训练快预测慢。
📧 朴素贝叶斯
基于贝叶斯定理,假设特征独立。文本分类经典算法。
💡 监督学习核心要素
📥 输入数据
特征向量 X = (x₁, x₂, ..., xₙ)
📤 输出标签
分类:离散类别 | 回归:连续值
🎯 学习目标
找到映射函数 f: X → Y
📊 训练过程
最小化损失函数,优化模型参数
📈 1. 线性回归 (Linear Regression)
算法原理
线性回归用于建立自变量和因变量之间的线性关系模型,是最基础的回归算法。
模型假设: y = wx + b + ε
损失函数(MSE): L = (1/n) Σ(yᵢ - ŷᵢ)²
正规方程解: w = (XᵀX)⁻¹Xᵀy
训练时间复杂度
O(n × m × k)
预测时间复杂度
O(m)
空间复杂度
O(m)
✅ 优点
- 计算简单,易于理解
- 可解释性强
- 训练速度快
- 有闭式解
❌ 缺点
- 只能捕捉线性关系
- 对异常值敏感
- 需要特征标准化
- 多重共线性问题
代码实现
// Linear regression trained with batch gradient descent on the MSE loss.
class LinearRegression {
  constructor(learningRate = 0.01, nIterations = 1000) {
    this.learningRate = learningRate; // gradient-descent step size
    this.nIterations = nIterations;   // number of training iterations
    this.weights = null;              // weight vector, initialized in fit()
    this.bias = null;                 // intercept term
  }

  // Linear output w·x + b for a single sample.
  _linear(row) {
    return row.reduce((acc, v, j) => acc + v * this.weights[j], this.bias);
  }

  // Train on X (number[][]) and targets y (number[]).
  fit(X, y) {
    const m = X.length;    // number of samples
    const d = X[0].length; // number of features
    this.weights = new Array(d).fill(0);
    this.bias = 0;

    for (let step = 0; step < this.nIterations; step++) {
      // Forward pass: predictions under the current parameters.
      const preds = X.map(row => this._linear(row));

      // Gradient of the MSE loss w.r.t. w and b (mean form).
      const gradW = new Array(d).fill(0);
      let gradB = 0;
      for (let i = 0; i < m; i++) {
        const err = preds[i] - y[i];
        gradB += err;
        for (let j = 0; j < d; j++) {
          gradW[j] += err * X[i][j];
        }
      }

      // One gradient-descent update.
      this.bias -= this.learningRate * (gradB / m);
      for (let j = 0; j < d; j++) {
        this.weights[j] -= this.learningRate * (gradW[j] / m);
      }
    }
  }

  // Predicted value w·x + b for every row of X.
  predict(X) {
    return X.map(row => this._linear(row));
  }
}
// 使用示例
const X = [[1], [2], [3], [4], [5]];
const y = [2, 4, 5, 4, 5];
const model = new LinearRegression(0.01, 1000);
model.fit(X, y);
const predictions = model.predict([[6], [7]]);
console.log('Predictions:', predictions);import numpy as np
class LinearRegression:
    """Linear model y = Xw + b fit with batch gradient descent on MSE."""

    def __init__(self, learning_rate=0.01, n_iterations=1000):
        self.learning_rate = learning_rate  # gradient-descent step size
        self.n_iterations = n_iterations    # number of training iterations
        self.weights = None                 # weight vector, set by fit()
        self.bias = None                    # intercept term

    def fit(self, X, y):
        """Train on X of shape (n_samples, n_features) and targets y."""
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0
        for _ in range(self.n_iterations):
            # Residuals under the current parameters.
            residual = (X @ self.weights + self.bias) - y
            # Gradient of the MSE loss (mean form).
            grad_w = X.T @ residual / n_samples
            grad_b = residual.sum() / n_samples
            self.weights = self.weights - self.learning_rate * grad_w
            self.bias = self.bias - self.learning_rate * grad_b

    def predict(self, X):
        """Return the prediction Xw + b for each sample."""
        return X @ self.weights + self.bias
# 使用示例
if __name__ == "__main__":
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
X, y = make_regression(n_samples=100, n_features=1, noise=10, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LinearRegression(learning_rate=0.01, n_iterations=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"MSE: {mse:.2f}")
print(f"R² Score: {r2:.2f}")package main
import (
"fmt"
"math/rand"
)
// LinearRegression fits y = w·x + b with batch gradient descent on MSE.
type LinearRegression struct {
	learningRate float64   // gradient-descent step size
	nIterations  int       // number of training iterations
	weights      []float64 // weight vector, initialized by Fit
	bias         float64   // intercept term
}

// NewLinearRegression returns an untrained model.
func NewLinearRegression(lr float64, iters int) *LinearRegression {
	return &LinearRegression{learningRate: lr, nIterations: iters}
}

// linear computes the output w·x + b for one sample.
func (lr *LinearRegression) linear(row []float64) float64 {
	out := lr.bias
	for j, v := range row {
		out += v * lr.weights[j]
	}
	return out
}

// Fit trains the model on X and targets y.
func (lr *LinearRegression) Fit(X [][]float64, y []float64) {
	m := len(X)
	d := len(X[0])
	lr.weights = make([]float64, d)
	lr.bias = 0

	for step := 0; step < lr.nIterations; step++ {
		// Forward pass: predictions under the current parameters.
		preds := make([]float64, m)
		for i := range X {
			preds[i] = lr.linear(X[i])
		}

		// Gradient of the MSE loss w.r.t. w and b (mean form).
		gradW := make([]float64, d)
		gradB := 0.0
		for i := 0; i < m; i++ {
			diff := preds[i] - y[i]
			gradB += diff
			for j := 0; j < d; j++ {
				gradW[j] += diff * X[i][j]
			}
		}
		for j := range gradW {
			gradW[j] /= float64(m)
		}
		gradB /= float64(m)

		// One gradient-descent update.
		lr.bias -= lr.learningRate * gradB
		for j := range gradW {
			lr.weights[j] -= lr.learningRate * gradW[j]
		}
	}
}

// Predict returns the prediction for every row of X.
func (lr *LinearRegression) Predict(X [][]float64) []float64 {
	out := make([]float64, len(X))
	for i, row := range X {
		out[i] = lr.linear(row)
	}
	return out
}
func main() {
X := [][]float64{{1}, {2}, {3}, {4}, {5}}
y := []float64{2, 4, 5, 4, 5}
model := NewLinearRegression(0.01, 1000)
model.Fit(X, y)
preds := model.Predict([][]float64{{6}, {7}})
fmt.Printf("Predictions: %v\n", preds)
}📊 2. 逻辑回归 (Logistic Regression)
算法原理
逻辑回归用于二分类问题,使用 Sigmoid 函数将线性组合映射到 [0,1] 区间,输出属于某类的概率。
Sigmoid 函数: σ(z) = 1 / (1 + e⁻ᶻ)
预测概率: P(y=1|x) = σ(wx + b)
损失函数: L = -Σ[yᵢlog(ŷᵢ) + (1-yᵢ)log(1-ŷᵢ)]
训练时间复杂度
O(n × m × k)
预测时间复杂度
O(m)
空间复杂度
O(m)
✅ 优点
- 输出概率,可解释性强
- 计算代价低
- 不易过拟合
- 支持在线学习
❌ 缺点
- 只能处理线性边界
- 对特征相关性敏感
- 特征不平衡时效果差
- 需要特征缩放
代码实现
// Binary logistic regression trained with batch gradient descent on
// the cross-entropy loss.
class LogisticRegression {
  constructor(learningRate = 0.01, nIterations = 1000) {
    this.learningRate = learningRate; // gradient-descent step size
    this.nIterations = nIterations;   // number of training iterations
    this.weights = null;              // weight vector, set by fit()
    this.bias = null;                 // intercept term
  }

  // Logistic function mapping any real z into (0, 1).
  sigmoid(z) {
    return 1 / (1 + Math.exp(-z));
  }

  // Linear score w·x + b for a single sample.
  _score(row) {
    return row.reduce((acc, v, j) => acc + v * this.weights[j], this.bias);
  }

  // Train on X (number[][]) and binary 0/1 labels y.
  fit(X, y) {
    const m = X.length;
    const d = X[0].length;
    this.weights = new Array(d).fill(0);
    this.bias = 0;

    for (let step = 0; step < this.nIterations; step++) {
      // Forward pass: predicted probabilities.
      const probs = X.map(row => this.sigmoid(this._score(row)));

      // Gradient of the cross-entropy loss (mean form).
      const gradW = new Array(d).fill(0);
      let gradB = 0;
      for (let i = 0; i < m; i++) {
        const err = probs[i] - y[i];
        gradB += err;
        for (let j = 0; j < d; j++) {
          gradW[j] += err * X[i][j];
        }
      }

      // One gradient-descent update.
      this.bias -= this.learningRate * (gradB / m);
      for (let j = 0; j < d; j++) {
        this.weights[j] -= this.learningRate * (gradW[j] / m);
      }
    }
  }

  // Probability P(y=1|x) for each row of X.
  predict_proba(X) {
    return X.map(row => this.sigmoid(this._score(row)));
  }

  // Hard 0/1 labels using the given decision threshold.
  predict(X, threshold = 0.5) {
    return this.predict_proba(X).map(p => (p >= threshold ? 1 : 0));
  }
}
// 使用示例
const X = [[1, 2], [2, 3], [3, 4], [4, 5]];
const y = [0, 0, 1, 1];
const model = new LogisticRegression(0.1, 10000);
model.fit(X, y);
console.log('预测概率:', model.predict_proba([[2.5, 3.5]]));
console.log('预测类别:', model.predict([[2.5, 3.5]]));import numpy as np
class LogisticRegression:
    """Binary logistic regression trained with batch gradient descent
    on the cross-entropy loss.

    Public interface unchanged: fit / predict_proba / predict.
    """

    def __init__(self, learning_rate=0.01, n_iterations=1000):
        self.learning_rate = learning_rate  # gradient-descent step size
        self.n_iterations = n_iterations    # number of training iterations
        self.weights = None                 # weight vector, set by fit()
        self.bias = None                    # intercept term

    def sigmoid(self, z):
        """Numerically safe logistic function.

        FIX: np.exp(-z) overflows (RuntimeWarning) for large negative z.
        Clipping z to [-500, 500] avoids the overflow while leaving the
        result unchanged to double precision for any |z| < 500.
        """
        z = np.clip(z, -500, 500)
        return 1 / (1 + np.exp(-z))

    def fit(self, X, y):
        """Train on X of shape (n_samples, n_features) and 0/1 labels y."""
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0
        for _ in range(self.n_iterations):
            linear_model = np.dot(X, self.weights) + self.bias
            y_pred = self.sigmoid(linear_model)
            # Gradient of the cross-entropy loss (mean form).
            dw = (1 / n_samples) * np.dot(X.T, (y_pred - y))
            db = (1 / n_samples) * np.sum(y_pred - y)
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

    def predict_proba(self, X):
        """Return P(y=1|x) for each sample."""
        return self.sigmoid(np.dot(X, self.weights) + self.bias)

    def predict(self, X, threshold=0.5):
        """Return hard 0/1 labels using the given decision threshold."""
        return (self.predict_proba(X) >= threshold).astype(int)
# 使用示例
if __name__ == "__main__":
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
X, y = make_classification(n_samples=100, n_features=2, n_informative=2,
n_redundant=0, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LogisticRegression(learning_rate=0.1, n_iterations=10000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print(classification_report(y_test, y_pred))package main
import (
"fmt"
"math"
)
// LogisticRegression is a binary classifier trained with batch
// gradient descent on the cross-entropy loss.
type LogisticRegression struct {
	learningRate float64   // gradient-descent step size
	nIterations  int       // number of training iterations
	weights      []float64 // weight vector, set by Fit
	bias         float64   // intercept term
}

// NewLogisticRegression returns an untrained model.
func NewLogisticRegression(lr float64, iters int) *LogisticRegression {
	return &LogisticRegression{learningRate: lr, nIterations: iters}
}

// sigmoid maps any real z into (0, 1).
func (lr *LogisticRegression) sigmoid(z float64) float64 {
	return 1 / (1 + math.Exp(-z))
}

// score computes the linear part w·x + b for one sample.
func (lr *LogisticRegression) score(row []float64) float64 {
	s := lr.bias
	for j, v := range row {
		s += v * lr.weights[j]
	}
	return s
}

// Fit trains the model on X and 0/1 labels y.
func (lr *LogisticRegression) Fit(X [][]float64, y []float64) {
	m := len(X)
	d := len(X[0])
	lr.weights = make([]float64, d)
	lr.bias = 0

	for step := 0; step < lr.nIterations; step++ {
		// Forward pass: predicted probabilities.
		probs := make([]float64, m)
		for i := range X {
			probs[i] = lr.sigmoid(lr.score(X[i]))
		}

		// Gradient of the cross-entropy loss (mean form).
		gradW := make([]float64, d)
		gradB := 0.0
		for i := 0; i < m; i++ {
			diff := probs[i] - y[i]
			gradB += diff
			for j := 0; j < d; j++ {
				gradW[j] += diff * X[i][j]
			}
		}
		for j := range gradW {
			gradW[j] /= float64(m)
		}
		gradB /= float64(m)

		// One gradient-descent update.
		lr.bias -= lr.learningRate * gradB
		for j := range gradW {
			lr.weights[j] -= lr.learningRate * gradW[j]
		}
	}
}

// PredictProba returns P(y=1|x) for every row of X.
func (lr *LogisticRegression) PredictProba(X [][]float64) []float64 {
	probs := make([]float64, len(X))
	for i, row := range X {
		probs[i] = lr.sigmoid(lr.score(row))
	}
	return probs
}

// Predict returns hard 0/1 labels using the given decision threshold.
func (lr *LogisticRegression) Predict(X [][]float64, threshold float64) []int {
	labels := make([]int, len(X))
	for i, p := range lr.PredictProba(X) {
		if p >= threshold {
			labels[i] = 1
		}
	}
	return labels
}
// Example: train logistic regression on four 2-D points and classify
// a new point between the two classes.
func main() {
	X := [][]float64{{1, 2}, {2, 3}, {3, 4}, {4, 5}}
	y := []float64{0, 0, 1, 1}
	// Larger step size and many iterations so this tiny dataset converges.
	model := NewLogisticRegression(0.1, 10000)
	model.Fit(X, y)
	probs := model.PredictProba([][]float64{{2.5, 3.5}})
	preds := model.Predict([][]float64{{2.5, 3.5}}, 0.5)
	fmt.Printf("预测概率:%v\n", probs)
	fmt.Printf("预测类别:%v\n", preds)
}
🎯 算法选择指南
- 数据量小、特征少:逻辑回归、SVM
- 数据量大、特征多:随机森林、梯度提升树
- 需要可解释性:决策树、线性模型
- 文本分类:朴素贝叶斯、逻辑回归
- 特征间关系复杂:集成方法、神经网络
- 快速原型开发:逻辑回归(基线模型)
🔷 3. 支持向量机 (SVM)
算法原理
SVM 通过寻找最大间隔超平面来进行分类,对于线性不可分数据,使用核技巧映射到高维空间。
优化目标: min (1/2)||w||²
约束条件: yᵢ(wxᵢ + b) ≥ 1
核函数: K(xᵢ, xⱼ) = φ(xᵢ)·φ(xⱼ)
训练时间复杂度
O(n² × m)
预测时间复杂度
O(k × m)
空间复杂度
O(k)
✅ 优点
- 最大间隔,泛化能力强
- 核技巧处理非线性
- 高维空间表现好
- 避免局部最优
❌ 缺点
- 大规模数据训练慢
- 核函数选择敏感
- 多分类需要扩展
- 缺失值敏感
代码实现
// 简化的 SVM 实现(使用梯度下降)
// Simplified linear SVM trained with per-sample (stochastic)
// sub-gradient descent on the hinge loss plus L2 regularization.
// Decision function: sign(w·x + b); 0/1 labels are mapped to -1/+1.
// NOTE(review): unlike the Python/Go versions in this document (which
// use w·x - b and leave the bias unregularized), this variant uses
// w·x + b and also shrinks the bias when the margin is satisfied.
class SVM {
  constructor(learningRate = 0.001, lambdaParam = 0.01, nIterations = 1000) {
    this.learningRate = learningRate; // sub-gradient step size
    this.lambdaParam = lambdaParam;   // L2 regularization strength
    this.nIterations = nIterations;   // passes over the training set
    this.weights = null;              // weight vector, set by fit()
    this.bias = null;                 // intercept term
  }
  // Train on X (number[][]) and 0/1 labels y.
  fit(X, y) {
    const nSamples = X.length;
    const nFeatures = X[0].length;
    this.weights = new Array(nFeatures).fill(0);
    this.bias = 0;
    // Map labels {0, 1} -> {-1, +1} as required by the hinge loss.
    const yTransformed = y.map(label => label === 0 ? -1 : 1);
    for (let iter = 0; iter < this.nIterations; iter++) {
      for (let i = 0; i < nSamples; i++) {
        // Is the margin constraint yᵢ(w·xᵢ + b) >= 1 satisfied?
        const condition = yTransformed[i] * (this._dotProduct(X[i], this.weights) + this.bias) >= 1;
        if (condition) {
          // Constraint satisfied: only the L2 penalty acts.
          for (let j = 0; j < nFeatures; j++) {
            this.weights[j] -= this.learningRate * (2 * this.lambdaParam * this.weights[j]);
          }
          this.bias -= this.learningRate * 2 * this.lambdaParam * this.bias;
        } else {
          // Margin violated: hinge sub-gradient plus the L2 penalty.
          for (let j = 0; j < nFeatures; j++) {
            this.weights[j] -= this.learningRate * (2 * this.lambdaParam * this.weights[j] - yTransformed[i] * X[i][j]);
          }
          this.bias -= this.learningRate * (-yTransformed[i]);
        }
      }
    }
  }
  // Inner product of two equal-length arrays.
  _dotProduct(arr1, arr2) {
    return arr1.reduce((sum, val, i) => sum + val * arr2[i], 0);
  }
  // Hard 0/1 label for each row: 1 when w·x + b >= 0.
  predict(X) {
    return X.map(row => {
      const linearModel = this._dotProduct(row, this.weights) + this.bias;
      return linearModel >= 0 ? 1 : 0;
    });
  }
}
// 使用示例
const X = [[1, 2], [2, 3], [3, 3], [4, 5], [5, 5]];
const y = [0, 0, 0, 1, 1];
const model = new SVM(0.001, 0.01, 1000);
model.fit(X, y);
console.log('预测类别:', model.predict([[3, 4], [4, 4]]));import numpy as np
class SVM:
    """Simplified linear SVM trained with stochastic sub-gradient descent
    on the hinge loss with L2 regularization.

    Labels are given as 0/1 and mapped to -1/+1 internally; the decision
    function is sign(w·x - b).
    """

    def __init__(self, learning_rate=0.001, lambda_param=0.01, n_iterations=1000):
        self.learning_rate = learning_rate  # sub-gradient step size
        self.lambda_param = lambda_param    # L2 regularization strength
        self.n_iterations = n_iterations    # passes over the training set
        self.weights = None                 # weight vector, set by fit()
        self.bias = None                    # intercept term

    def fit(self, X, y):
        """Train on X of shape (n_samples, n_features) and 0/1 labels y."""
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0
        # Map labels {0, 1} -> {-1, +1} as required by the hinge loss.
        signs = np.where(y <= 0, -1, 1)
        for _ in range(self.n_iterations):
            for idx in range(n_samples):
                x_i, s = X[idx], signs[idx]
                margin_ok = s * (np.dot(x_i, self.weights) - self.bias) >= 1
                if margin_ok:
                    # Constraint satisfied: only the L2 penalty acts.
                    self.weights -= self.learning_rate * (2 * self.lambda_param * self.weights)
                else:
                    # Margin violated: hinge sub-gradient plus the L2 penalty.
                    self.weights -= self.learning_rate * (2 * self.lambda_param * self.weights - np.dot(x_i, s))
                    self.bias -= self.learning_rate * s

    def predict(self, X):
        """Return hard 0/1 labels for each row of X."""
        linear_output = np.dot(X, self.weights) - self.bias
        return np.where(linear_output >= 0, 1, 0)
# 使用示例
if __name__ == "__main__":
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
X, y = make_classification(n_samples=100, n_features=2, n_informative=2,
n_redundant=0, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = SVM(learning_rate=0.001, lambda_param=0.01, n_iterations=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")package main
import "fmt"
// SVM is a simplified linear support-vector machine trained with
// stochastic sub-gradient descent on the hinge loss with L2
// regularization. Labels are 0/1 and mapped to -1/+1 internally;
// the decision function is sign(w·x - b).
type SVM struct {
	learningRate float64   // sub-gradient step size
	lambdaParam  float64   // L2 regularization strength
	nIterations  int       // passes over the training set
	weights      []float64 // weight vector, set by Fit
	bias         float64   // intercept term
}

// NewSVM returns an untrained model.
func NewSVM(lr, lambda float64, iters int) *SVM {
	return &SVM{learningRate: lr, lambdaParam: lambda, nIterations: iters}
}

// dotProduct returns the inner product of two equal-length vectors.
func (svm *SVM) dotProduct(a, b []float64) float64 {
	total := 0.0
	for i, v := range a {
		total += v * b[i]
	}
	return total
}

// Fit trains the model on X and 0/1 labels y.
func (svm *SVM) Fit(X [][]float64, y []int) {
	m := len(X)
	d := len(X[0])
	svm.weights = make([]float64, d)
	svm.bias = 0

	// Map labels {0, 1} -> {-1, +1} as required by the hinge loss.
	signs := make([]float64, m)
	for i, label := range y {
		if label <= 0 {
			signs[i] = -1
		} else {
			signs[i] = 1
		}
	}

	for step := 0; step < svm.nIterations; step++ {
		for idx, row := range X {
			s := signs[idx]
			marginOK := s*(svm.dotProduct(row, svm.weights)-svm.bias) >= 1
			if marginOK {
				// Constraint satisfied: only the L2 penalty acts.
				for j := range svm.weights {
					svm.weights[j] -= svm.learningRate * (2 * svm.lambdaParam * svm.weights[j])
				}
			} else {
				// Margin violated: hinge sub-gradient plus the L2 penalty.
				for j := range svm.weights {
					svm.weights[j] -= svm.learningRate * (2*svm.lambdaParam*svm.weights[j] - s*row[j])
				}
				svm.bias -= svm.learningRate * s
			}
		}
	}
}

// Predict returns hard 0/1 labels for every row of X.
func (svm *SVM) Predict(X [][]float64) []int {
	labels := make([]int, len(X))
	for i, row := range X {
		if svm.dotProduct(row, svm.weights)-svm.bias >= 0 {
			labels[i] = 1
		}
	}
	return labels
}
func main() {
X := [][]float64{{1, 2}, {2, 3}, {3, 3}, {4, 5}, {5, 5}}
y := []int{0, 0, 0, 1, 1}
model := NewSVM(0.001, 0.01, 1000)
model.Fit(X, y)
preds := model.Predict([][]float64{{3, 4}, {4, 4}})
fmt.Printf("预测类别:%v\n", preds)
}🌳 4. 决策树 (Decision Tree)
算法原理
决策树通过递归地选择最优特征进行数据划分,构建树形结构。常用划分准则包括信息增益、信息增益比和基尼指数。
信息熵: H(D) = -Σ pᵢ log₂ pᵢ
信息增益: IG(D, A) = H(D) - H(D|A)
基尼指数: Gini(D) = 1 - Σ pᵢ²
训练时间复杂度
O(n × m × d)
预测时间复杂度
O(d)
空间复杂度
O(nodes)
✅ 优点
- 可解释性强,可视化直观
- 不需要特征缩放
- 能处理数值和类别特征
- 自动特征选择
❌ 缺点
- 容易过拟合
- 对数据变化敏感
- 可能不连续
- 偏向取值多的特征
代码实现
// A single node of the decision tree: either an internal split
// (feature/threshold/left/right) or a leaf (value).
class TreeNode {
  constructor(feature = null, threshold = null, left = null, right = null, value = null) {
    this.feature = feature;     // index of the feature this node splits on
    this.threshold = threshold; // go left when x[feature] <= threshold
    this.left = left;           // subtree for x[feature] <= threshold
    this.right = right;         // subtree for x[feature] > threshold
    this.value = value;         // predicted label (leaf nodes only)
  }
}

// CART-style classification tree using the Gini index.
class DecisionTree {
  constructor(maxDepth = 10, minSamplesSplit = 2) {
    this.maxDepth = maxDepth;               // maximum tree depth
    this.minSamplesSplit = minSamplesSplit; // minimum samples required to split
    this.root = null;
  }

  // Build the tree from X (number[][]) and labels y.
  fit(X, y) {
    this.nClasses = new Set(y).size;
    this.nFeatures = X[0].length;
    this.root = this._growTree(X, y);
  }

  _growTree(X, y, depth = 0) {
    const nSamples = X.length;
    const nLabels = new Set(y).size;
    // Stop: max depth reached, node is pure, or too few samples to split.
    if (depth >= this.maxDepth || nLabels === 1 || nSamples < this.minSamplesSplit) {
      // BUG FIX: the original wrote `new TreeNode(value = leafValue)`,
      // which in JS passes leafValue as the FIRST positional argument
      // (feature) and leaves node.value === null, so _traverseTree
      // failed on every leaf. Pass the value in the correct position.
      return new TreeNode(null, null, null, null, this._mostCommonLabel(y));
    }
    // Find the best split.
    const { featureIdx, threshold } = this._bestSplit(X, y);
    // Partition the data.
    const leftMask = X.map(row => row[featureIdx] <= threshold);
    const XLeft = X.filter((_, i) => leftMask[i]);
    const yLeft = y.filter((_, i) => leftMask[i]);
    const XRight = X.filter((_, i) => !leftMask[i]);
    const yRight = y.filter((_, i) => !leftMask[i]);
    // Recurse into both sides.
    const left = this._growTree(XLeft, yLeft, depth + 1);
    const right = this._growTree(XRight, yRight, depth + 1);
    return new TreeNode(featureIdx, threshold, left, right);
  }

  // Exhaustively search every (feature, threshold) pair for the split
  // with the highest Gini gain.
  _bestSplit(X, y) {
    let bestGain = -Infinity;
    let bestFeature = 0;
    let bestThreshold = 0;
    for (let featureIdx = 0; featureIdx < this.nFeatures; featureIdx++) {
      const thresholds = [...new Set(X.map(row => row[featureIdx]))];
      for (const threshold of thresholds) {
        const gain = this._informationGain(y, X, featureIdx, threshold);
        if (gain > bestGain) {
          bestGain = gain;
          bestFeature = featureIdx;
          bestThreshold = threshold;
        }
      }
    }
    return { featureIdx: bestFeature, threshold: bestThreshold };
  }

  // Gini gain: parent impurity minus the weighted child impurities.
  // (The original used an unnormalized variant and a dead parentGini
  // variable; this normalized form matches the Python implementation
  // and ranks candidate splits identically.)
  _informationGain(parentY, X, featureIdx, threshold) {
    const leftMask = X.map(row => row[featureIdx] <= threshold);
    const n = parentY.length;
    const nLeft = leftMask.filter(v => v).length;
    if (nLeft === 0 || nLeft === n) return 0; // degenerate split
    const yLeft = parentY.filter((_, i) => leftMask[i]);
    const yRight = parentY.filter((_, i) => !leftMask[i]);
    return this._giniIndex(parentY)
      - (nLeft / n) * this._giniIndex(yLeft)
      - ((n - nLeft) / n) * this._giniIndex(yRight);
  }

  // Gini impurity: 1 - sum of squared class proportions.
  _giniIndex(y) {
    const counts = {};
    y.forEach(label => counts[label] = (counts[label] || 0) + 1);
    return 1 - Object.values(counts).reduce((sum, count) => sum + Math.pow(count / y.length, 2), 0);
  }

  // Majority label of y. Note: object keys are strings, so the
  // returned label is a string (same as the original behavior).
  _mostCommonLabel(y) {
    const counts = {};
    y.forEach(label => counts[label] = (counts[label] || 0) + 1);
    return Object.entries(counts).sort((a, b) => b[1] - a[1])[0][0];
  }

  // Predict a label for every row of X.
  predict(X) {
    return X.map(row => this._traverseTree(row, this.root));
  }

  // Walk from the root to a leaf following the split conditions.
  _traverseTree(x, node) {
    if (node.value !== null) return node.value;
    if (x[node.feature] <= node.threshold) {
      return this._traverseTree(x, node.left);
    }
    return this._traverseTree(x, node.right);
  }
}
// 使用示例
const X = [[1, 2], [2, 3], [3, 4], [5, 6], [6, 7], [7, 8]];
const y = [0, 0, 0, 1, 1, 1];
const tree = new DecisionTree(maxDepth = 3);
tree.fit(X, y);
console.log('预测类别:', tree.predict([[4, 5], [6, 6]]));class TreeNode:
def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
self.feature = feature
self.threshold = threshold
self.left = left
self.right = right
self.value = value
class DecisionTree:
    """CART-style classification tree using the Gini index.

    Expects numpy arrays: X of shape (n_samples, n_features) and an
    integer label vector y. Relies on a module-level TreeNode class.
    """

    def __init__(self, max_depth=10, min_samples_split=2):
        self.max_depth = max_depth                  # maximum tree depth
        self.min_samples_split = min_samples_split  # min samples to split a node
        self.root = None

    def fit(self, X, y):
        """Build the tree from the training data."""
        self.n_classes = len(set(y))
        self.n_features = X.shape[1]
        self.root = self._grow_tree(X, y)

    def _grow_tree(self, X, y, depth=0):
        """Recursively grow the tree; returns the subtree's root node."""
        n_samples = len(X)
        n_labels = len(set(y))
        # Stop: max depth reached, node is pure, or too few samples.
        stop = (
            depth >= self.max_depth
            or n_labels == 1
            or n_samples < self.min_samples_split
        )
        if stop:
            return TreeNode(value=self._most_common_label(y))
        # Best split, then partition and recurse.
        feature_idx, threshold = self._best_split(X, y)
        left_mask = X[:, feature_idx] <= threshold
        left_child = self._grow_tree(X[left_mask], y[left_mask], depth + 1)
        right_child = self._grow_tree(X[~left_mask], y[~left_mask], depth + 1)
        return TreeNode(feature_idx, threshold, left_child, right_child)

    def _best_split(self, X, y):
        """Exhaustively search all (feature, threshold) pairs for the
        split with the highest Gini gain."""
        best = (-float('inf'), 0, 0)  # (gain, feature, threshold)
        for feature_idx in range(self.n_features):
            for threshold in set(X[:, feature_idx]):
                gain = self._information_gain(y, X, feature_idx, threshold)
                if gain > best[0]:
                    best = (gain, feature_idx, threshold)
        return best[1], best[2]

    def _information_gain(self, y, X, feature_idx, threshold):
        """Gini gain: parent impurity minus weighted child impurities."""
        left_mask = X[:, feature_idx] <= threshold
        y_left, y_right = y[left_mask], y[~left_mask]
        n, n_left = len(y), len(y_left)
        if n_left == 0 or n_left == n:  # degenerate split
            return 0
        weighted = (n_left / n) * self._gini_index(y_left) \
            + ((n - n_left) / n) * self._gini_index(y_right)
        return self._gini_index(y) - weighted

    def _gini_index(self, y):
        """Gini impurity: 1 - sum of squared class proportions."""
        counts = {}
        for label in y:
            counts[label] = counts.get(label, 0) + 1
        return 1 - sum((c / len(y)) ** 2 for c in counts.values())

    def _most_common_label(self, y):
        """Majority label of y."""
        counts = {}
        for label in y:
            counts[label] = counts.get(label, 0) + 1
        return max(counts, key=counts.get)

    def predict(self, X):
        """Predict a label for every row of X."""
        return [self._traverse_tree(x, self.root) for x in X]

    def _traverse_tree(self, x, node):
        """Follow split conditions from the root down to a leaf."""
        if node.value is not None:
            return node.value
        child = node.left if x[node.feature] <= node.threshold else node.right
        return self._traverse_tree(x, child)
# 使用示例
if __name__ == "__main__":
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
X, y = make_classification(n_samples=100, n_features=4, n_informative=4, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
tree = DecisionTree(max_depth=5)
tree.fit(X_train, y_train)
y_pred = tree.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")package main
import "fmt"
// TreeNode is one node of the decision tree: an internal split
// (feature/threshold/left/right) or a leaf (non-nil value).
type TreeNode struct {
	feature   int       // feature index this node splits on
	threshold float64   // go left when x[feature] <= threshold
	left      *TreeNode // subtree for x[feature] <= threshold
	right     *TreeNode // subtree for x[feature] > threshold
	value     *int      // predicted label; non-nil only for leaves
}

// DecisionTree is a CART-style classification tree using the Gini index.
type DecisionTree struct {
	maxDepth        int // maximum tree depth
	minSamplesSplit int // minimum samples required to split a node
	root            *TreeNode
	nFeatures       int
}

// NewDecisionTree returns an untrained tree.
func NewDecisionTree(maxDepth, minSamplesSplit int) *DecisionTree {
	return &DecisionTree{
		maxDepth:        maxDepth,
		minSamplesSplit: minSamplesSplit,
	}
}

// Fit builds the tree from the training data.
func (dt *DecisionTree) Fit(X [][]float64, y []int) {
	dt.nFeatures = len(X[0])
	dt.root = dt.growTree(X, y, 0)
}

// growTree recursively grows the tree and returns the subtree root.
func (dt *DecisionTree) growTree(X [][]float64, y []int, depth int) *TreeNode {
	nSamples := len(X)
	nLabels := len(dt.uniqueLabels(y))
	// Stop: max depth reached, node is pure, or too few samples.
	if depth >= dt.maxDepth || nLabels == 1 || nSamples < dt.minSamplesSplit {
		value := dt.mostCommonLabel(y)
		return &TreeNode{value: &value}
	}
	// Find the best split, then partition the data.
	featureIdx, threshold := dt.bestSplit(X, y)
	var XLeft, XRight [][]float64
	var yLeft, yRight []int
	for i, row := range X {
		if row[featureIdx] <= threshold {
			XLeft = append(XLeft, row)
			yLeft = append(yLeft, y[i])
		} else {
			XRight = append(XRight, row)
			yRight = append(yRight, y[i])
		}
	}
	// Recurse into both sides.
	left := dt.growTree(XLeft, yLeft, depth+1)
	right := dt.growTree(XRight, yRight, depth+1)
	return &TreeNode{feature: featureIdx, threshold: threshold, left: left, right: right}
}

// bestSplit exhaustively searches every (feature, threshold) pair for
// the split with the highest gain.
// BUG FIX: the original declared `parentGini := dt.giniIndex(y)` here
// without ever using it — a compile error in Go ("declared and not
// used"). The variable is removed.
func (dt *DecisionTree) bestSplit(X [][]float64, y []int) (int, float64) {
	bestGain := -1.0
	bestFeature := 0
	bestThreshold := 0.0
	for featureIdx := 0; featureIdx < dt.nFeatures; featureIdx++ {
		for _, threshold := range dt.uniqueThresholds(X, featureIdx) {
			gain := dt.informationGain(y, X, featureIdx, threshold)
			if gain > bestGain {
				bestGain = gain
				bestFeature = featureIdx
				bestThreshold = threshold
			}
		}
	}
	return bestFeature, bestThreshold
}

// informationGain returns the (unnormalized) Gini gain of a split:
// n*gini(parent) - nLeft*gini(left) - nRight*gini(right). The scaling
// by n does not change which split is chosen.
func (dt *DecisionTree) informationGain(y []int, X [][]float64, featureIdx int, threshold float64) float64 {
	var yLeft, yRight []int
	for i, row := range X {
		if row[featureIdx] <= threshold {
			yLeft = append(yLeft, y[i])
		} else {
			yRight = append(yRight, y[i])
		}
	}
	n := len(y)
	nLeft := len(yLeft)
	if nLeft == 0 || nLeft == n { // degenerate split
		return 0
	}
	return float64(n)*dt.giniIndex(y) - float64(nLeft)*dt.giniIndex(yLeft) - float64(n-nLeft)*dt.giniIndex(yRight)
}

// giniIndex returns the Gini impurity: 1 - sum of squared class proportions.
func (dt *DecisionTree) giniIndex(y []int) float64 {
	counts := make(map[int]int)
	for _, label := range y {
		counts[label]++
	}
	gini := 1.0
	for _, count := range counts {
		p := float64(count) / float64(len(y))
		gini -= p * p
	}
	return gini
}

// uniqueLabels returns the distinct labels in y (order unspecified).
func (dt *DecisionTree) uniqueLabels(y []int) []int {
	unique := make(map[int]bool)
	for _, label := range y {
		unique[label] = true
	}
	result := make([]int, 0, len(unique))
	for label := range unique {
		result = append(result, label)
	}
	return result
}

// uniqueThresholds returns the distinct values of one feature column
// (order unspecified: Go map iteration order is randomized).
func (dt *DecisionTree) uniqueThresholds(X [][]float64, featureIdx int) []float64 {
	unique := make(map[float64]bool)
	for _, row := range X {
		unique[row[featureIdx]] = true
	}
	result := make([]float64, 0, len(unique))
	for threshold := range unique {
		result = append(result, threshold)
	}
	return result
}

// mostCommonLabel returns the majority label of y.
func (dt *DecisionTree) mostCommonLabel(y []int) int {
	counts := make(map[int]int)
	for _, label := range y {
		counts[label]++
	}
	maxCount := 0
	maxLabel := y[0]
	for label, count := range counts {
		if count > maxCount {
			maxCount = count
			maxLabel = label
		}
	}
	return maxLabel
}

// Predict returns a label for every row of X.
func (dt *DecisionTree) Predict(X [][]float64) []int {
	predictions := make([]int, len(X))
	for i, row := range X {
		predictions[i] = dt.traverseTree(row, dt.root)
	}
	return predictions
}

// traverseTree follows split conditions from the root down to a leaf.
func (dt *DecisionTree) traverseTree(x []float64, node *TreeNode) int {
	if node.value != nil {
		return *node.value
	}
	if x[node.feature] <= node.threshold {
		return dt.traverseTree(x, node.left)
	}
	return dt.traverseTree(x, node.right)
}
func main() {
X := [][]float64{{1, 2}, {2, 3}, {3, 4}, {5, 6}, {6, 7}, {7, 8}}
y := []int{0, 0, 0, 1, 1, 1}
tree := NewDecisionTree(3, 2)
tree.Fit(X, y)
preds := tree.Predict([][]float64{{4, 5}, {6, 6}})
fmt.Printf("预测类别:%v\n", preds)
}🌲 5. 随机森林 (Random Forest)
算法原理
随机森林是集成学习算法,通过构建多棵决策树并综合它们的预测结果来提高准确性和稳定性。采用 Bagging 和随机特征选择。
Bagging: 有放回抽样构建训练集
随机特征: 每棵树随机选择 √m 个特征
投票机制: ŷ = mode{T₁(x), T₂(x), ..., Tₙ(x)}
训练时间复杂度
O(k × n × m × d)
预测时间复杂度
O(k × d)
空间复杂度
O(k × nodes)
✅ 优点
- 抗过拟合能力强
- 处理高维数据
- 可评估特征重要性
- 性能稳定准确
❌ 缺点
- 训练时间较长
- 可解释性较差
- 内存占用较大
- 小数据表现一般
代码实现
// Random forest classifier: an ensemble of decision trees trained on
// bootstrap samples, each split considering a random subset of features.
class RandomForest {
  constructor(nTrees = 100, maxDepth = 10, maxFeatures = null) {
    this.nTrees = nTrees;           // number of trees in the ensemble
    this.maxDepth = maxDepth;       // maximum depth of each tree
    this.maxFeatures = maxFeatures; // features tried per split (default sqrt(m))
    this.trees = [];
  }

  // Train nTrees trees, each on its own bootstrap sample.
  fit(X, y) {
    const nFeatures = X[0].length;
    // Default: sqrt(m) features per split, the usual classification heuristic.
    this.maxFeatures = this.maxFeatures || Math.floor(Math.sqrt(nFeatures));
    this.trees = [];
    for (let i = 0; i < this.nTrees; i++) {
      // Bootstrap sample (sampling with replacement).
      const { XSample, ySample } = this._bootstrapSample(X, y);
      this.trees.push(this._growTree(XSample, ySample, 0));
    }
  }

  // Draw |X| samples with replacement.
  _bootstrapSample(X, y) {
    const nSamples = X.length;
    const XSample = [];
    const ySample = [];
    for (let i = 0; i < nSamples; i++) {
      const idx = Math.floor(Math.random() * nSamples);
      XSample.push(X[idx]);
      ySample.push(y[idx]);
    }
    return { XSample, ySample };
  }

  // Recursively grow one decision tree; nodes are plain objects.
  _growTree(X, y, depth) {
    const nSamples = X.length;
    const nLabels = new Set(y).size;
    // Stop: max depth reached, node is pure, or too few samples.
    if (depth >= this.maxDepth || nLabels === 1 || nSamples < 2) {
      return { value: this._mostCommonLabel(y) };
    }
    const { featureIdx, threshold } = this._bestSplit(X, y);
    const leftMask = X.map(row => row[featureIdx] <= threshold);
    const XLeft = X.filter((_, i) => leftMask[i]);
    const yLeft = y.filter((_, i) => leftMask[i]);
    const XRight = X.filter((_, i) => !leftMask[i]);
    const yRight = y.filter((_, i) => !leftMask[i]);
    return {
      feature: featureIdx,
      threshold,
      left: this._growTree(XLeft, yLeft, depth + 1),
      right: this._growTree(XRight, yRight, depth + 1)
    };
  }

  _bestSplit(X, y) {
    let bestGain = -Infinity;
    let bestFeature = 0;
    let bestThreshold = 0;
    // BUG FIX: the original looped `while (featureIndices.length < nFeatures)`
    // with nFeatures = the TOTAL feature count, so every split considered
    // every feature and this.maxFeatures was ignored — the per-split
    // feature randomness that distinguishes a random forest from plain
    // bagging was lost. Sample at most this.maxFeatures distinct features.
    const totalFeatures = X[0].length;
    const nPick = Math.min(this.maxFeatures, totalFeatures);
    const featureIndices = [];
    while (featureIndices.length < nPick) {
      const idx = Math.floor(Math.random() * totalFeatures);
      if (!featureIndices.includes(idx)) featureIndices.push(idx);
    }
    for (const featureIdx of featureIndices) {
      const thresholds = [...new Set(X.map(row => row[featureIdx]))];
      for (const threshold of thresholds) {
        const gain = this._informationGain(y, X, featureIdx, threshold);
        if (gain > bestGain) {
          bestGain = gain;
          bestFeature = featureIdx;
          bestThreshold = threshold;
        }
      }
    }
    return { featureIdx: bestFeature, threshold: bestThreshold };
  }

  // Unnormalized Gini gain; the scaling does not change which split wins.
  _informationGain(parentY, X, featureIdx, threshold) {
    const leftMask = X.map(row => row[featureIdx] <= threshold);
    const n = parentY.length;
    const nLeft = leftMask.filter(v => v).length;
    if (nLeft === 0 || nLeft === n) return 0; // degenerate split
    const yLeft = parentY.filter((_, i) => leftMask[i]);
    const yRight = parentY.filter((_, i) => !leftMask[i]);
    return n - (nLeft * this._giniIndex(yLeft) + (n - nLeft) * this._giniIndex(yRight));
  }

  // Gini impurity: 1 - sum of squared class proportions.
  _giniIndex(y) {
    const counts = {};
    y.forEach(label => counts[label] = (counts[label] || 0) + 1);
    return 1 - Object.values(counts).reduce((sum, count) => sum + Math.pow(count / y.length, 2), 0);
  }

  // Majority label; object keys are strings, so the result is a string.
  _mostCommonLabel(y) {
    const counts = {};
    y.forEach(label => counts[label] = (counts[label] || 0) + 1);
    return Object.entries(counts).sort((a, b) => b[1] - a[1])[0][0];
  }

  // Each tree votes; the majority label wins.
  predict(X) {
    return X.map(row => {
      const treePreds = this.trees.map(tree => this._predictTree(row, tree));
      return this._mostCommonLabel(treePreds);
    });
  }

  // Walk from a tree's root down to a leaf.
  _predictTree(x, node) {
    if (node.value !== undefined) return node.value;
    if (x[node.feature] <= node.threshold) {
      return this._predictTree(x, node.left);
    }
    return this._predictTree(x, node.right);
  }
}
// 使用示例
const X = [[1, 2], [2, 3], [3, 4], [5, 6], [6, 7], [7, 8]];
const y = [0, 0, 0, 1, 1, 1];
const rf = new RandomForest(nTrees = 10);
rf.fit(X, y);
console.log('预测类别:', rf.predict([[4, 5], [6, 6]]));import numpy as np
from collections import Counter
class RandomForest:
    """Random forest classifier: an ensemble of decision trees trained on
    bootstrap samples, each split considering a random subset of features.

    Expects numpy arrays; predict returns a Python list of labels.
    NOTE(review): results depend on the global numpy RNG call sequence;
    seed np.random for reproducibility.
    """

    def __init__(self, n_trees=100, max_depth=10, max_features=None):
        self.n_trees = n_trees            # number of trees in the ensemble
        self.max_depth = max_depth        # maximum depth of each tree
        self.max_features = max_features  # features tried per split (default sqrt(m))
        self.trees = []

    def fit(self, X, y):
        """Train n_trees trees, each on its own bootstrap sample."""
        self.n_samples, self.n_features = X.shape
        # Default: sqrt(m) features per split, the usual classification heuristic.
        self.max_features = self.max_features or int(np.sqrt(self.n_features))
        self.trees = []
        for _ in range(self.n_trees):
            # Bootstrap sampling (with replacement).
            X_sample, y_sample = self._bootstrap_sample(X, y)
            # Grow one decision tree on the sample.
            tree = self._grow_tree(X_sample, y_sample, 0)
            self.trees.append(tree)

    def _bootstrap_sample(self, X, y):
        """Draw len(X) samples with replacement."""
        indices = np.random.choice(len(X), len(X), replace=True)
        return X[indices], y[indices]

    def _grow_tree(self, X, y, depth):
        """Recursively grow a tree; nodes are plain dicts, leaves have 'value'."""
        n_samples = len(X)
        n_labels = len(set(y))
        # Stop: max depth reached, node is pure, or too few samples to split.
        if depth >= self.max_depth or n_labels == 1 or n_samples < 2:
            return {'value': self._most_common_label(y)}
        feature_idx, threshold = self._best_split(X, y)
        left_mask = X[:, feature_idx] <= threshold
        X_left, y_left = X[left_mask], y[left_mask]
        X_right, y_right = X[~left_mask], y[~left_mask]
        return {
            'feature': feature_idx,
            'threshold': threshold,
            'left': self._grow_tree(X_left, y_left, depth + 1),
            'right': self._grow_tree(X_right, y_right, depth + 1)
        }

    def _best_split(self, X, y):
        """Best (feature, threshold) among a random feature subset."""
        best_gain = -float('inf')
        best_feature = 0
        best_threshold = 0
        # Random feature subset of size max_features (without replacement).
        feature_indices = np.random.choice(self.n_features, self.max_features, replace=False)
        for feature_idx in feature_indices:
            thresholds = set(X[:, feature_idx])
            for threshold in thresholds:
                gain = self._information_gain(y, X, feature_idx, threshold)
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature_idx
                    best_threshold = threshold
        return best_feature, best_threshold

    def _information_gain(self, y, X, feature_idx, threshold):
        """Gini gain: parent impurity minus weighted child impurities."""
        left_mask = X[:, feature_idx] <= threshold
        y_left = y[left_mask]
        y_right = y[~left_mask]
        n = len(y)
        n_left = len(y_left)
        # Degenerate split: one side empty.
        if n_left == 0 or n_left == n:
            return 0
        return self._gini_index(y) - (n_left / n * self._gini_index(y_left) + (n - n_left) / n * self._gini_index(y_right))

    def _gini_index(self, y):
        """Gini impurity: 1 - sum of squared class proportions."""
        counts = Counter(y)
        return 1 - sum((count / len(y)) ** 2 for count in counts.values())

    def _most_common_label(self, y):
        """Majority label of y."""
        return Counter(y).most_common(1)[0][0]

    def predict(self, X):
        """Each tree votes; the majority label wins per sample.

        The comprehension iterates samples in the outer loop and trees in
        the inner loop, so after reshape each row holds all tree votes
        for one sample.
        """
        predictions = [self._predict_tree(x, tree) for x in X for tree in self.trees]
        predictions = np.array(predictions).reshape(len(X), self.n_trees)
        return [Counter(row).most_common(1)[0][0] for row in predictions]

    def _predict_tree(self, x, node):
        """Walk from a tree's root dict down to a leaf."""
        if 'value' in node:
            return node['value']
        if x[node['feature']] <= node['threshold']:
            return self._predict_tree(x, node['left'])
        return self._predict_tree(x, node['right'])
# 使用示例
if __name__ == "__main__":
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
X, y = make_classification(n_samples=100, n_features=4, n_informative=4, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
rf = RandomForest(n_trees=100, max_depth=5)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")package main
import (
	"fmt"
	"math"
	"math/rand"
	"time"
)
// TreeNode is one node of a decision tree: internal nodes carry a split
// (feature index plus threshold) and two children; leaves carry only value.
type TreeNode struct {
feature int // index of the feature this node splits on
threshold float64 // samples with feature value <= threshold go left
left *TreeNode
right *TreeNode
value *int // non-nil marks a leaf holding the predicted label
}
// RandomForest is a bagging ensemble of decision trees with per-split
// random feature subsampling.
type RandomForest struct {
nTrees int // number of trees to grow
maxDepth int // depth limit for each tree
maxFeatures int // features sampled per split; set in Fit
nFeatures int // total feature count; set in Fit
trees []*TreeNode
}
// NewRandomForest returns an untrained forest with the given ensemble size
// and per-tree depth limit; maxFeatures and nFeatures are filled in by Fit.
func NewRandomForest(nTrees, maxDepth int) *RandomForest {
	rand.Seed(time.Now().UnixNano())
	rf := &RandomForest{nTrees: nTrees, maxDepth: maxDepth}
	return rf
}
// Fit grows nTrees trees, each on an independent bootstrap sample of (X, y).
func (rf *RandomForest) Fit(X [][]float64, y []int) {
	rf.nFeatures = len(X[0])
	// Classification heuristic: consider sqrt(m) features per split.
	rf.maxFeatures = int(math.Sqrt(float64(rf.nFeatures)))
	rf.trees = make([]*TreeNode, 0, rf.nTrees)
	for t := 0; t < rf.nTrees; t++ {
		sampleX, sampleY := rf.bootstrapSample(X, y)
		rf.trees = append(rf.trees, rf.growTree(sampleX, sampleY, 0))
	}
}
// bootstrapSample draws len(X) samples with replacement. Row slices are
// shared with X, not copied.
func (rf *RandomForest) bootstrapSample(X [][]float64, y []int) ([][]float64, []int) {
	n := len(X)
	outX := make([][]float64, n)
	outY := make([]int, n)
	for i := range outX {
		j := rand.Intn(n)
		outX[i], outY[i] = X[j], y[j]
	}
	return outX, outY
}
// growTree recursively builds a tree, stopping at the depth limit, on a
// pure node, on fewer than two samples, or on a degenerate split.
func (rf *RandomForest) growTree(X [][]float64, y []int, depth int) *TreeNode {
	nSamples := len(X)
	nLabels := len(rf.uniqueLabels(y))
	if depth >= rf.maxDepth || nLabels == 1 || nSamples < 2 {
		value := rf.mostCommonLabel(y)
		return &TreeNode{value: &value}
	}
	featureIdx, threshold := rf.bestSplit(X, y)
	var XLeft, XRight [][]float64
	var yLeft, yRight []int
	for i, row := range X {
		if row[featureIdx] <= threshold {
			XLeft = append(XLeft, row)
			yLeft = append(yLeft, y[i])
		} else {
			XRight = append(XRight, row)
			yRight = append(yRight, y[i])
		}
	}
	// Guard: when every sampled feature is constant, bestSplit can send all
	// samples to one side; recursing on the empty side would make
	// mostCommonLabel index y[0] of an empty slice and panic.
	if len(yLeft) == 0 || len(yRight) == 0 {
		value := rf.mostCommonLabel(y)
		return &TreeNode{value: &value}
	}
	return &TreeNode{
		feature:   featureIdx,
		threshold: threshold,
		left:      rf.growTree(XLeft, yLeft, depth+1),
		right:     rf.growTree(XRight, yRight, depth+1),
	}
}
// bestSplit scans a random subset of maxFeatures features, trying every
// observed value as a threshold, and returns the pair with the best gain.
func (rf *RandomForest) bestSplit(X [][]float64, y []int) (int, float64) {
	bestGain := -1.0
	bestFeature, bestThreshold := 0, 0.0
	// Rejection-sample distinct feature indices. The rand.Intn call pattern
	// is identical to a linear-scan dedup, so trained trees are unchanged.
	chosen := make([]int, 0, rf.maxFeatures)
	seen := make(map[int]bool, rf.maxFeatures)
	for len(chosen) < rf.maxFeatures {
		idx := rand.Intn(rf.nFeatures)
		if !seen[idx] {
			seen[idx] = true
			chosen = append(chosen, idx)
		}
	}
	for _, f := range chosen {
		for _, th := range rf.uniqueThresholds(X, f) {
			if g := rf.informationGain(y, X, f, th); g > bestGain {
				bestGain, bestFeature, bestThreshold = g, f, th
			}
		}
	}
	return bestFeature, bestThreshold
}
// informationGain returns the Gini impurity reduction of the split, scaled
// by len(y). The scale is constant across thresholds of one node, so the
// argmax taken in bestSplit is unaffected.
func (rf *RandomForest) informationGain(y []int, X [][]float64, featureIdx int, threshold float64) float64 {
	var yLeft, yRight []int
	for i, row := range X {
		if row[featureIdx] <= threshold {
			yLeft = append(yLeft, y[i])
		} else {
			yRight = append(yRight, y[i])
		}
	}
	n, nLeft := len(y), len(yLeft)
	if nLeft == 0 || nLeft == n {
		return 0 // degenerate split separates nothing
	}
	return float64(n)*rf.giniIndex(y) - float64(nLeft)*rf.giniIndex(yLeft) - float64(n-nLeft)*rf.giniIndex(yRight)
}
// giniIndex computes 1 - sum of squared class proportions for y.
func (rf *RandomForest) giniIndex(y []int) float64 {
	freq := make(map[int]int)
	for _, label := range y {
		freq[label]++
	}
	impurity := 1.0
	for _, c := range freq {
		p := float64(c) / float64(len(y))
		impurity -= p * p
	}
	return impurity
}
// uniqueLabels returns the distinct labels in y (order unspecified).
func (rf *RandomForest) uniqueLabels(y []int) []int {
	set := make(map[int]bool)
	for _, label := range y {
		set[label] = true
	}
	out := make([]int, 0, len(set))
	for label := range set {
		out = append(out, label)
	}
	return out
}
// uniqueThresholds returns the distinct values of one feature column
// (order unspecified).
func (rf *RandomForest) uniqueThresholds(X [][]float64, featureIdx int) []float64 {
	set := make(map[float64]bool)
	for _, row := range X {
		set[row[featureIdx]] = true
	}
	out := make([]float64, 0, len(set))
	for v := range set {
		out = append(out, v)
	}
	return out
}
// mostCommonLabel returns the modal label of y; ties are broken by map
// iteration order. Panics if y is empty.
func (rf *RandomForest) mostCommonLabel(y []int) int {
	freq := make(map[int]int)
	for _, label := range y {
		freq[label]++
	}
	best, bestCount := y[0], 0
	for label, c := range freq {
		if c > bestCount {
			best, bestCount = label, c
		}
	}
	return best
}
// Predict returns the majority vote of all trees for each row of X.
func (rf *RandomForest) Predict(X [][]float64) []int {
	out := make([]int, len(X))
	for i, x := range X {
		votes := make(map[int]int)
		for _, tree := range rf.trees {
			votes[rf.predictTree(x, tree)]++
		}
		best, bestVotes := 0, 0
		for label, c := range votes {
			if c > bestVotes {
				best, bestVotes = label, c
			}
		}
		out[i] = best
	}
	return out
}
// predictTree walks from node down to a leaf following the sample's
// feature values and returns the leaf label.
func (rf *RandomForest) predictTree(x []float64, node *TreeNode) int {
	for node.value == nil {
		if x[node.feature] <= node.threshold {
			node = node.left
		} else {
			node = node.right
		}
	}
	return *node.value
}
func main() {
X := [][]float64{{1, 2}, {2, 3}, {3, 4}, {5, 6}, {6, 7}, {7, 8}}
y := []int{0, 0, 0, 1, 1, 1}
rf := NewRandomForest(10, 3)
rf.Fit(X, y)
preds := rf.Predict([][]float64{{4, 5}, {6, 6}})
fmt.Printf("预测类别:%v\n", preds)
}👥 6. K 近邻 (K-Nearest Neighbors)
算法原理
KNN 是一种基于实例的懒惰学习算法。对于新样本,找到训练集中 K 个最近的邻居,通过多数投票或平均来确定预测值。
欧氏距离: d(x, y) = √Σ(xᵢ - yᵢ)²
曼哈顿距离: d(x, y) = Σ|xᵢ - yᵢ|
闵可夫斯基距离: d(x, y) = (Σ|xᵢ - yᵢ|^p)^(1/p)
曼哈顿距离: d(x, y) = Σ|xᵢ - yᵢ|
闵可夫斯基距离: d(x, y) = (Σ|xᵢ - yᵢ|^p)^(1/p)
训练时间复杂度
O(1)
预测时间复杂度
O(n × m)
空间复杂度
O(n × m)
✅ 优点
- 简单易懂,无需训练
- 天然支持多分类
- 对异常值不敏感
- 适合复杂决策边界
❌ 缺点
- 预测速度慢
- 需要特征缩放
- 高维效果差
- 样本不平衡敏感
代码实现
class KNN {
  /**
   * K-nearest-neighbors classifier with Euclidean distance.
   * @param {number} k - number of neighbors consulted per prediction
   */
  constructor(k = 3) {
    this.k = k;
    this.XTrain = [];
    this.yTrain = [];
  }

  // Lazy learner: "training" just memorizes the data.
  fit(X, y) {
    this.XTrain = X;
    this.yTrain = y;
  }

  predict(X) {
    return X.map(x => this._predictOne(x));
  }

  _predictOne(x) {
    // Distance from x to every training sample.
    const distances = this.XTrain.map((xTrain, i) => ({
      distance: this._euclideanDistance(x, xTrain),
      label: this.yTrain[i]
    }));
    // Keep the k nearest neighbors.
    distances.sort((a, b) => a.distance - b.distance);
    const kNearest = distances.slice(0, this.k);
    // Majority vote. A Map is used instead of a plain object so labels keep
    // their original type: plain-object keys coerce numbers to strings, so
    // the original returned "1" instead of 1.
    const votes = new Map();
    for (const { label } of kNearest) {
      votes.set(label, (votes.get(label) || 0) + 1);
    }
    let bestLabel = null;
    let bestCount = -1;
    for (const [label, count] of votes) {
      if (count > bestCount) {
        bestCount = count;
        bestLabel = label;
      }
    }
    return bestLabel;
  }

  _euclideanDistance(x1, x2) {
    return Math.sqrt(x1.reduce((sum, v, i) => sum + (v - x2[i]) ** 2, 0));
  }
}
// 使用示例
const X = [[1, 2], [2, 3], [3, 4], [6, 7], [7, 8], [8, 9]];
const y = [0, 0, 0, 1, 1, 1];
const knn = new KNN(k = 3);
knn.fit(X, y);
console.log('预测类别:', knn.predict([[4, 5], [5, 6]]));import numpy as np
from collections import Counter
class KNN:
    """K-nearest-neighbors classifier: Euclidean distance + majority vote."""

    def __init__(self, k=3):
        self.k = k
        self.XTrain = None
        self.yTrain = None

    def fit(self, X, y):
        # Lazy learner: just memorize the training data.
        self.XTrain = np.array(X)
        self.yTrain = np.array(y)

    def predict(self, X):
        return [self._predict_one(x) for x in X]

    def _predict_one(self, x):
        # Distance from x to every stored sample.
        dists = [self._euclidean_distance(x, row) for row in self.XTrain]
        # Labels of the k closest samples; argsort is stable so ties keep
        # training order, which Counter's insertion-order tie-break mirrors.
        nearest = self.yTrain[np.argsort(dists)[:self.k]]
        return Counter(nearest).most_common(1)[0][0]

    def _euclidean_distance(self, x1, x2):
        diff = np.array(x1) - np.array(x2)
        return np.sqrt(np.sum(diff ** 2))
# 使用示例
if __name__ == "__main__":
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
X, y = make_classification(n_samples=100, n_features=4, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
knn = KNN(k=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")package main
import (
"fmt"
"math"
"sort"
)
// KNN is a k-nearest-neighbors classifier; it memorizes the training set
// and classifies by majority vote among the k closest samples.
type KNN struct {
k int // number of neighbors consulted per prediction
XTrain [][]float64 // stored training features
yTrain []int // stored training labels
}
// NewKNN returns a classifier that votes among the k nearest neighbors.
func NewKNN(k int) *KNN {
	c := &KNN{k: k}
	return c
}
// Fit memorizes the training set. The slices are retained, not copied, so
// callers must not mutate them afterwards.
func (knn *KNN) Fit(X [][]float64, y []int) {
	knn.XTrain, knn.yTrain = X, y
}
// Predict classifies every row of X independently.
func (knn *KNN) Predict(X [][]float64) []int {
	out := make([]int, len(X))
	for i := range X {
		out[i] = knn.predictOne(X[i])
	}
	return out
}
// predictOne returns the majority label among the k training samples
// closest to x in Euclidean distance.
func (knn *KNN) predictOne(x []float64) int {
	type scored struct {
		distance float64
		label    int
	}
	// Score every training sample by its distance to x.
	candidates := make([]scored, len(knn.XTrain))
	for i, row := range knn.XTrain {
		candidates[i] = scored{knn.euclideanDistance(x, row), knn.yTrain[i]}
	}
	// Nearest first (sort.Slice is unstable; equal-distance order is arbitrary).
	sort.Slice(candidates, func(i, j int) bool {
		return candidates[i].distance < candidates[j].distance
	})
	// Tally the k nearest (fewer if the training set is smaller than k).
	votes := make(map[int]int)
	for i := 0; i < knn.k && i < len(candidates); i++ {
		votes[candidates[i].label]++
	}
	best, bestVotes := 0, 0
	for label, n := range votes {
		if n > bestVotes {
			best, bestVotes = label, n
		}
	}
	return best
}
// euclideanDistance computes sqrt of the summed squared differences;
// x1 and x2 must have equal length.
func (knn *KNN) euclideanDistance(x1, x2 []float64) float64 {
	var total float64
	for i, v := range x1 {
		d := v - x2[i]
		total += d * d
	}
	return math.Sqrt(total)
}
func main() {
X := [][]float64{{1, 2}, {2, 3}, {3, 4}, {6, 7}, {7, 8}, {8, 9}}
y := []int{0, 0, 0, 1, 1, 1}
knn := NewKNN(3)
knn.Fit(X, y)
preds := knn.Predict([][]float64{{4, 5}, {5, 6}})
fmt.Printf("预测类别:%v\n", preds)
}📧 7. 朴素贝叶斯 (Naive Bayes)
算法原理
朴素贝叶斯基于贝叶斯定理,假设特征之间相互独立。通过计算后验概率来进行分类,特别适合文本分类任务。
贝叶斯定理: P(Y|X) = P(X|Y) × P(Y) / P(X)
朴素假设: P(X|Y) = Π P(xᵢ|Y)
分类决策: ŷ = argmax P(Y) × Π P(xᵢ|Y)
朴素假设: P(X|Y) = Π P(xᵢ|Y)
分类决策: ŷ = argmax P(Y) × Π P(xᵢ|Y)
训练时间复杂度
O(n × m)
预测时间复杂度
O(k × m)
空间复杂度
O(k × m)
✅ 优点
- 训练预测都很快
- 小样本表现好
- 支持多分类
- 文本分类效果好
❌ 缺点
- 特征独立假设强
- 需要估计先验概率
- 对输入数据的分布假设敏感(特征明显偏离高斯或独立假设时效果下降)
- 零概率问题
代码实现
class NaiveBayes {
  /**
   * Gaussian naive Bayes classifier: assumes features are conditionally
   * independent and normally distributed within each class.
   */
  constructor() {
    this.classes = [];
    this.mean = {};   // per-class feature means
    this.var = {};    // per-class feature variances
    this.priors = {}; // per-class prior probabilities
  }

  fit(X, y) {
    const nSamples = X.length;
    const nFeatures = X[0].length;
    this.classes = [...new Set(y)];
    for (const cls of this.classes) {
      const XCls = X.filter((_, i) => y[i] === cls);
      const prior = XCls.length / nSamples;
      // Per-feature mean.
      const mean = new Array(nFeatures).fill(0);
      for (const row of XCls) {
        for (let j = 0; j < nFeatures; j++) {
          mean[j] += row[j];
        }
      }
      for (let j = 0; j < nFeatures; j++) {
        mean[j] /= XCls.length;
      }
      // Per-feature variance, floored at 1e-9: a constant feature would
      // otherwise yield zero variance and NaN/-Infinity posteriors. This
      // matches the Python and Go implementations.
      const variance = new Array(nFeatures).fill(0);
      for (const row of XCls) {
        for (let j = 0; j < nFeatures; j++) {
          variance[j] += Math.pow(row[j] - mean[j], 2);
        }
      }
      for (let j = 0; j < nFeatures; j++) {
        variance[j] = variance[j] / XCls.length + 1e-9;
      }
      this.mean[cls] = mean;
      this.var[cls] = variance;
      this.priors[cls] = prior;
    }
  }

  predict(X) {
    return X.map(x => {
      let maxPosterior = -Infinity;
      let predictedClass = this.classes[0];
      for (const cls of this.classes) {
        const posterior = this._calculatePosterior(x, cls);
        if (posterior > maxPosterior) {
          maxPosterior = posterior;
          predictedClass = cls;
        }
      }
      return predictedClass;
    });
  }

  // Log prior plus log likelihood under the per-class Gaussian model;
  // log space avoids underflow from multiplying many small densities.
  _calculatePosterior(x, cls) {
    let logPosterior = Math.log(this.priors[cls]);
    for (let j = 0; j < x.length; j++) {
      const mean = this.mean[cls][j];
      const variance = this.var[cls][j];
      logPosterior -= 0.5 * Math.log(2 * Math.PI * variance);
      logPosterior -= Math.pow(x[j] - mean, 2) / (2 * variance);
    }
    return logPosterior;
  }
}
// 使用示例
const X = [[1.0, 2.0], [2.0, 3.0], [3.0, 4.0], [6.0, 7.0], [7.0, 8.0], [8.0, 9.0]];
const y = [0, 0, 0, 1, 1, 1];
const nb = new NaiveBayes();
nb.fit(X, y);
console.log('预测类别:', nb.predict([[4.0, 5.0], [5.0, 6.0]]));import numpy as np
class NaiveBayes:
    """Gaussian naive Bayes: per-class Gaussian likelihoods over independent
    features, classified by the maximum log posterior."""

    def __init__(self):
        self.classes = None
        self.mean = {}
        self.var = {}
        self.priors = {}

    def fit(self, X, y):
        self.classes = np.unique(y)
        nSamples, nFeatures = X.shape
        for cls in self.classes:
            rows = X[y == cls]
            self.priors[cls] = len(rows) / nSamples
            self.mean[cls] = np.mean(rows, axis=0)
            # Small floor keeps a constant feature from producing zero
            # variance and a division by zero in the density.
            self.var[cls] = np.var(rows, axis=0) + 1e-9

    def predict(self, X):
        return [self._predict_one(x) for x in X]

    def _predict_one(self, x):
        # Log space turns the product of densities into a stable sum.
        scores = [
            np.log(self.priors[cls])
            + np.sum(np.log(self._gaussian_pdf(x, self.mean[cls], self.var[cls])))
            for cls in self.classes
        ]
        return self.classes[np.argmax(scores)]

    def _gaussian_pdf(self, x, mean, var):
        return (1 / np.sqrt(2 * np.pi * var)) * np.exp(-((x - mean) ** 2) / (2 * var))
# 使用示例
if __name__ == "__main__":
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
X, y = make_classification(n_samples=100, n_features=4, n_informative=4, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
nb = NaiveBayes()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")package main
import (
"fmt"
"math"
)
// NaiveBayes is a Gaussian naive Bayes classifier. The per-class variance
// field is named variance because the original name "var" is a reserved Go
// keyword and does not compile.
type NaiveBayes struct {
	classes  []int
	mean     map[int][]float64 // per-class feature means
	variance map[int][]float64 // per-class feature variances
	priors   map[int]float64   // per-class prior probabilities
}

// NewNaiveBayes returns an untrained classifier with its maps initialized.
func NewNaiveBayes() *NaiveBayes {
	return &NaiveBayes{
		mean:     make(map[int][]float64),
		variance: make(map[int][]float64),
		priors:   make(map[int]float64),
	}
}
func (nb *NaiveBayes) Fit(X [][]float64, y []int) {
nSamples := len(X)
nFeatures := len(X[0])
// 获取唯一类别
classSet := make(map[int]bool)
for _, label := range y {
classSet[label] = true
}
nb.classes = make([]int, 0, len(classSet))
for cls := range classSet {
nb.classes = append(nb.classes, cls)
}
for _, cls := range nb.classes {
var XCls [][]float64
for i, label := range y {
if label == cls {
XCls = append(XCls, X[i])
}
}
nb.priors[cls] = float64(len(XCls)) / float64(nSamples)
// 计算均值
mean := make([]float64, nFeatures)
for _, row := range XCls {
for j, val := range row {
mean[j] += val
}
}
for j := range mean {
mean[j] /= float64(len(XCls))
}
nb.mean[cls] = mean
// 计算方差
variance := make([]float64, nFeatures)
for _, row := range XCls {
for j, val := range row {
variance[j] += (val - mean[j]) * (val - mean[j])
}
}
for j := range variance {
variance[j] /= float64(len(XCls))
variance[j] += 1e-9 // 防止除零
}
nb.var[cls] = variance
}
}
// Predict classifies each row of X by maximum log posterior.
func (nb *NaiveBayes) Predict(X [][]float64) []int {
	out := make([]int, len(X))
	for i := range X {
		out[i] = nb.predictOne(X[i])
	}
	return out
}
func (nb *NaiveBayes) predictOne(x []float64) int {
maxPosterior := -math.MaxFloat64
predictedClass := nb.classes[0]
for _, cls := range nb.classes {
posterior := math.Log(nb.priors[cls])
for j := range x {
posterior += math.Log(nb.gaussianPdf(x[j], nb.mean[cls][j], nb.var[cls][j]))
}
if posterior > maxPosterior {
maxPosterior = posterior
predictedClass = cls
}
}
return predictedClass
}
// gaussianPdf evaluates the normal density N(mean, variance) at x.
func (nb *NaiveBayes) gaussianPdf(x, mean, variance float64) float64 {
	d := x - mean
	// d*d instead of math.Pow(d, 2): cheaper and at least as accurate.
	return (1 / math.Sqrt(2*math.Pi*variance)) * math.Exp(-d*d/(2*variance))
}
// Usage example: two clearly separated clusters.
func main() {
	features := [][]float64{{1.0, 2.0}, {2.0, 3.0}, {3.0, 4.0}, {6.0, 7.0}, {7.0, 8.0}, {8.0, 9.0}}
	labels := []int{0, 0, 0, 1, 1, 1}
	model := NewNaiveBayes()
	model.Fit(features, labels)
	got := model.Predict([][]float64{{4.0, 5.0}, {5.0, 6.0}})
	fmt.Printf("预测类别:%v\n", got)
}