🎯 监督学习算法

从标记数据中学习映射关系,预测未知数据的标签或值

📈 线性回归

建立变量间线性关系,预测连续值。最简单但强大的回归方法。

📊 逻辑回归

虽然名字叫回归,实则是分类算法。二分类问题的首选基线模型。

🔷 支持向量机

寻找最大间隔超平面,核技巧处理非线性。小样本表现优异。

🌳 决策树

模拟人类决策过程,可解释性强。易过拟合,常用作集成学习基学习器。

🌲 随机森林

多棵树集体决策,Bagging 代表。抗过拟合,性能稳定。

👥 K 近邻

物以类聚,人以群分。懒惰学习,训练快预测慢。

📧 朴素贝叶斯

基于贝叶斯定理,假设特征独立。文本分类经典算法。

💡 监督学习核心要素
📥 输入数据

特征向量 X = (x₁, x₂, ..., xₙ)

📤 输出标签

分类:离散类别 | 回归:连续值

🎯 学习目标

找到映射函数 f: X → Y

📊 训练过程

最小化损失函数,优化模型参数

📈 1. 线性回归 (Linear Regression)

算法原理

线性回归用于建立自变量和因变量之间的线性关系模型,是最基础的回归算法。

模型假设: y = wx + b + ε

损失函数(MSE): L = (1/n) Σ(yᵢ - ŷᵢ)²

正规方程解: w = (XᵀX)⁻¹Xᵀy
训练时间复杂度
O(n × m × k)(n 为样本数,m 为特征数,k 为迭代次数)
预测时间复杂度
O(m)
空间复杂度
O(m)

✅ 优点

  • 计算简单,易于理解
  • 可解释性强
  • 训练速度快
  • 有闭式解

❌ 缺点

  • 只能捕捉线性关系
  • 对异常值敏感
  • 需要特征标准化
  • 多重共线性问题

代码实现

// Linear regression fitted with full-batch gradient descent on the MSE loss.
// Interface preserved: constructor(learningRate, nIterations), fit(X, y), predict(X).
class LinearRegression {
    constructor(learningRate = 0.01, nIterations = 1000) {
        this.learningRate = learningRate;
        this.nIterations = nIterations;
        this.weights = null;
        this.bias = null;
    }

    // Train on X (array of feature rows) and y (array of targets).
    fit(X, y) {
        const m = X.length;        // number of samples
        const d = X[0].length;     // number of features

        this.weights = new Array(d).fill(0);
        this.bias = 0;

        for (let step = 0; step < this.nIterations; step++) {
            // Forward pass: linear prediction for every sample.
            const preds = X.map(row =>
                row.reduce((acc, v, j) => acc + v * this.weights[j], this.bias)
            );

            // Accumulate gradients of the MSE loss.
            const gradW = new Array(d).fill(0);
            let gradB = 0;
            preds.forEach((p, i) => {
                const residual = p - y[i];
                gradB += residual;
                X[i].forEach((v, j) => { gradW[j] += residual * v; });
            });

            // Average over the batch, then take one gradient step.
            gradB /= m;
            for (let j = 0; j < d; j++) {
                gradW[j] /= m;
            }

            this.bias -= this.learningRate * gradB;
            for (let j = 0; j < d; j++) {
                this.weights[j] -= this.learningRate * gradW[j];
            }
        }
    }

    // Continuous prediction w·x + b for each row of X.
    predict(X) {
        return X.map(row =>
            row.reduce((acc, v, j) => acc + v * this.weights[j], this.bias)
        );
    }
}

// Usage example
const X = [[1], [2], [3], [4], [5]];
const y = [2, 4, 5, 4, 5];

const model = new LinearRegression(0.01, 1000);
model.fit(X, y);

const predictions = model.predict([[6], [7]]);
console.log('Predictions:', predictions);
import numpy as np

class LinearRegression:
    """Ordinary least squares fitted with full-batch gradient descent.

    Minimizes MSE = (1/n) * sum((y_hat - y)^2) over `n_iterations`
    updates with step size `learning_rate`.
    """

    def __init__(self, learning_rate=0.01, n_iterations=1000):
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        self.weights = None  # (n_features,) coefficients, set by fit()
        self.bias = None     # scalar intercept, set by fit()

    def fit(self, X, y):
        """Estimate weights/bias from X (n_samples, n_features) and targets y."""
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0

        for _ in range(self.n_iterations):
            residual = X @ self.weights + self.bias - y
            # Batch-averaged gradient of the MSE loss.
            self.weights -= self.learning_rate * (X.T @ residual) / n_samples
            self.bias -= self.learning_rate * residual.sum() / n_samples

    def predict(self, X):
        """Return the continuous prediction X @ w + b."""
        return X @ self.weights + self.bias

# Usage example
if __name__ == "__main__":
    from sklearn.datasets import make_regression
    from sklearn.metrics import mean_squared_error, r2_score
    from sklearn.model_selection import train_test_split

    # Synthetic single-feature regression problem with Gaussian noise.
    X, y = make_regression(n_samples=100, n_features=1, noise=10, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    model = LinearRegression(learning_rate=0.01, n_iterations=1000)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Report fit quality on the held-out split.
    print(f"MSE: {mean_squared_error(y_test, y_pred):.2f}")
    print(f"R² Score: {r2_score(y_test, y_pred):.2f}")
package main

// BUG FIX: the original file imported "math/rand" without using it, which
// is a compile error in Go. The unused import has been removed.
import "fmt"

// LinearRegression fits y ≈ w·x + b by full-batch gradient descent on the
// mean-squared-error loss.
type LinearRegression struct {
    learningRate float64
    nIterations  int
    weights      []float64
    bias         float64
}

// NewLinearRegression returns an untrained model with the given learning
// rate and number of gradient-descent iterations.
func NewLinearRegression(lr float64, iters int) *LinearRegression {
    return &LinearRegression{
        learningRate: lr,
        nIterations:  iters,
    }
}

// Fit estimates weights and bias from samples X and targets y.
func (lr *LinearRegression) Fit(X [][]float64, y []float64) {
    nSamples := len(X)
    nFeatures := len(X[0])

    lr.weights = make([]float64, nFeatures)
    lr.bias = 0

    for iter := 0; iter < lr.nIterations; iter++ {
        // Forward pass: current predictions.
        yPred := make([]float64, nSamples)
        for i := 0; i < nSamples; i++ {
            sum := lr.bias
            for j := 0; j < nFeatures; j++ {
                sum += X[i][j] * lr.weights[j]
            }
            yPred[i] = sum
        }

        // Accumulate MSE gradients. `residual` replaces the original name
        // `error`, which shadowed Go's builtin error type.
        dw := make([]float64, nFeatures)
        var db float64
        for i := 0; i < nSamples; i++ {
            residual := yPred[i] - y[i]
            db += residual
            for j := 0; j < nFeatures; j++ {
                dw[j] += residual * X[i][j]
            }
        }

        // Average over the batch, then take one gradient step.
        for j := 0; j < nFeatures; j++ {
            dw[j] /= float64(nSamples)
        }
        db /= float64(nSamples)

        lr.bias -= lr.learningRate * db
        for j := 0; j < nFeatures; j++ {
            lr.weights[j] -= lr.learningRate * dw[j]
        }
    }
}

// Predict returns the linear prediction for each row of X.
func (lr *LinearRegression) Predict(X [][]float64) []float64 {
    predictions := make([]float64, len(X))
    for i, row := range X {
        sum := lr.bias
        for j, val := range row {
            sum += val * lr.weights[j]
        }
        predictions[i] = sum
    }
    return predictions
}

func main() {
    X := [][]float64{{1}, {2}, {3}, {4}, {5}}
    y := []float64{2, 4, 5, 4, 5}

    model := NewLinearRegression(0.01, 1000)
    model.Fit(X, y)

    preds := model.Predict([][]float64{{6}, {7}})
    fmt.Printf("Predictions: %v\n", preds)
}

📊 2. 逻辑回归 (Logistic Regression)

算法原理

逻辑回归用于二分类问题,使用 Sigmoid 函数将线性组合映射到 [0,1] 区间,输出属于某类的概率。

Sigmoid 函数: σ(z) = 1 / (1 + e⁻ᶻ)

预测概率: P(y=1|x) = σ(wx + b)

损失函数: L = -Σ[yᵢlog(ŷᵢ) + (1-yᵢ)log(1-ŷᵢ)]
训练时间复杂度
O(n × m × k)
预测时间复杂度
O(m)
空间复杂度
O(m)

✅ 优点

  • 输出概率,可解释性强
  • 计算代价低
  • 不易过拟合
  • 支持在线学习

❌ 缺点

  • 只能处理线性边界
  • 对特征相关性敏感
  • 特征不平衡时效果差
  • 需要特征缩放

代码实现

// Logistic regression: sigmoid over a linear model, trained with full-batch
// gradient descent on the cross-entropy loss.
// Interface preserved: sigmoid, fit, predict_proba, predict(X, threshold).
class LogisticRegression {
    constructor(learningRate = 0.01, nIterations = 1000) {
        this.learningRate = learningRate;
        this.nIterations = nIterations;
        this.weights = null;
        this.bias = null;
    }

    // Logistic function mapping any real z into (0, 1).
    sigmoid(z) {
        return 1 / (1 + Math.exp(-z));
    }

    fit(X, y) {
        const m = X.length;
        const d = X[0].length;

        this.weights = new Array(d).fill(0);
        this.bias = 0;

        for (let step = 0; step < this.nIterations; step++) {
            // Forward pass: P(y=1|x) for every sample.
            const probs = X.map(row =>
                this.sigmoid(row.reduce((acc, v, j) => acc + v * this.weights[j], this.bias))
            );

            // Cross-entropy gradients (same algebraic form as linear regression).
            const gradW = new Array(d).fill(0);
            let gradB = 0;
            probs.forEach((p, i) => {
                const residual = p - y[i];
                gradB += residual;
                X[i].forEach((v, j) => { gradW[j] += residual * v; });
            });

            // Average over the batch, then take one gradient step.
            gradB /= m;
            for (let j = 0; j < d; j++) {
                gradW[j] /= m;
            }

            this.bias -= this.learningRate * gradB;
            for (let j = 0; j < d; j++) {
                this.weights[j] -= this.learningRate * gradW[j];
            }
        }
    }

    // Probability of the positive class for each row of X.
    predict_proba(X) {
        return X.map(row =>
            this.sigmoid(row.reduce((acc, v, j) => acc + v * this.weights[j], this.bias))
        );
    }

    // Hard 0/1 labels by thresholding the probabilities.
    predict(X, threshold = 0.5) {
        return this.predict_proba(X).map(p => (p >= threshold ? 1 : 0));
    }
}

// Usage example
const X = [[1, 2], [2, 3], [3, 4], [4, 5]];
const y = [0, 0, 1, 1];

const model = new LogisticRegression(0.1, 10000);
model.fit(X, y);

console.log('预测概率:', model.predict_proba([[2.5, 3.5]]));
console.log('预测类别:', model.predict([[2.5, 3.5]]));
import numpy as np

class LogisticRegression:
    """Binary classifier: sigmoid over a linear model, trained with
    full-batch gradient descent on the cross-entropy loss."""

    def __init__(self, learning_rate=0.01, n_iterations=1000):
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        self.weights = None  # set by fit()
        self.bias = None     # set by fit()

    def sigmoid(self, z):
        """Map any real z into the open interval (0, 1)."""
        return 1 / (1 + np.exp(-z))

    def fit(self, X, y):
        """Fit on X (n_samples, n_features) and binary labels y in {0, 1}."""
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0

        for _ in range(self.n_iterations):
            probs = self.sigmoid(X @ self.weights + self.bias)
            residual = probs - y
            # The cross-entropy gradient has the same form as the MSE gradient.
            self.weights -= self.learning_rate * (X.T @ residual) / n_samples
            self.bias -= self.learning_rate * residual.sum() / n_samples

    def predict_proba(self, X):
        """Return P(y=1|x) for each row of X."""
        return self.sigmoid(X @ self.weights + self.bias)

    def predict(self, X, threshold=0.5):
        """Return hard 0/1 labels by thresholding predict_proba."""
        return (self.predict_proba(X) >= threshold).astype(int)

# Usage example
if __name__ == "__main__":
    from sklearn.datasets import make_classification
    from sklearn.metrics import accuracy_score, classification_report
    from sklearn.model_selection import train_test_split

    # Two informative features, no redundant ones.
    X, y = make_classification(n_samples=100, n_features=2, n_informative=2,
                               n_redundant=0, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    model = LogisticRegression(learning_rate=0.1, n_iterations=10000)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
    print(classification_report(y_test, y_pred))
package main

import (
    "fmt"
    "math"
)

// LogisticRegression is a binary classifier: sigmoid over a linear model,
// trained with full-batch gradient descent on the cross-entropy loss.
type LogisticRegression struct {
    learningRate float64
    nIterations  int
    weights      []float64
    bias         float64
}

// NewLogisticRegression returns an untrained model with the given learning
// rate and iteration count.
func NewLogisticRegression(lr float64, iters int) *LogisticRegression {
    return &LogisticRegression{
        learningRate: lr,
        nIterations:  iters,
    }
}

// sigmoid maps any real z into the open interval (0, 1).
func (lr *LogisticRegression) sigmoid(z float64) float64 {
    return 1 / (1 + math.Exp(-z))
}

// Fit estimates weights and bias from samples X and {0, 1} labels y.
func (lr *LogisticRegression) Fit(X [][]float64, y []float64) {
    nSamples := len(X)
    nFeatures := len(X[0])

    lr.weights = make([]float64, nFeatures)
    lr.bias = 0

    for iter := 0; iter < lr.nIterations; iter++ {
        // Forward pass: predicted probability for each sample.
        yPred := make([]float64, nSamples)
        for i := 0; i < nSamples; i++ {
            sum := lr.bias
            for j := 0; j < nFeatures; j++ {
                sum += X[i][j] * lr.weights[j]
            }
            yPred[i] = lr.sigmoid(sum)
        }

        // Cross-entropy gradients. `residual` replaces the original name
        // `error`, which shadowed Go's builtin error type.
        dw := make([]float64, nFeatures)
        var db float64
        for i := 0; i < nSamples; i++ {
            residual := yPred[i] - y[i]
            db += residual
            for j := 0; j < nFeatures; j++ {
                dw[j] += residual * X[i][j]
            }
        }

        // Average over the batch, then take one gradient step.
        for j := 0; j < nFeatures; j++ {
            dw[j] /= float64(nSamples)
        }
        db /= float64(nSamples)

        lr.bias -= lr.learningRate * db
        for j := 0; j < nFeatures; j++ {
            lr.weights[j] -= lr.learningRate * dw[j]
        }
    }
}

// PredictProba returns P(y=1|x) for each row of X.
func (lr *LogisticRegression) PredictProba(X [][]float64) []float64 {
    probs := make([]float64, len(X))
    for i, row := range X {
        sum := lr.bias
        for j, val := range row {
            sum += val * lr.weights[j]
        }
        probs[i] = lr.sigmoid(sum)
    }
    return probs
}

// Predict thresholds PredictProba to produce hard 0/1 labels.
func (lr *LogisticRegression) Predict(X [][]float64, threshold float64) []int {
    probs := lr.PredictProba(X)
    predictions := make([]int, len(probs))
    for i, p := range probs {
        if p >= threshold {
            predictions[i] = 1
        }
    }
    return predictions
}

func main() {
    X := [][]float64{{1, 2}, {2, 3}, {3, 4}, {4, 5}}
    y := []float64{0, 0, 1, 1}

    model := NewLogisticRegression(0.1, 10000)
    model.Fit(X, y)

    probs := model.PredictProba([][]float64{{2.5, 3.5}})
    preds := model.Predict([][]float64{{2.5, 3.5}}, 0.5)

    fmt.Printf("预测概率:%v\n", probs)
    fmt.Printf("预测类别:%v\n", preds)
}
🎯 算法选择指南
  • 数据量小、特征少:逻辑回归、SVM
  • 数据量大、特征多:随机森林、梯度提升树
  • 需要可解释性:决策树、线性模型
  • 文本分类:朴素贝叶斯、逻辑回归
  • 特征间关系复杂:集成方法、神经网络
  • 快速原型开发:逻辑回归(基线模型)

🔷 3. 支持向量机 (SVM)

算法原理

SVM 通过寻找最大间隔超平面来进行分类,对于线性不可分数据,使用核技巧映射到高维空间。

优化目标: min (1/2)||w||²

约束条件: yᵢ(wxᵢ + b) ≥ 1

核函数: K(xᵢ, xⱼ) = φ(xᵢ)·φ(xⱼ)
训练时间复杂度
O(n² × m)(n 为样本数,m 为特征数)
预测时间复杂度
O(k × m)(k 为支持向量数)
空间复杂度
O(k)

✅ 优点

  • 最大间隔,泛化能力强
  • 核技巧处理非线性
  • 高维空间表现好
  • 避免局部最优

❌ 缺点

  • 大规模数据训练慢
  • 核函数选择敏感
  • 多分类需要扩展
  • 缺失值敏感

代码实现

// Simplified linear SVM (hinge loss + L2 regularization, per-sample
// subgradient descent).
class SVM {
    constructor(learningRate = 0.001, lambdaParam = 0.01, nIterations = 1000) {
        this.learningRate = learningRate;
        this.lambdaParam = lambdaParam;   // L2 regularization strength
        this.nIterations = nIterations;
        this.weights = null;
        this.bias = null;
    }

    fit(X, y) {
        const nSamples = X.length;
        const nFeatures = X[0].length;

        this.weights = new Array(nFeatures).fill(0);
        this.bias = 0;

        // The hinge loss expects labels in {-1, +1}.
        const yTransformed = y.map(label => label === 0 ? -1 : 1);

        for (let iter = 0; iter < this.nIterations; iter++) {
            for (let i = 0; i < nSamples; i++) {
                // Margin condition: y * f(x) >= 1 means the sample sits
                // outside the margin and contributes no hinge loss.
                const condition = yTransformed[i] * (this._dotProduct(X[i], this.weights) + this.bias) >= 1;

                if (condition) {
                    // Only the L2 regularizer acts on the weights.
                    // BUG FIX: the bias is intentionally NOT regularized —
                    // the standard soft-margin SVM objective penalizes ||w||
                    // only, and the Python/Go implementations in this
                    // document leave the bias unregularized as well.
                    for (let j = 0; j < nFeatures; j++) {
                        this.weights[j] -= this.learningRate * (2 * this.lambdaParam * this.weights[j]);
                    }
                } else {
                    // Inside the margin: hinge subgradient + regularizer.
                    for (let j = 0; j < nFeatures; j++) {
                        this.weights[j] -= this.learningRate * (2 * this.lambdaParam * this.weights[j] - yTransformed[i] * X[i][j]);
                    }
                    this.bias -= this.learningRate * (-yTransformed[i]);
                }
            }
        }
    }

    // Plain dot product of two equal-length arrays.
    _dotProduct(arr1, arr2) {
        return arr1.reduce((sum, val, i) => sum + val * arr2[i], 0);
    }

    // Sign of the decision function, mapped back to {0, 1} labels.
    predict(X) {
        return X.map(row => {
            const linearModel = this._dotProduct(row, this.weights) + this.bias;
            return linearModel >= 0 ? 1 : 0;
        });
    }
}

// Usage example
const X = [[1, 2], [2, 3], [3, 3], [4, 5], [5, 5]];
const y = [0, 0, 0, 1, 1];

const model = new SVM(0.001, 0.01, 1000);
model.fit(X, y);

console.log('预测类别:', model.predict([[3, 4], [4, 4]]));
import numpy as np

class SVM:
    """Linear soft-margin SVM trained with per-sample (sub)gradient descent
    on the hinge loss plus an L2 penalty on the weights."""

    def __init__(self, learning_rate=0.001, lambda_param=0.01, n_iterations=1000):
        self.learning_rate = learning_rate
        self.lambda_param = lambda_param  # L2 regularization strength
        self.n_iterations = n_iterations
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        """Fit on X (n_samples, n_features); labels may be {0, 1} or {-1, 1}."""
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0

        # The hinge loss expects labels in {-1, +1}.
        y_signed = np.where(y <= 0, -1, 1)

        for _ in range(self.n_iterations):
            for idx, x_i in enumerate(X):
                margin_ok = y_signed[idx] * (np.dot(x_i, self.weights) - self.bias) >= 1
                if margin_ok:
                    # Outside the margin: only the regularizer acts.
                    self.weights -= self.learning_rate * (2 * self.lambda_param * self.weights)
                else:
                    # Inside the margin: hinge subgradient + regularizer.
                    self.weights -= self.learning_rate * (
                        2 * self.lambda_param * self.weights - y_signed[idx] * x_i
                    )
                    self.bias -= self.learning_rate * y_signed[idx]

    def predict(self, X):
        """Return hard {0, 1} labels from the sign of w·x - b."""
        decision = np.dot(X, self.weights) - self.bias
        return np.where(decision >= 0, 1, 0)

# Usage example
if __name__ == "__main__":
    from sklearn.datasets import make_classification
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import train_test_split

    # Two informative features, no redundant ones.
    X, y = make_classification(n_samples=100, n_features=2, n_informative=2,
                               n_redundant=0, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    model = SVM(learning_rate=0.001, lambda_param=0.01, n_iterations=1000)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
package main

import "fmt"

// SVM is a linear soft-margin support-vector machine trained with
// per-sample subgradient descent on the hinge loss + L2 penalty.
type SVM struct {
    learningRate float64
    lambdaParam  float64
    nIterations  int
    weights      []float64
    bias         float64
}

// NewSVM returns an untrained model.
func NewSVM(lr, lambda float64, iters int) *SVM {
    return &SVM{
        learningRate: lr,
        lambdaParam:  lambda,
        nIterations:  iters,
    }
}

// dotProduct computes the inner product of two equal-length vectors.
func (svm *SVM) dotProduct(a, b []float64) float64 {
    total := 0.0
    for i := range a {
        total += a[i] * b[i]
    }
    return total
}

// Fit trains on samples X and {0, 1} labels y.
func (svm *SVM) Fit(X [][]float64, y []int) {
    nFeatures := len(X[0])
    svm.weights = make([]float64, nFeatures)
    svm.bias = 0

    // The hinge loss expects labels in {-1, +1}.
    signed := make([]int, len(y))
    for i, label := range y {
        if label <= 0 {
            signed[i] = -1
        } else {
            signed[i] = 1
        }
    }

    for iter := 0; iter < svm.nIterations; iter++ {
        for idx, sample := range X {
            margin := float64(signed[idx]) * (svm.dotProduct(sample, svm.weights) - svm.bias)
            if margin >= 1 {
                // Outside the margin: only the regularizer shrinks the weights.
                for j := range svm.weights {
                    svm.weights[j] -= svm.learningRate * (2 * svm.lambdaParam * svm.weights[j])
                }
                continue
            }
            // Inside the margin: hinge subgradient + regularizer.
            for j := range svm.weights {
                svm.weights[j] -= svm.learningRate * (2*svm.lambdaParam*svm.weights[j] - float64(signed[idx])*sample[j])
            }
            svm.bias -= svm.learningRate * float64(signed[idx])
        }
    }
}

// Predict maps the sign of the decision function w·x - b to {0, 1} labels.
func (svm *SVM) Predict(X [][]float64) []int {
    out := make([]int, len(X))
    for i, row := range X {
        if svm.dotProduct(row, svm.weights)-svm.bias >= 0 {
            out[i] = 1
        }
    }
    return out
}

func main() {
    X := [][]float64{{1, 2}, {2, 3}, {3, 3}, {4, 5}, {5, 5}}
    y := []int{0, 0, 0, 1, 1}

    model := NewSVM(0.001, 0.01, 1000)
    model.Fit(X, y)

    preds := model.Predict([][]float64{{3, 4}, {4, 4}})
    fmt.Printf("预测类别:%v\n", preds)
}

🌳 4. 决策树 (Decision Tree)

算法原理

决策树通过递归地选择最优特征进行数据划分,构建树形结构。常用划分准则包括信息增益、信息增益比和基尼指数。

信息熵: H(D) = -Σ pᵢ log₂ pᵢ

信息增益: IG(D, A) = H(D) - H(D|A)

基尼指数: Gini(D) = 1 - Σ pᵢ²
训练时间复杂度
O(n × m × d)
预测时间复杂度
O(d)
空间复杂度
O(nodes)

✅ 优点

  • 可解释性强,可视化直观
  • 不需要特征缩放
  • 能处理数值和类别特征
  • 自动特征选择

❌ 缺点

  • 容易过拟合
  • 对数据变化敏感
  • 可能不连续
  • 偏向取值多的特征

代码实现

// One node of the tree: internal nodes carry (feature, threshold, left,
// right); leaf nodes carry the predicted label in `value`.
class TreeNode {
    constructor(feature = null, threshold = null, left = null, right = null, value = null) {
        this.feature = feature;
        this.threshold = threshold;
        this.left = left;
        this.right = right;
        this.value = value;
    }
}

// CART-style classification tree split on Gini impurity.
class DecisionTree {
    constructor(maxDepth = 10, minSamplesSplit = 2) {
        this.maxDepth = maxDepth;
        this.minSamplesSplit = minSamplesSplit;
        this.root = null;
    }

    fit(X, y) {
        this.nClasses = new Set(y).size;
        this.nFeatures = X[0].length;
        this.root = this._growTree(X, y);
    }

    _growTree(X, y, depth = 0) {
        const nSamples = X.length;
        const nLabels = new Set(y).size;

        // Stop: depth cap reached, node is pure, or too few samples → leaf.
        if (depth >= this.maxDepth || nLabels === 1 || nSamples < this.minSamplesSplit) {
            // BUG FIX: the original `new TreeNode(value = leafValue)` leaked a
            // global and passed the label as the `feature` argument, so
            // `value` stayed null and _traverseTree never saw a leaf.
            return new TreeNode(null, null, null, null, this._mostCommonLabel(y));
        }

        // Find the best (feature, threshold) pair.
        const { featureIdx, threshold } = this._bestSplit(X, y);

        // Partition the samples by the chosen split.
        const leftMask = X.map(row => row[featureIdx] <= threshold);

        const XLeft = X.filter((_, i) => leftMask[i]);
        const yLeft = y.filter((_, i) => leftMask[i]);
        const XRight = X.filter((_, i) => !leftMask[i]);
        const yRight = y.filter((_, i) => !leftMask[i]);

        // Recurse on both sides.
        const left = this._growTree(XLeft, yLeft, depth + 1);
        const right = this._growTree(XRight, yRight, depth + 1);

        return new TreeNode(featureIdx, threshold, left, right);
    }

    _bestSplit(X, y) {
        let bestGain = -Infinity;
        let bestFeature = 0;
        let bestThreshold = 0;

        for (let featureIdx = 0; featureIdx < this.nFeatures; featureIdx++) {
            const thresholds = [...new Set(X.map(row => row[featureIdx]))];
            for (const threshold of thresholds) {
                const gain = this._informationGain(y, X, featureIdx, threshold);
                if (gain > bestGain) {
                    bestGain = gain;
                    bestFeature = featureIdx;
                    bestThreshold = threshold;
                }
            }
        }

        return { featureIdx: bestFeature, threshold: bestThreshold };
    }

    _informationGain(parentY, X, featureIdx, threshold) {
        const leftMask = X.map(row => row[featureIdx] <= threshold);
        const n = parentY.length;
        const nLeft = leftMask.filter(v => v).length;

        // A split that leaves one side empty separates nothing.
        if (nLeft === 0 || nLeft === n) return 0;

        const yLeft = parentY.filter((_, i) => leftMask[i]);
        const yRight = parentY.filter((_, i) => !leftMask[i]);

        // BUG FIX: gain = parent impurity - weighted child impurity. The
        // original used the sample count in place of the parent's Gini index
        // and did not weight the child terms by their proportions.
        return this._giniIndex(parentY)
            - (nLeft / n) * this._giniIndex(yLeft)
            - ((n - nLeft) / n) * this._giniIndex(yRight);
    }

    // Gini impurity 1 - Σ p_k² of the labels in y.
    _giniIndex(y) {
        const counts = new Map();
        y.forEach(label => counts.set(label, (counts.get(label) || 0) + 1));
        let impurity = 1;
        for (const count of counts.values()) {
            impurity -= (count / y.length) ** 2;
        }
        return impurity;
    }

    // Majority label of y, returned with its original type.
    _mostCommonLabel(y) {
        // BUG FIX: counting with a plain object coerced labels to strings,
        // so predict() used to return "0"/"1" instead of the input labels.
        const counts = new Map();
        y.forEach(label => counts.set(label, (counts.get(label) || 0) + 1));
        let best = y[0];
        let bestCount = 0;
        for (const [label, count] of counts.entries()) {
            if (count > bestCount) {
                bestCount = count;
                best = label;
            }
        }
        return best;
    }

    predict(X) {
        return X.map(row => this._traverseTree(row, this.root));
    }

    _traverseTree(x, node) {
        if (node.value !== null) return node.value;

        if (x[node.feature] <= node.threshold) {
            return this._traverseTree(x, node.left);
        }
        return this._traverseTree(x, node.right);
    }
}

// Usage example
const X = [[1, 2], [2, 3], [3, 4], [5, 6], [6, 7], [7, 8]];
const y = [0, 0, 0, 1, 1, 1];

// BUG FIX: `new DecisionTree(maxDepth = 3)` assigned to a global variable;
// JavaScript has no keyword arguments — pass the value positionally.
const tree = new DecisionTree(3);
tree.fit(X, y);

console.log('预测类别:', tree.predict([[4, 5], [6, 6]]));
class TreeNode:
    """A single tree node: internal nodes carry (feature, threshold, left,
    right); leaf nodes carry a class label in `value`."""

    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        self.feature = feature      # index of the splitting feature
        self.threshold = threshold  # go left when x[feature] <= threshold
        self.left = left
        self.right = right
        self.value = value          # class label when this node is a leaf


class DecisionTree:
    """CART-style classification tree using Gini impurity for splits."""

    def __init__(self, max_depth=10, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.root = None

    def fit(self, X, y):
        """Build the tree from X (n_samples, n_features) and labels y."""
        self.n_classes = len(set(y))
        self.n_features = X.shape[1]
        self.root = self._grow_tree(X, y)

    def _grow_tree(self, X, y, depth=0):
        # Leaf conditions: depth cap, pure node, or too few samples.
        if (depth >= self.max_depth
                or len(set(y)) == 1
                or len(X) < self.min_samples_split):
            return TreeNode(value=self._most_common_label(y))

        feature_idx, threshold = self._best_split(X, y)

        # Partition by the chosen split and recurse on each side.
        go_left = X[:, feature_idx] <= threshold
        left = self._grow_tree(X[go_left], y[go_left], depth + 1)
        right = self._grow_tree(X[~go_left], y[~go_left], depth + 1)

        return TreeNode(feature_idx, threshold, left, right)

    def _best_split(self, X, y):
        # Scan every (feature, threshold) pair, keeping the first maximum.
        best = (-float('inf'), 0, 0)  # (gain, feature, threshold)
        for feature_idx in range(self.n_features):
            # NOTE: iterate the raw set (not sorted) so tie-breaking order
            # matches the original enumeration.
            for threshold in set(X[:, feature_idx]):
                gain = self._information_gain(y, X, feature_idx, threshold)
                if gain > best[0]:
                    best = (gain, feature_idx, threshold)
        return best[1], best[2]

    def _information_gain(self, y, X, feature_idx, threshold):
        go_left = X[:, feature_idx] <= threshold
        y_left = y[go_left]
        y_right = y[~go_left]

        n = len(y)
        n_left = len(y_left)
        if n_left == 0 or n_left == n:
            return 0  # split does not separate anything

        # Gini reduction: parent impurity minus weighted child impurity.
        weighted = (n_left / n * self._gini_index(y_left)
                    + (n - n_left) / n * self._gini_index(y_right))
        return self._gini_index(y) - weighted

    def _gini_index(self, y):
        """Gini impurity 1 - sum(p_k^2) of the label distribution in y."""
        counts = {}
        for label in y:
            counts[label] = counts.get(label, 0) + 1
        return 1 - sum((count / len(y)) ** 2 for count in counts.values())

    def _most_common_label(self, y):
        """Majority label of y (ties broken by first-seen order)."""
        counts = {}
        for label in y:
            counts[label] = counts.get(label, 0) + 1
        return max(counts, key=counts.get)

    def predict(self, X):
        """Return a list with the predicted label for every row of X."""
        return [self._traverse_tree(x, self.root) for x in X]

    def _traverse_tree(self, x, node):
        # Walk down until a leaf (value is not None), choosing sides by threshold.
        while node.value is None:
            node = node.left if x[node.feature] <= node.threshold else node.right
        return node.value

# Usage example
if __name__ == "__main__":
    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import train_test_split

    # Four informative features, binary target.
    X, y = make_classification(n_samples=100, n_features=4, n_informative=4, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    tree = DecisionTree(max_depth=5)
    tree.fit(X_train, y_train)
    y_pred = tree.predict(X_test)

    print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
package main

import (
    "fmt"
    "sort"
)

// TreeNode is a single node of the tree. Internal nodes use feature,
// threshold and the two children; leaves store the class label in value.
type TreeNode struct {
    feature   int
    threshold float64
    left      *TreeNode
    right     *TreeNode
    value     *int // non-nil marks a leaf
}

// DecisionTree is a CART-style classification tree split on the Gini index.
type DecisionTree struct {
    maxDepth        int
    minSamplesSplit int
    root            *TreeNode
    nFeatures       int
}

// NewDecisionTree returns an unfitted tree with the given stopping limits.
func NewDecisionTree(maxDepth, minSamplesSplit int) *DecisionTree {
    return &DecisionTree{
        maxDepth:        maxDepth,
        minSamplesSplit: minSamplesSplit,
    }
}

// Fit builds the tree from samples X and integer class labels y.
func (dt *DecisionTree) Fit(X [][]float64, y []int) {
    dt.nFeatures = len(X[0])
    dt.root = dt.growTree(X, y, 0)
}

// growTree recursively splits (X, y) until a stopping condition is met.
func (dt *DecisionTree) growTree(X [][]float64, y []int, depth int) *TreeNode {
    nSamples := len(X)
    nLabels := len(dt.uniqueLabels(y))

    // Stop: depth cap, pure node, or too few samples → emit a leaf.
    if depth >= dt.maxDepth || nLabels == 1 || nSamples < dt.minSamplesSplit {
        value := dt.mostCommonLabel(y)
        return &TreeNode{value: &value}
    }

    // Find the best (feature, threshold) pair.
    featureIdx, threshold := dt.bestSplit(X, y)

    // Partition the samples by the chosen split.
    var XLeft, XRight [][]float64
    var yLeft, yRight []int

    for i, row := range X {
        if row[featureIdx] <= threshold {
            XLeft = append(XLeft, row)
            yLeft = append(yLeft, y[i])
        } else {
            XRight = append(XRight, row)
            yRight = append(yRight, y[i])
        }
    }

    // Recurse on both partitions.
    left := dt.growTree(XLeft, yLeft, depth+1)
    right := dt.growTree(XRight, yRight, depth+1)

    return &TreeNode{feature: featureIdx, threshold: threshold, left: left, right: right}
}

// bestSplit scans every (feature, threshold) candidate and returns the pair
// with the highest impurity reduction. BUG FIX: the original declared
// `parentGini := dt.giniIndex(y)` here without using it, which is a compile
// error in Go ("declared and not used").
func (dt *DecisionTree) bestSplit(X [][]float64, y []int) (int, float64) {
    bestGain := -1.0
    bestFeature := 0
    bestThreshold := 0.0

    for featureIdx := 0; featureIdx < dt.nFeatures; featureIdx++ {
        thresholds := dt.uniqueThresholds(X, featureIdx)
        for _, threshold := range thresholds {
            gain := dt.informationGain(y, X, featureIdx, threshold)
            if gain > bestGain {
                bestGain = gain
                bestFeature = featureIdx
                bestThreshold = threshold
            }
        }
    }

    return bestFeature, bestThreshold
}

// informationGain returns the (sample-count scaled) Gini reduction obtained
// by splitting y at X[:, featureIdx] <= threshold. The scale is constant per
// node, so argmax over thresholds is unaffected.
func (dt *DecisionTree) informationGain(y []int, X [][]float64, featureIdx int, threshold float64) float64 {
    var yLeft, yRight []int
    for i, row := range X {
        if row[featureIdx] <= threshold {
            yLeft = append(yLeft, y[i])
        } else {
            yRight = append(yRight, y[i])
        }
    }

    n := len(y)
    nLeft := len(yLeft)

    // A split that leaves one side empty separates nothing.
    if nLeft == 0 || nLeft == n {
        return 0
    }

    return float64(n)*dt.giniIndex(y) - float64(nLeft)*dt.giniIndex(yLeft) - float64(n-nLeft)*dt.giniIndex(yRight)
}

// giniIndex returns the Gini impurity 1 - Σ p_k² of the labels in y.
func (dt *DecisionTree) giniIndex(y []int) float64 {
    counts := make(map[int]int)
    for _, label := range y {
        counts[label]++
    }

    gini := 1.0
    for _, count := range counts {
        p := float64(count) / float64(len(y))
        gini -= p * p
    }
    return gini
}

// uniqueLabels returns the distinct labels in y (order unspecified; only the
// count is used by callers).
func (dt *DecisionTree) uniqueLabels(y []int) []int {
    unique := make(map[int]bool)
    for _, label := range y {
        unique[label] = true
    }
    result := make([]int, 0, len(unique))
    for label := range unique {
        result = append(result, label)
    }
    return result
}

// uniqueThresholds returns the distinct values of one feature column, sorted
// ascending. BUG FIX: the original ranged over the map directly, visiting
// candidate thresholds in Go's randomized map order, so tie-breaking between
// equal-gain splits — and therefore the fitted tree — varied between runs.
func (dt *DecisionTree) uniqueThresholds(X [][]float64, featureIdx int) []float64 {
    unique := make(map[float64]bool)
    for _, row := range X {
        unique[row[featureIdx]] = true
    }
    result := make([]float64, 0, len(unique))
    for threshold := range unique {
        result = append(result, threshold)
    }
    sort.Float64s(result)
    return result
}

// mostCommonLabel returns the majority label of y.
// NOTE(review): ties are still resolved by map iteration order; in practice
// leaves are usually pure so this rarely matters — confirm if determinism of
// tie labels is required.
func (dt *DecisionTree) mostCommonLabel(y []int) int {
    counts := make(map[int]int)
    for _, label := range y {
        counts[label]++
    }

    maxCount := 0
    maxLabel := y[0]
    for label, count := range counts {
        if count > maxCount {
            maxCount = count
            maxLabel = label
        }
    }
    return maxLabel
}

// Predict returns the predicted class label for every row of X.
func (dt *DecisionTree) Predict(X [][]float64) []int {
    predictions := make([]int, len(X))
    for i, row := range X {
        predictions[i] = dt.traverseTree(row, dt.root)
    }
    return predictions
}

// traverseTree walks down to a leaf, choosing sides by each node's threshold.
func (dt *DecisionTree) traverseTree(x []float64, node *TreeNode) int {
    if node.value != nil {
        return *node.value
    }

    if x[node.feature] <= node.threshold {
        return dt.traverseTree(x, node.left)
    }
    return dt.traverseTree(x, node.right)
}

func main() {
    X := [][]float64{{1, 2}, {2, 3}, {3, 4}, {5, 6}, {6, 7}, {7, 8}}
    y := []int{0, 0, 0, 1, 1, 1}

    tree := NewDecisionTree(3, 2)
    tree.Fit(X, y)

    preds := tree.Predict([][]float64{{4, 5}, {6, 6}})
    fmt.Printf("预测类别:%v\n", preds)
}

🌲 5. 随机森林 (Random Forest)

算法原理

随机森林是集成学习算法,通过构建多棵决策树并综合它们的预测结果来提高准确性和稳定性。采用 Bagging 和随机特征选择。

Bagging: 有放回抽样构建训练集

随机特征: 每棵树随机选择 √m 个特征

投票机制: ŷ = mode{T₁(x), T₂(x), ..., Tₙ(x)}
训练时间复杂度
O(k × n × m × d)
预测时间复杂度
O(k × d)
空间复杂度
O(k × nodes)

✅ 优点

  • 抗过拟合能力强
  • 处理高维数据
  • 可评估特征重要性
  • 性能稳定准确

❌ 缺点

  • 训练时间较长
  • 可解释性较差
  • 内存占用较大
  • 小数据表现一般

代码实现

class RandomForest {
    /**
     * Random-forest classifier: an ensemble of decision trees, each grown
     * on a bootstrap resample and restricted to a random feature subset at
     * every split. Prediction is by majority vote.
     *
     * @param {number} nTrees       number of trees in the ensemble
     * @param {number} maxDepth     depth cap for every tree
     * @param {?number} maxFeatures features considered per split
     *                              (default: floor(sqrt(total features)))
     */
    constructor(nTrees = 100, maxDepth = 10, maxFeatures = null) {
        this.nTrees = nTrees;
        this.maxDepth = maxDepth;
        this.maxFeatures = maxFeatures;
        this.trees = [];
    }

    /** Grow nTrees trees on bootstrap samples of (X, y). */
    fit(X, y) {
        const nFeatures = X[0].length;
        this.maxFeatures = this.maxFeatures || Math.floor(Math.sqrt(nFeatures));

        this.trees = [];
        for (let i = 0; i < this.nTrees; i++) {
            const { XSample, ySample } = this._bootstrapSample(X, y);
            this.trees.push(this._growTree(XSample, ySample, 0));
        }
    }

    // Sample n rows with replacement.
    _bootstrapSample(X, y) {
        const nSamples = X.length;
        const XSample = [];
        const ySample = [];
        for (let i = 0; i < nSamples; i++) {
            const idx = Math.floor(Math.random() * nSamples);
            XSample.push(X[idx]);
            ySample.push(y[idx]);
        }
        return { XSample, ySample };
    }

    // Recursively grow a tree; leaves are { value } nodes.
    _growTree(X, y, depth) {
        const nLabels = new Set(y).size;
        if (depth >= this.maxDepth || nLabels === 1 || X.length < 2) {
            return { value: this._mostCommonLabel(y) };
        }

        const { featureIdx, threshold } = this._bestSplit(X, y);

        const leftMask = X.map(row => row[featureIdx] <= threshold);
        return {
            feature: featureIdx,
            threshold,
            left: this._growTree(
                X.filter((_, i) => leftMask[i]),
                y.filter((_, i) => leftMask[i]),
                depth + 1
            ),
            right: this._growTree(
                X.filter((_, i) => !leftMask[i]),
                y.filter((_, i) => !leftMask[i]),
                depth + 1
            )
        };
    }

    // Scan a random subset of maxFeatures features for the best split.
    // BUGFIX: the total feature count used to be passed down here, so every
    // split considered ALL features and the forest lost its per-split
    // feature randomness; we now honor this.maxFeatures.
    _bestSplit(X, y) {
        let bestGain = -Infinity;
        let bestFeature = 0;
        let bestThreshold = 0;

        const total = X[0].length;
        // Cap at the available feature count to avoid an infinite loop when
        // a caller passes maxFeatures > total.
        const target = Math.min(this.maxFeatures, total);
        const featureIndices = [];
        while (featureIndices.length < target) {
            const idx = Math.floor(Math.random() * total);
            if (!featureIndices.includes(idx)) featureIndices.push(idx);
        }

        for (const featureIdx of featureIndices) {
            const thresholds = [...new Set(X.map(row => row[featureIdx]))];
            for (const threshold of thresholds) {
                const gain = this._informationGain(y, X, featureIdx, threshold);
                if (gain > bestGain) {
                    bestGain = gain;
                    bestFeature = featureIdx;
                    bestThreshold = threshold;
                }
            }
        }

        return { featureIdx: bestFeature, threshold: bestThreshold };
    }

    // Impurity decrease: gini(parent) - size-weighted gini of the children.
    // BUGFIX: the parent term used to be the raw count n rather than the
    // parent impurity; the split ranking was unaffected (the parent term is
    // constant per node) but the value was not a real information gain.
    _informationGain(parentY, X, featureIdx, threshold) {
        const leftMask = X.map(row => row[featureIdx] <= threshold);
        const n = parentY.length;
        const nLeft = leftMask.filter(v => v).length;

        if (nLeft === 0 || nLeft === n) return 0; // degenerate split

        const yLeft = parentY.filter((_, i) => leftMask[i]);
        const yRight = parentY.filter((_, i) => !leftMask[i]);

        const weighted =
            (nLeft / n) * this._giniIndex(yLeft) +
            ((n - nLeft) / n) * this._giniIndex(yRight);
        return this._giniIndex(parentY) - weighted;
    }

    // Gini impurity: 1 - sum_k p_k^2.
    _giniIndex(y) {
        const counts = new Map();
        for (const label of y) counts.set(label, (counts.get(label) || 0) + 1);
        let sum = 0;
        for (const count of counts.values()) sum += (count / y.length) ** 2;
        return 1 - sum;
    }

    // Majority label of y.
    // BUGFIX: Object.entries coerced labels to strings, so predictions came
    // back as "0"/"1"; a Map preserves the original label values and types.
    _mostCommonLabel(y) {
        const counts = new Map();
        for (const label of y) counts.set(label, (counts.get(label) || 0) + 1);
        let best = y[0];
        let bestCount = -1;
        for (const [label, count] of counts) {
            if (count > bestCount) {
                bestCount = count;
                best = label;
            }
        }
        return best;
    }

    /** Majority vote over the per-tree predictions for each sample. */
    predict(X) {
        return X.map(row => {
            const treePreds = this.trees.map(tree => this._predictTree(row, tree));
            return this._mostCommonLabel(treePreds);
        });
    }

    // Iteratively descend to a leaf.
    _predictTree(x, node) {
        while (node.value === undefined) {
            node = x[node.feature] <= node.threshold ? node.left : node.right;
        }
        return node.value;
    }
}

// Usage example
const X = [[1, 2], [2, 3], [3, 4], [5, 6], [6, 7], [7, 8]];
const y = [0, 0, 0, 1, 1, 1];

// BUGFIX: `new RandomForest(nTrees = 10)` assigned an accidental global
// variable; constructor arguments are positional in JavaScript.
const rf = new RandomForest(10);
rf.fit(X, y);

console.log('预测类别:', rf.predict([[4, 5], [6, 6]]));
import numpy as np
from collections import Counter

class RandomForest:
    """Bagging ensemble of randomized decision trees (majority vote).

    Each tree is grown on a bootstrap resample of the data, and every
    split considers only a random subset of ``max_features`` features.
    """

    def __init__(self, n_trees=100, max_depth=10, max_features=None):
        self.n_trees = n_trees            # number of trees in the ensemble
        self.max_depth = max_depth        # depth cap for every tree
        self.max_features = max_features  # features per split (default: sqrt(m))
        self.trees = []

    def fit(self, X, y):
        """Grow ``n_trees`` trees, each on a bootstrap sample of (X, y)."""
        self.n_samples, self.n_features = X.shape
        self.max_features = self.max_features or int(np.sqrt(self.n_features))

        self.trees = []
        for _ in range(self.n_trees):
            X_boot, y_boot = self._bootstrap_sample(X, y)
            self.trees.append(self._grow_tree(X_boot, y_boot, 0))

    def _bootstrap_sample(self, X, y):
        # Draw len(X) row indices with replacement.
        idx = np.random.choice(len(X), len(X), replace=True)
        return X[idx], y[idx]

    def _grow_tree(self, X, y, depth):
        # Emit a leaf when the depth cap is hit, the node is pure, or it is
        # too small to split.
        if depth >= self.max_depth or len(set(y)) == 1 or len(X) < 2:
            return {'value': self._most_common_label(y)}

        feature_idx, threshold = self._best_split(X, y)

        goes_left = X[:, feature_idx] <= threshold
        return {
            'feature': feature_idx,
            'threshold': threshold,
            'left': self._grow_tree(X[goes_left], y[goes_left], depth + 1),
            'right': self._grow_tree(X[~goes_left], y[~goes_left], depth + 1),
        }

    def _best_split(self, X, y):
        """Scan a random feature subset for the split with maximal gain."""
        best = (-float('inf'), 0, 0)  # (gain, feature, threshold)

        candidates = np.random.choice(self.n_features, self.max_features, replace=False)
        for feature_idx in candidates:
            for threshold in set(X[:, feature_idx]):
                gain = self._information_gain(y, X, feature_idx, threshold)
                if gain > best[0]:
                    best = (gain, feature_idx, threshold)

        return best[1], best[2]

    def _information_gain(self, y, X, feature_idx, threshold):
        # Impurity decrease of splitting y at (feature_idx, threshold).
        goes_left = X[:, feature_idx] <= threshold
        y_left, y_right = y[goes_left], y[~goes_left]

        n, n_left = len(y), len(y_left)
        if n_left in (0, n):  # degenerate split: everything on one side
            return 0

        return self._gini_index(y) - (n_left / n * self._gini_index(y_left) + (n - n_left) / n * self._gini_index(y_right))

    def _gini_index(self, y):
        # Gini impurity: 1 - sum_k p_k^2.
        counts = Counter(y)
        return 1 - sum((c / len(y)) ** 2 for c in counts.values())

    def _most_common_label(self, y):
        (label, _), = Counter(y).most_common(1)
        return label

    def predict(self, X):
        """Majority vote over the per-tree predictions for each sample."""
        votes = np.array([[self._predict_tree(x, tree) for tree in self.trees] for x in X])
        return [Counter(row).most_common(1)[0][0] for row in votes]

    def _predict_tree(self, x, node):
        # Iterative descent to a leaf.
        while 'value' not in node:
            node = node['left'] if x[node['feature']] <= node['threshold'] else node['right']
        return node['value']

# Usage example
if __name__ == "__main__":
    from sklearn.datasets import make_classification
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import train_test_split

    X, y = make_classification(n_samples=100, n_features=4, n_informative=4, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    forest = RandomForest(n_trees=100, max_depth=5)
    forest.fit(X_train, y_train)
    predictions = forest.predict(X_test)

    print(f"Accuracy: {accuracy_score(y_test, predictions):.2f}")
package main

import (
    "fmt"
    "math"
    "math/rand"
    "time"
)

// TreeNode is a node of a decision tree. Internal nodes route samples by
// comparing x[feature] against threshold; leaves carry a class label in value.
type TreeNode struct {
    feature   int       // index of the feature tested at this node
    threshold float64   // go left when x[feature] <= threshold
    left      *TreeNode // subtree for samples with x[feature] <= threshold
    right     *TreeNode // subtree for samples with x[feature] > threshold
    value     *int      // non-nil only at leaves: the predicted class label
}

// RandomForest is a bagged ensemble of decision trees with random feature
// selection at each split; predictions are made by majority vote.
type RandomForest struct {
    nTrees        int // number of trees grown by Fit
    maxDepth      int // depth cap for each tree
    maxFeatures   int // features considered per split (set by Fit)
    nFeatures     int // total feature count, recorded by Fit
    trees         []*TreeNode // the fitted trees
}

// NewRandomForest returns an untrained forest of nTrees trees capped at
// maxDepth. It seeds the global PRNG so bootstrap samples vary between runs.
func NewRandomForest(nTrees, maxDepth int) *RandomForest {
    rand.Seed(time.Now().UnixNano())
    forest := &RandomForest{}
    forest.nTrees = nTrees
    forest.maxDepth = maxDepth
    return forest
}

// Fit trains the forest: each tree is grown on a bootstrap resample of the
// data, and every split considers only sqrt(nFeatures) randomly chosen
// features (the standard heuristic for classification forests).
func (rf *RandomForest) Fit(X [][]float64, y []int) {
    // Guard against empty input, which would otherwise panic on X[0].
    if len(X) == 0 || len(y) == 0 {
        return
    }
    rf.nFeatures = len(X[0])
    rf.maxFeatures = int(math.Sqrt(float64(rf.nFeatures)))
    if rf.maxFeatures < 1 {
        rf.maxFeatures = 1 // always consider at least one feature
    }
    rf.trees = make([]*TreeNode, 0, rf.nTrees)

    for i := 0; i < rf.nTrees; i++ {
        XSample, ySample := rf.bootstrapSample(X, y)
        rf.trees = append(rf.trees, rf.growTree(XSample, ySample, 0))
    }
}

// bootstrapSample draws len(X) samples with replacement. Rows are shared
// with the input slice, not copied.
func (rf *RandomForest) bootstrapSample(X [][]float64, y []int) ([][]float64, []int) {
    n := len(X)
    rows := make([][]float64, n)
    labels := make([]int, n)
    for i := range rows {
        j := rand.Intn(n)
        rows[i], labels[i] = X[j], y[j]
    }
    return rows, labels
}

// growTree recursively builds one decision tree over (X, y), emitting a
// leaf when the depth cap is reached, the node is pure, or the node is too
// small to split.
func (rf *RandomForest) growTree(X [][]float64, y []int, depth int) *TreeNode {
    if depth >= rf.maxDepth || len(rf.uniqueLabels(y)) == 1 || len(X) < 2 {
        leaf := rf.mostCommonLabel(y)
        return &TreeNode{value: &leaf}
    }

    featureIdx, threshold := rf.bestSplit(X, y)

    // Partition the samples on the chosen split.
    var leftX, rightX [][]float64
    var leftY, rightY []int
    for i, sample := range X {
        if sample[featureIdx] <= threshold {
            leftX = append(leftX, sample)
            leftY = append(leftY, y[i])
        } else {
            rightX = append(rightX, sample)
            rightY = append(rightY, y[i])
        }
    }

    return &TreeNode{
        feature:   featureIdx,
        threshold: threshold,
        left:      rf.growTree(leftX, leftY, depth+1),
        right:     rf.growTree(rightX, rightY, depth+1),
    }
}

// bestSplit picks maxFeatures distinct feature indices uniformly at random,
// then exhaustively scores every (feature, threshold) pair by information
// gain and returns the best pair.
func (rf *RandomForest) bestSplit(X [][]float64, y []int) (int, float64) {
    // Rejection-sample distinct indices; a map replaces the linear scan.
    chosen := make(map[int]bool, rf.maxFeatures)
    order := make([]int, 0, rf.maxFeatures)
    for len(order) < rf.maxFeatures {
        idx := rand.Intn(rf.nFeatures)
        if !chosen[idx] {
            chosen[idx] = true
            order = append(order, idx)
        }
    }

    bestGain := -1.0
    bestFeature, bestThreshold := 0, 0.0
    for _, featureIdx := range order {
        for _, threshold := range rf.uniqueThresholds(X, featureIdx) {
            if gain := rf.informationGain(y, X, featureIdx, threshold); gain > bestGain {
                bestGain = gain
                bestFeature = featureIdx
                bestThreshold = threshold
            }
        }
    }
    return bestFeature, bestThreshold
}

// informationGain returns the unnormalized impurity decrease of splitting y
// at (featureIdx, threshold): n*G(parent) - nL*G(left) - nR*G(right).
func (rf *RandomForest) informationGain(y []int, X [][]float64, featureIdx int, threshold float64) float64 {
    var leftLabels, rightLabels []int
    for i, sample := range X {
        if sample[featureIdx] <= threshold {
            leftLabels = append(leftLabels, y[i])
        } else {
            rightLabels = append(rightLabels, y[i])
        }
    }

    n, nLeft := len(y), len(leftLabels)
    if nLeft == 0 || nLeft == n {
        return 0 // degenerate split: everything fell on one side
    }

    return float64(n)*rf.giniIndex(y) - float64(nLeft)*rf.giniIndex(leftLabels) - float64(n-nLeft)*rf.giniIndex(rightLabels)
}

// giniIndex computes the Gini impurity of y: 1 - sum_k p_k^2.
func (rf *RandomForest) giniIndex(y []int) float64 {
    freq := map[int]int{}
    for _, label := range y {
        freq[label]++
    }

    impurity := 1.0
    total := float64(len(y))
    for _, c := range freq {
        p := float64(c) / total
        impurity -= p * p
    }
    return impurity
}

// uniqueLabels returns the distinct class labels present in y
// (order unspecified: map iteration).
func (rf *RandomForest) uniqueLabels(y []int) []int {
    seen := map[int]bool{}
    for _, label := range y {
        seen[label] = true
    }
    labels := make([]int, 0, len(seen))
    for label := range seen {
        labels = append(labels, label)
    }
    return labels
}

// uniqueThresholds returns the distinct values of one feature column; these
// are the candidate split points (order unspecified: map iteration).
func (rf *RandomForest) uniqueThresholds(X [][]float64, featureIdx int) []float64 {
    distinct := map[float64]bool{}
    for _, sample := range X {
        distinct[sample[featureIdx]] = true
    }
    out := make([]float64, 0, len(distinct))
    for v := range distinct {
        out = append(out, v)
    }
    return out
}

// mostCommonLabel returns the majority label of y (y must be non-empty).
// Go map iteration order is randomized, so the original loop broke count
// ties nondeterministically; ties are now broken toward the smaller label
// so leaf values are reproducible across runs.
func (rf *RandomForest) mostCommonLabel(y []int) int {
    counts := make(map[int]int)
    for _, label := range y {
        counts[label]++
    }

    maxCount := 0
    maxLabel := y[0]
    for label, count := range counts {
        if count > maxCount || (count == maxCount && label < maxLabel) {
            maxCount = count
            maxLabel = label
        }
    }
    return maxLabel
}

// Predict returns the majority-vote class for each sample. Ties between
// equally voted labels are broken toward the smaller label so results are
// deterministic (Go map iteration order is randomized).
func (rf *RandomForest) Predict(X [][]float64) []int {
    predictions := make([]int, len(X))
    for i, row := range X {
        votes := make(map[int]int)
        for _, tree := range rf.trees {
            votes[rf.predictTree(row, tree)]++
        }

        maxVotes := 0
        maxLabel := 0
        for label, count := range votes {
            if count > maxVotes || (count == maxVotes && label < maxLabel) {
                maxVotes = count
                maxLabel = label
            }
        }
        predictions[i] = maxLabel
    }
    return predictions
}

// predictTree descends iteratively until a leaf (value != nil) is reached
// and returns that leaf's class label.
func (rf *RandomForest) predictTree(x []float64, node *TreeNode) int {
    for node.value == nil {
        if x[node.feature] <= node.threshold {
            node = node.left
        } else {
            node = node.right
        }
    }
    return *node.value
}

func main() {
    // Toy dataset: two linearly separable classes.
    features := [][]float64{{1, 2}, {2, 3}, {3, 4}, {5, 6}, {6, 7}, {7, 8}}
    labels := []int{0, 0, 0, 1, 1, 1}

    forest := NewRandomForest(10, 3)
    forest.Fit(features, labels)

    fmt.Printf("预测类别:%v\n", forest.Predict([][]float64{{4, 5}, {6, 6}}))
}

👥 6. K 近邻 (K-Nearest Neighbors)

算法原理

KNN 是一种基于实例的懒惰学习算法。对于新样本,找到训练集中 K 个最近的邻居,通过多数投票或平均来确定预测值。

欧氏距离: d(x, y) = √Σ(xᵢ - yᵢ)²

曼哈顿距离: d(x, y) = Σ|xᵢ - yᵢ|

闵可夫斯基距离: d(x, y) = (Σ|xᵢ - yᵢ|^p)^(1/p)
训练时间复杂度
O(1)
预测时间复杂度
O(n × m)
空间复杂度
O(n × m)

✅ 优点

  • 简单易懂,无需训练
  • 天然支持多分类
  • 对异常值不敏感
  • 适合复杂决策边界

❌ 缺点

  • 预测速度慢
  • 需要特征缩放
  • 高维效果差
  • 样本不平衡敏感

代码实现

class KNN {
    /**
     * K-nearest-neighbours classifier (lazy learner, Euclidean distance).
     * @param {number} k number of neighbours consulted per prediction
     */
    constructor(k = 3) {
        this.k = k;
        this.XTrain = [];
        this.yTrain = [];
    }

    /** Memorize the training set; no actual optimization happens. */
    fit(X, y) {
        this.XTrain = X;
        this.yTrain = y;
    }

    /** Classify each row of X by majority vote among its k neighbours. */
    predict(X) {
        return X.map(x => this._predictOne(x));
    }

    _predictOne(x) {
        // Distance to every memorized sample; Array.sort is stable, so
        // equidistant neighbours keep their training order.
        const neighbors = this.XTrain
            .map((xTrain, i) => ({
                distance: this._euclideanDistance(x, xTrain),
                label: this.yTrain[i]
            }))
            .sort((a, b) => a.distance - b.distance)
            .slice(0, this.k);

        // Majority vote.
        // BUGFIX: Object keys coerced labels to strings, so predictions came
        // back as "0"/"1"; a Map preserves the original label values/types.
        const votes = new Map();
        for (const { label } of neighbors) {
            votes.set(label, (votes.get(label) || 0) + 1);
        }

        let best = neighbors[0].label;
        let bestCount = -1;
        for (const [label, count] of votes) {
            if (count > bestCount) {
                bestCount = count;
                best = label;
            }
        }
        return best;
    }

    _euclideanDistance(x1, x2) {
        return Math.sqrt(x1.reduce((sum, val, i) => sum + Math.pow(val - x2[i], 2), 0));
    }
}

// Usage example
const X = [[1, 2], [2, 3], [3, 4], [6, 7], [7, 8], [8, 9]];
const y = [0, 0, 0, 1, 1, 1];

// BUGFIX: `new KNN(k = 3)` assigned an accidental global variable;
// constructor arguments are positional in JavaScript.
const knn = new KNN(3);
knn.fit(X, y);

console.log('预测类别:', knn.predict([[4, 5], [5, 6]]));
import numpy as np
from collections import Counter

class KNN:
    """K-nearest-neighbours classifier (lazy learner, Euclidean distance)."""

    def __init__(self, k=3):
        self.k = k          # number of neighbours consulted per prediction
        self.XTrain = None  # memorized training features
        self.yTrain = None  # memorized training labels

    def fit(self, X, y):
        """Memorize the training set; no actual optimization happens."""
        self.XTrain = np.array(X)
        self.yTrain = np.array(y)

    def predict(self, X):
        """Classify each row of X by majority vote among its k neighbours."""
        return [self._predict_one(sample) for sample in X]

    def _predict_one(self, x):
        # Distance from x to every memorized sample.
        dists = [self._euclidean_distance(x, ref) for ref in self.XTrain]

        # Labels of the k closest samples, then majority vote.
        nearest = self.yTrain[np.argsort(dists)[:self.k]]
        return Counter(nearest).most_common(1)[0][0]

    def _euclidean_distance(self, x1, x2):
        diff = np.array(x1) - np.array(x2)
        return np.sqrt(np.sum(diff ** 2))

# Usage example
if __name__ == "__main__":
    from sklearn.datasets import make_classification
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import train_test_split

    X, y = make_classification(n_samples=100, n_features=4, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = KNN(k=5)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    print(f"Accuracy: {accuracy_score(y_test, predictions):.2f}")
package main

import (
    "fmt"
    "math"
    "sort"
)

// KNN is a k-nearest-neighbours classifier. It memorizes the training set
// and classifies queries by majority vote among the k closest training
// samples (Euclidean distance).
type KNN struct {
    k      int // number of neighbours consulted per prediction
    XTrain [][]float64 // memorized training features
    yTrain []int // memorized training labels
}

// NewKNN returns a classifier that votes among the k nearest neighbours.
func NewKNN(k int) *KNN {
    model := KNN{k: k}
    return &model
}

// Fit memorizes the training data; KNN is a lazy learner, so no work
// happens until Predict. The slices are retained, not copied, so callers
// should not mutate X or y afterwards.
func (knn *KNN) Fit(X [][]float64, y []int) {
    knn.XTrain = X
    knn.yTrain = y
}

// Predict classifies every row of X independently.
func (knn *KNN) Predict(X [][]float64) []int {
    out := make([]int, 0, len(X))
    for _, sample := range X {
        out = append(out, knn.predictOne(sample))
    }
    return out
}

// predictOne classifies a single sample by majority vote among its k
// nearest training points. Two determinism fixes over the naive version:
// sort.SliceStable keeps equidistant neighbours in training order (plain
// sort.Slice is unstable), and vote ties are broken toward the smaller
// label (Go map iteration order is randomized).
func (knn *KNN) predictOne(x []float64) int {
    type neighbor struct {
        distance float64
        label    int
    }

    // Distance to every memorized sample.
    neighbors := make([]neighbor, len(knn.XTrain))
    for i, xTrain := range knn.XTrain {
        neighbors[i] = neighbor{
            distance: knn.euclideanDistance(x, xTrain),
            label:    knn.yTrain[i],
        }
    }

    // Stable sort: equidistant neighbours keep their training-set order.
    sort.SliceStable(neighbors, func(i, j int) bool {
        return neighbors[i].distance < neighbors[j].distance
    })

    // Majority vote over the k nearest.
    votes := make(map[int]int)
    for i := 0; i < knn.k && i < len(neighbors); i++ {
        votes[neighbors[i].label]++
    }

    maxVotes := 0
    maxLabel := 0
    for label, count := range votes {
        if count > maxVotes || (count == maxVotes && label < maxLabel) {
            maxVotes = count
            maxLabel = label
        }
    }
    return maxLabel
}

// euclideanDistance is the standard L2 distance; assumes len(x1) == len(x2).
func (knn *KNN) euclideanDistance(x1, x2 []float64) float64 {
    total := 0.0
    for i, v := range x1 {
        d := v - x2[i]
        total += d * d
    }
    return math.Sqrt(total)
}

func main() {
    // Toy dataset: two well-separated classes.
    features := [][]float64{{1, 2}, {2, 3}, {3, 4}, {6, 7}, {7, 8}, {8, 9}}
    labels := []int{0, 0, 0, 1, 1, 1}

    model := NewKNN(3)
    model.Fit(features, labels)

    fmt.Printf("预测类别:%v\n", model.Predict([][]float64{{4, 5}, {5, 6}}))
}

📧 7. 朴素贝叶斯 (Naive Bayes)

算法原理

朴素贝叶斯基于贝叶斯定理,假设特征之间相互独立。通过计算后验概率来进行分类,特别适合文本分类任务。

贝叶斯定理: P(Y|X) = P(X|Y) × P(Y) / P(X)

朴素假设: P(X|Y) = Π P(xᵢ|Y)

分类决策: ŷ = argmax P(Y) × Π P(xᵢ|Y)
训练时间复杂度
O(n × m)
预测时间复杂度
O(k × m)
空间复杂度
O(k × m)

✅ 优点

  • 训练预测都很快
  • 小样本表现好
  • 支持多分类
  • 文本分类效果好

❌ 缺点

  • 特征独立假设强
  • 需要估计先验概率
  • 对输入数据敏感
  • 零概率问题

代码实现

class NaiveBayes {
    /**
     * Gaussian naive Bayes classifier: assumes features are conditionally
     * independent given the class, each modelled by a per-class Gaussian.
     */
    constructor() {
        this.classes = [];
        this.mean = {};
        this.var = {};
        this.priors = {};
    }

    /** Estimate per-class priors, feature means, and feature variances. */
    fit(X, y) {
        const nSamples = X.length;
        const nFeatures = X[0].length;

        this.classes = [...new Set(y)];

        for (const cls of this.classes) {
            const XCls = X.filter((_, i) => y[i] === cls);

            const mean = new Array(nFeatures).fill(0);
            const variance = new Array(nFeatures).fill(0);

            for (const row of XCls) {
                for (let j = 0; j < nFeatures; j++) {
                    mean[j] += row[j];
                }
            }
            for (let j = 0; j < nFeatures; j++) {
                mean[j] /= XCls.length;
            }

            for (const row of XCls) {
                for (let j = 0; j < nFeatures; j++) {
                    variance[j] += Math.pow(row[j] - mean[j], 2);
                }
            }
            for (let j = 0; j < nFeatures; j++) {
                // BUGFIX: epsilon keeps the log-density finite when a feature
                // has zero variance within a class; without it the posterior
                // became NaN/-Infinity. Matches this tutorial's Python/Go
                // versions, which already add 1e-9.
                variance[j] = variance[j] / XCls.length + 1e-9;
            }

            this.mean[cls] = mean;
            this.var[cls] = variance;
            this.priors[cls] = XCls.length / nSamples;
        }
    }

    /** Return the most probable class for each sample. */
    predict(X) {
        return X.map(x => {
            let maxPosterior = -Infinity;
            let predictedClass = this.classes[0];

            for (const cls of this.classes) {
                const posterior = this._calculatePosterior(x, cls);
                if (posterior > maxPosterior) {
                    maxPosterior = posterior;
                    predictedClass = cls;
                }
            }
            return predictedClass;
        });
    }

    // Log posterior up to an additive constant:
    // log P(cls) + sum_j log N(x_j; mean_j, var_j).
    _calculatePosterior(x, cls) {
        let logPosterior = Math.log(this.priors[cls]);

        for (let j = 0; j < x.length; j++) {
            const mean = this.mean[cls][j];
            const variance = this.var[cls][j];

            logPosterior -= 0.5 * Math.log(2 * Math.PI * variance);
            logPosterior -= Math.pow(x[j] - mean, 2) / (2 * variance);
        }

        return logPosterior;
    }
}

// Usage example
const X = [[1.0, 2.0], [2.0, 3.0], [3.0, 4.0], [6.0, 7.0], [7.0, 8.0], [8.0, 9.0]];
const y = [0, 0, 0, 1, 1, 1];

const nb = new NaiveBayes();
nb.fit(X, y);

const predictions = nb.predict([[4.0, 5.0], [5.0, 6.0]]);
console.log('预测类别:', predictions);
import numpy as np

class NaiveBayes:
    """Gaussian naive Bayes classifier.

    Assumes features are conditionally independent given the class and
    models each feature with a per-class Gaussian.
    """

    def __init__(self):
        self.classes = None  # distinct labels seen in fit
        self.mean = {}       # class -> per-feature means
        self.var = {}        # class -> per-feature variances (+1e-9)
        self.priors = {}     # class -> P(class)

    def fit(self, X, y):
        """Estimate per-class priors, feature means, and feature variances."""
        self.classes = np.unique(y)
        n_samples = X.shape[0]

        for cls in self.classes:
            rows = X[y == cls]
            self.priors[cls] = len(rows) / n_samples
            self.mean[cls] = np.mean(rows, axis=0)
            # Epsilon keeps the density finite for zero-variance features.
            self.var[cls] = np.var(rows, axis=0) + 1e-9

    def predict(self, X):
        """Return the most probable class for each row of X."""
        return [self._predict_one(sample) for sample in X]

    def _predict_one(self, x):
        # Log posterior (up to a constant): log prior + summed log likelihoods.
        scores = [
            np.log(self.priors[cls])
            + np.sum(np.log(self._gaussian_pdf(x, self.mean[cls], self.var[cls])))
            for cls in self.classes
        ]
        return self.classes[np.argmax(scores)]

    def _gaussian_pdf(self, x, mean, var):
        # Vectorized univariate normal density, evaluated per feature.
        return (1 / np.sqrt(2 * np.pi * var)) * np.exp(-((x - mean) ** 2) / (2 * var))

# Usage example
if __name__ == "__main__":
    from sklearn.datasets import make_classification
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import train_test_split

    X, y = make_classification(n_samples=100, n_features=4, n_informative=4, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = NaiveBayes()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    print(f"Accuracy: {accuracy_score(y_test, predictions):.2f}")
package main

import (
    "fmt"
    "math"
    "sort"
)

// NaiveBayes is a Gaussian naive Bayes classifier: features are assumed
// conditionally independent given the class, and each feature is modelled
// by a per-class Gaussian.
//
// BUGFIX: the variance field was previously named "var", which is a Go
// keyword and does not compile; it is now "variance".
type NaiveBayes struct {
    classes  []int             // distinct labels, sorted ascending
    mean     map[int][]float64 // class -> per-feature means
    variance map[int][]float64 // class -> per-feature variances (+1e-9)
    priors   map[int]float64   // class -> P(class)
}

// NewNaiveBayes returns an untrained classifier.
func NewNaiveBayes() *NaiveBayes {
    return &NaiveBayes{
        mean:     make(map[int][]float64),
        variance: make(map[int][]float64),
        priors:   make(map[int]float64),
    }
}

// Fit estimates per-class priors, feature means, and feature variances.
func (nb *NaiveBayes) Fit(X [][]float64, y []int) {
    nSamples := len(X)
    nFeatures := len(X[0])

    // Distinct class labels; sorted so iteration (and tie-breaking in
    // predictOne) is deterministic across runs.
    classSet := make(map[int]bool)
    for _, label := range y {
        classSet[label] = true
    }
    nb.classes = make([]int, 0, len(classSet))
    for cls := range classSet {
        nb.classes = append(nb.classes, cls)
    }
    sort.Ints(nb.classes)

    for _, cls := range nb.classes {
        var rows [][]float64
        for i, label := range y {
            if label == cls {
                rows = append(rows, X[i])
            }
        }

        nb.priors[cls] = float64(len(rows)) / float64(nSamples)

        // Per-feature mean.
        mean := make([]float64, nFeatures)
        for _, row := range rows {
            for j, v := range row {
                mean[j] += v
            }
        }
        for j := range mean {
            mean[j] /= float64(len(rows))
        }
        nb.mean[cls] = mean

        // Per-feature variance, with a small epsilon so the Gaussian
        // density stays finite when a feature is constant within a class.
        variance := make([]float64, nFeatures)
        for _, row := range rows {
            for j, v := range row {
                d := v - mean[j]
                variance[j] += d * d
            }
        }
        for j := range variance {
            variance[j] = variance[j]/float64(len(rows)) + 1e-9
        }
        nb.variance[cls] = variance
    }
}

// Predict returns the most probable class for each sample.
func (nb *NaiveBayes) Predict(X [][]float64) []int {
    predictions := make([]int, len(X))
    for i, x := range X {
        predictions[i] = nb.predictOne(x)
    }
    return predictions
}

// predictOne picks the class maximizing
// log P(class) + sum_j log N(x_j; mean_j, variance_j).
func (nb *NaiveBayes) predictOne(x []float64) int {
    maxPosterior := math.Inf(-1)
    predictedClass := nb.classes[0]

    for _, cls := range nb.classes {
        posterior := math.Log(nb.priors[cls])
        for j := range x {
            posterior += math.Log(nb.gaussianPdf(x[j], nb.mean[cls][j], nb.variance[cls][j]))
        }
        if posterior > maxPosterior {
            maxPosterior = posterior
            predictedClass = cls
        }
    }
    return predictedClass
}

// gaussianPdf is the univariate normal density.
func (nb *NaiveBayes) gaussianPdf(x, mean, variance float64) float64 {
    return (1 / math.Sqrt(2*math.Pi*variance)) * math.Exp(-math.Pow(x-mean, 2)/(2*variance))
}

func main() {
    X := [][]float64{{1.0, 2.0}, {2.0, 3.0}, {3.0, 4.0}, {6.0, 7.0}, {7.0, 8.0}, {8.0, 9.0}}
    y := []int{0, 0, 0, 1, 1, 1}

    nb := NewNaiveBayes()
    nb.Fit(X, y)

    preds := nb.Predict([][]float64{{4.0, 5.0}, {5.0, 6.0}})
    fmt.Printf("预测类别:%v\n", preds)
}