
《Machine Learning in Python: 10_08 Ensemble Learning - Bagging / RandomForest Implementation》


1. Introduction

To make the base learners even more diverse, the idea behind RandomForest is to add one more round of random sampling on top of bagging, this time over the features: each base learner is trained on a bootstrap sample of the training rows plus a random subset of the feature columns, and that column subset is reused at prediction time. The rough flow is as follows:
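A minimal sketch of these two sampling steps, using plain NumPy only (the toy arrays and names below are illustrative and not part of ml_models):

import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(size=(100, 5))            # toy data: 100 samples, 5 features
y = (x[:, 0] > 0).astype(int)

feature_sample = 0.66                    # fraction of features given to each base learner
n_sample, n_feature = x.shape

# 1) bagging step: bootstrap-resample the rows (with replacement)
row_idx = rng.choice(n_sample, n_sample, replace=True)
# 2) random-forest step: subsample the columns (without replacement)
col_idx = rng.choice(n_feature, int(n_feature * feature_sample), replace=False)

x_bootstrap = x[row_idx][:, col_idx]
y_bootstrap = y[row_idx]
# a base learner is then fit on (x_bootstrap, y_bootstrap); col_idx must be
# remembered so that prediction later uses the same columns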

2. RandomForest: Classification Implementation

import os
os.chdir('../')
from ml_models import utils
from ml_models.tree import CARTClassifier
import copy
import numpy as np

"""
RandomForest classification implementation, packaged into ml_models.ensemble
"""


class RandomForestClassifier(object):
    def __init__(self, base_estimator=None, n_estimators=10, feature_sample=0.66):
        """
        :param base_estimator: base learner; heterogeneous learners are allowed and are passed in as a list,
                               e.g. [estimator1, estimator2, ..., estimator10], in which case n_estimators is
                               ignored; in the homogeneous case the single estimator is copied n_estimators times
        :param n_estimators: number of base learners
        :param feature_sample: feature sampling rate
        """
        self.base_estimator = base_estimator
        self.n_estimators = n_estimators
        if self.base_estimator is None:
            # default to a decision tree
            self.base_estimator = CARTClassifier()
        # homogeneous classifiers
        if type(base_estimator) != list:
            estimator = self.base_estimator
            self.base_estimator = [copy.deepcopy(estimator) for _ in range(0, self.n_estimators)]
        # heterogeneous classifiers
        else:
            self.n_estimators = len(self.base_estimator)
        self.feature_sample = feature_sample
        # record the features selected by each base learner
        self.feature_indices = []

    def fit(self, x, y):
        # TODO: parallelize
        n_sample, n_feature = x.shape
        for estimator in self.base_estimator:
            # bootstrap-resample the training set
            indices = np.random.choice(n_sample, n_sample, replace=True)
            x_bootstrap = x[indices]
            y_bootstrap = y[indices]
            # subsample the features
            feature_indices = np.random.choice(n_feature, int(n_feature * self.feature_sample), replace=False)
            self.feature_indices.append(feature_indices)
            x_bootstrap = x_bootstrap[:, feature_indices]
            estimator.fit(x_bootstrap, y_bootstrap)

    def predict_proba(self, x):
        # TODO: parallelize
        probas = []
        for index, estimator in enumerate(self.base_estimator):
            probas.append(estimator.predict_proba(x[:, self.feature_indices[index]]))
        return np.mean(probas, axis=0)

    def predict(self, x):
        return np.argmax(self.predict_proba(x), axis=1)
# generate synthetic data
from sklearn.datasets import make_classification

data, target = make_classification(n_samples=100, n_features=2, n_classes=2, n_informative=1, n_redundant=0,
                                    n_repeated=0, n_clusters_per_class=1, class_sep=.5, random_state=21)
# homogeneous base learners
classifier = RandomForestClassifier(feature_sample=0.6)
classifier.fit(data, target)
utils.plot_decision_function(data, target, classifier)

# heterogeneous base learners
from ml_models.linear_model import LogisticRegression
from ml_models.svm import SVC

classifier = RandomForestClassifier(base_estimator=[LogisticRegression(), SVC(kernel='rbf', C=5.0), CARTClassifier(max_depth=2)],
                                    feature_sample=0.6)
classifier.fit(data, target)
utils.plot_decision_function(data, target, classifier)
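As an optional sanity check (assuming scikit-learn is installed; this is not part of the ml_models code), the same toy data can be fed to scikit-learn's built-in random forest for a rough point of comparison:

# cross-check against scikit-learn's reference implementation
from sklearn.ensemble import RandomForestClassifier as SkRandomForestClassifier

sk_clf = SkRandomForestClassifier(n_estimators=10, max_features=0.6, random_state=0)
sk_clf.fit(data, target)
print(sk_clf.score(data, target))  # training accuracy, only for a rough comparison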

3. Code Implementation: Regression

from ml_models.tree import CARTRegressor

"""
RandomForest regression implementation, packaged into ml_models.ensemble
"""


class RandomForestRegressor(object):
    def __init__(self, base_estimator=None, n_estimators=10, feature_sample=0.66):
        """
        :param base_estimator: base learner; heterogeneous learners are allowed and are passed in as a list,
                               e.g. [estimator1, estimator2, ..., estimator10], in which case n_estimators is
                               ignored; in the homogeneous case the single estimator is copied n_estimators times
        :param n_estimators: number of base learners
        :param feature_sample: feature sampling rate
        """
        self.base_estimator = base_estimator
        self.n_estimators = n_estimators
        if self.base_estimator is None:
            # default to a decision tree
            self.base_estimator = CARTRegressor()
        # homogeneous
        if type(base_estimator) != list:
            estimator = self.base_estimator
            self.base_estimator = [copy.deepcopy(estimator) for _ in range(0, self.n_estimators)]
        # heterogeneous
        else:
            self.n_estimators = len(self.base_estimator)
        self.feature_sample = feature_sample
        # record the features selected by each base learner
        self.feature_indices = []

    def fit(self, x, y):
        # TODO: parallelize
        n_sample, n_feature = x.shape
        for estimator in self.base_estimator:
            # bootstrap-resample the training set
            indices = np.random.choice(n_sample, n_sample, replace=True)
            x_bootstrap = x[indices]
            y_bootstrap = y[indices]
            # subsample the features
            feature_indices = np.random.choice(n_feature, int(n_feature * self.feature_sample), replace=False)
            self.feature_indices.append(feature_indices)
            x_bootstrap = x_bootstrap[:, feature_indices]
            estimator.fit(x_bootstrap, y_bootstrap)

    def predict(self, x):
        # TODO: parallelize
        preds = []
        for index, estimator in enumerate(self.base_estimator):
            preds.append(estimator.predict(x[:, self.feature_indices[index]]))
        return np.mean(preds, axis=0)
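A small usage note on the constructor logic above, sketched with CARTRegressor as the base learner: passing a single estimator deep-copies it n_estimators times, while passing a list takes n_estimators from the list length.

# homogeneous: one estimator deep-copied n_estimators times
reg_homo = RandomForestRegressor(base_estimator=CARTRegressor(), n_estimators=5)
print(len(reg_homo.base_estimator))  # 5

# heterogeneous: n_estimators is taken from the length of the list
reg_hetero = RandomForestRegressor(base_estimator=[CARTRegressor(), CARTRegressor()])
print(reg_hetero.n_estimators)       # 2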
# construct the data
data = np.linspace(1, 10, num=100)
target1 = 3 * data[:50] + np.random.random(size=50) * 3    # add noise
target2 = 3 * data[50:] + np.random.random(size=50) * 10   # add noise
target = np.concatenate([target1, target2])
data = data.reshape((-1, 1))
# homogeneous base learners
import matplotlib.pyplot as plt

# feature_sample=1 because there is only a single feature column, so no real feature sampling is possible
model = RandomForestRegressor(base_estimator=CARTRegressor(), n_estimators=2, feature_sample=1)
model.fit(data, target)
plt.scatter(data, target)
plt.plot(data, model.predict(data), color='r')
[Figure: scatter of the training data with the homogeneous random forest's fitted curve in red]

# heterogeneous base learners
from ml_models.linear_model import LinearRegression

model = RandomForestRegressor(base_estimator=[LinearRegression(), CARTRegressor()], feature_sample=1)
model.fit(data, target)
plt.scatter(data, target)
plt.plot(data, model.predict(data), color='r')
[Figure: scatter of the training data with the heterogeneous random forest's fitted curve in red]
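The same kind of optional cross-check against scikit-learn works for the regression case too (again assuming scikit-learn is installed; not part of the original code):

# cross-check against scikit-learn's reference implementation
from sklearn.ensemble import RandomForestRegressor as SkRandomForestRegressor

sk_reg = SkRandomForestRegressor(n_estimators=10, random_state=0)
sk_reg.fit(data, target)
plt.scatter(data, target)
plt.plot(data, sk_reg.predict(data), color='g')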
