Tianchi Beginner Competition: Industrial Steam Prediction (Rank 150)
Published: 2021-06-29 19:49:15 | Category: Technical Articles


Competition Background

The basic principle of thermal power generation is: burning fuel heats water into steam, the steam pressure drives a turbine, and the turbine drives a generator to produce electricity. In this chain of energy conversions, the key factor for generation efficiency is the boiler's combustion efficiency, that is, how effectively fuel combustion heats water into high-temperature, high-pressure steam. Combustion efficiency is affected by many factors, including adjustable boiler parameters such as fuel feed rate, primary and secondary air, induced draft, material-return air, and feedwater flow, as well as boiler operating conditions such as bed temperature and bed pressure, furnace temperature and pressure, and superheater temperature.

Problem Description

Given desensitized boiler sensor data (sampled at minute-level frequency), predict the amount of steam produced from the boiler's operating conditions.

Data Description

The data is split into training data (train.txt) and test data (test.txt). The 38 fields "V0"-"V37" are the feature variables and "target" is the target variable. Contestants train a model on the training data and predict the target variable for the test data; rankings are based on the MSE (mean squared error) of the predictions.
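
For concreteness, the leaderboard metric can be reproduced with scikit-learn (a minimal illustration of mine, not part of the original write-up; the numbers are made up):

import numpy as np
from sklearn.metrics import mean_squared_error

y_true = np.array([0.5, -0.2, 1.3])  # illustrative ground-truth targets
y_pred = np.array([0.4, 0.0, 1.1])   # illustrative predictions
# MSE = mean of squared residuals; lower is better on the leaderboard
print(mean_squared_error(y_true, y_pred))  # (0.01 + 0.04 + 0.04) / 3 = 0.03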

Approach:

Plot each feature's training-set and test-set distributions; drop any feature whose two distributions disagree (see the KS-test sketch after this list)
Use variance filtering judiciously to screen features
Use a Ridge model to find and remove outliers
Stack three models: RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
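
The post checks train/test distribution agreement by eye from KDE plots. As a quantitative complement (my sketch, not part of the original pipeline), a per-feature two-sample Kolmogorov-Smirnov test flags the same mismatches numerically:

import pandas as pd
from scipy.stats import ks_2samp

train = pd.read_csv("zhengqi_train.txt", sep="\t")
test = pd.read_csv("zhengqi_test.txt", sep="\t")

# a small p-value suggests the feature is distributed differently
# in train and test, making it a candidate for removal
for col in test.columns:  # the test file has only the V0-V37 features
    stat, p = ks_2samp(train[col], test[col])
    if p < 0.01:
        print(f"{col}: KS={stat:.3f}, p={p:.2e} -> candidate to drop")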


Three models are shared here.

Model 1: score not submitted, roughly between 0.11 and 0.15

import os
import warnings

# helper for appending one row to a CSV/text file -- adjust to your own project layout
from utils.read_write import writeOneCsv

warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt

plt.rcParams.update({'figure.max_open_warning': 0})
import seaborn as sns
# modelling
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.model_selection import GridSearchCV, RepeatedKFold, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor

os.chdir(r'E:\项目文件\工业蒸汽量预测\\')

# load dataset
with open("zhengqi_train.txt") as fr:
    data_train = pd.read_table(fr, sep="\t")
with open("zhengqi_test.txt") as fr_test:
    data_test = pd.read_table(fr_test, sep="\t")

# merge train set and test set
data_train["oringin"] = "train"
data_test["oringin"] = "test"
data_all = pd.concat([data_train, data_test], axis=0, ignore_index=True)
# view data
data_all.head()


# explore feature distributions
# fig = plt.figure(figsize=(6, 6))
def plot_train_test():
    for column in data_all.columns[0:-2]:
        g = sns.kdeplot(data_all[column][(data_all["oringin"] == "train")], color="Red", shade=True)
        g = sns.kdeplot(data_all[column][(data_all["oringin"] == "test")], ax=g, color="Blue", shade=True)
        g.set_xlabel(column)
        g.set_ylabel("Frequency")
        g = g.legend(["train", "test"])
        plt.show()


fig = plt.figure(figsize=(10, 10))
for i in range(len(data_all.columns) - 2):
    g = sns.FacetGrid(data_all, col='oringin')
    g = g.map(sns.distplot, data_all.columns[i])


def del_plot():
    # features "V5","V9","V11","V17","V22","V28" are distributed differently
    # in the training and test sets, so they are dropped below
    for column in ["V5", "V9", "V11", "V17", "V22", "V28"]:
        g = sns.kdeplot(data_all[column][(data_all["oringin"] == "train")], color="Red", shade=True)
        g = sns.kdeplot(data_all[column][(data_all["oringin"] == "test")], ax=g, color="Blue", shade=True)
        g.set_xlabel(column)
        g.set_ylabel("Frequency")
        g = g.legend(["train", "test"])
        plt.show()


data_all.drop(["V5", "V9", "V11", "V17", "V22", "V28"], axis=1, inplace=True)

# figure parameters
data_train1 = data_all[data_all["oringin"] == "train"].drop("oringin", axis=1)
fcols = 2
frows = len(data_train.columns)
plt.figure(figsize=(5 * fcols, 4 * frows))
i = 0
for col in data_train1.columns:
    i += 1
    ax = plt.subplot(frows, fcols, i)
    sns.regplot(x=col, y='target', data=data_train, ax=ax,
                scatter_kws={'marker': '.', 's': 3, 'alpha': 0.3},
                line_kws={'color': 'k'})
    plt.xlabel(col)
    plt.ylabel('target')
    i += 1
    ax = plt.subplot(frows, fcols, i)
    sns.distplot(data_train[col].dropna(), fit=stats.norm)
    plt.xlabel(col)

# threshold for dropping features weakly correlated with the target
threshold = 0.1
# absolute value correlation matrix
corr_matrix = data_train1.corr().abs()
drop_col = corr_matrix[corr_matrix["target"] < threshold].index
data_all.drop(drop_col, axis=1, inplace=True)

# normalise numeric columns
cols_numeric = list(data_all.columns)
cols_numeric.remove("oringin")


def scale_minmax(col):
    return (col - col.min()) / (col.max() - col.min())


scale_cols = [col for col in cols_numeric if col != 'target']
data_all[scale_cols] = data_all[scale_cols].apply(scale_minmax, axis=0)
data_all[scale_cols].describe()

# check effect of Box-Cox transforms on distributions of continuous variables
fcols = 6
frows = len(cols_numeric) - 1
plt.figure(figsize=(4 * fcols, 4 * frows))
i = 0
for var in cols_numeric:
    if var != 'target':
        dat = data_all[[var, 'target']].dropna()
        i += 1
        plt.subplot(frows, fcols, i)
        sns.distplot(dat[var], fit=stats.norm)
        plt.title(var + ' Original')
        plt.xlabel('')
        i += 1
        plt.subplot(frows, fcols, i)
        _ = stats.probplot(dat[var], plot=plt)
        plt.title('skew=' + '{:.4f}'.format(stats.skew(dat[var])))
        plt.xlabel('')
        plt.ylabel('')
        i += 1
        plt.subplot(frows, fcols, i)
        plt.plot(dat[var], dat['target'], '.', alpha=0.5)
        plt.title('corr=' + '{:.2f}'.format(np.corrcoef(dat[var], dat['target'])[0][1]))
        i += 1
        plt.subplot(frows, fcols, i)
        trans_var, lambda_var = stats.boxcox(dat[var].dropna() + 1)
        trans_var = scale_minmax(trans_var)
        sns.distplot(trans_var, fit=stats.norm)
        plt.title(var + ' Transformed')
        plt.xlabel('')
        i += 1
        plt.subplot(frows, fcols, i)
        _ = stats.probplot(trans_var, plot=plt)
        plt.title('skew=' + '{:.4f}'.format(stats.skew(trans_var)))
        plt.xlabel('')
        plt.ylabel('')
        i += 1
        plt.subplot(frows, fcols, i)
        plt.plot(trans_var, dat['target'], '.', alpha=0.5)
        plt.title('corr=' + '{:.2f}'.format(np.corrcoef(trans_var, dat['target'])[0][1]))

cols_transform = data_all.columns[0:-2]
for col in cols_transform:
    # transform column
    data_all.loc[:, col], _ = stats.boxcox(data_all.loc[:, col] + 1)

print(data_all.target.describe())
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
sns.distplot(data_all.target.dropna(), fit=stats.norm)
plt.subplot(1, 2, 2)
_ = stats.probplot(data_all.target.dropna(), plot=plt)

# exponential transform of the target (1.5**y) to improve normality
sp = data_train.target
data_train.target1 = np.power(1.5, sp)
print(data_train.target1.describe())
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
sns.distplot(data_train.target1.dropna(), fit=stats.norm)
plt.subplot(1, 2, 2)
_ = stats.probplot(data_train.target1.dropna(), plot=plt)


def show_plot():
    for column in data_all.columns[0:-2]:
        g = sns.kdeplot(data_all[column][(data_all["oringin"] == "train")], color="Red", shade=True)
        g = sns.kdeplot(data_all[column][(data_all["oringin"] == "test")], ax=g, color="Blue", shade=True)
        g.set_xlabel(column)
        g.set_ylabel("Frequency")
        g = g.legend(["train", "test"])
        plt.show()


# function to get training samples
def get_training_data():
    # extract training samples
    from sklearn.model_selection import train_test_split
    df_train = data_all[data_all["oringin"] == "train"]
    df_train["label"] = data_train.target1
    # split target and features
    y = df_train.target
    X = df_train.drop(["oringin", "target", "label"], axis=1)
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=100)
    return X_train, X_valid, y_train, y_valid


# extract test data (without the target)
def get_test_data():
    df_test = data_all[data_all["oringin"] == "test"].reset_index(drop=True)
    return df_test.drop(["oringin", "target"], axis=1)


from sklearn.metrics import make_scorer


# metric for evaluation
def rmse(y_true, y_pred):
    diff = y_pred - y_true
    sum_sq = sum(diff ** 2)
    n = len(y_pred)
    return np.sqrt(sum_sq / n)


def mse(y_ture, y_pred):
    return mean_squared_error(y_ture, y_pred)


# scorers to be used in sklearn model fitting
rmse_scorer = make_scorer(rmse, greater_is_better=False)
mse_scorer = make_scorer(mse, greater_is_better=False)


# function to detect outliers based on the predictions of a model
def find_outliers(model, X, y, sigma=3):
    # predict y values using the model
    try:
        y_pred = pd.Series(model.predict(X), index=y.index)
    # if predicting fails, try fitting the model first
    except Exception:
        model.fit(X, y)
        y_pred = pd.Series(model.predict(X), index=y.index)

    # calculate residuals between the model prediction and true y values
    resid = y - y_pred
    mean_resid = resid.mean()
    std_resid = resid.std()

    # calculate z statistic, define outliers to be where |z| > sigma
    z = (resid - mean_resid) / std_resid
    outliers = z[abs(z) > sigma].index

    # print and plot the results
    print('R2=', model.score(X, y))
    print('rmse=', rmse(y, y_pred))
    print("mse=", mean_squared_error(y, y_pred))
    print('---------------------------------------')
    print('mean of residuals:', mean_resid)
    print('std of residuals:', std_resid)
    print('---------------------------------------')
    print(len(outliers), 'outliers:')
    print(outliers.tolist())

    plt.figure(figsize=(15, 5))
    ax_131 = plt.subplot(1, 3, 1)
    plt.plot(y, y_pred, '.')
    plt.plot(y.loc[outliers], y_pred.loc[outliers], 'ro')
    plt.legend(['Accepted', 'Outlier'])
    plt.xlabel('y')
    plt.ylabel('y_pred')

    ax_132 = plt.subplot(1, 3, 2)
    plt.plot(y, y - y_pred, '.')
    plt.plot(y.loc[outliers], y.loc[outliers] - y_pred.loc[outliers], 'ro')
    plt.legend(['Accepted', 'Outlier'])
    plt.xlabel('y')
    plt.ylabel('y - y_pred')

    ax_133 = plt.subplot(1, 3, 3)
    z.plot.hist(bins=50, ax=ax_133)
    z.loc[outliers].plot.hist(color='r', bins=50, ax=ax_133)
    plt.legend(['Accepted', 'Outlier'])
    plt.xlabel('z')
    plt.savefig('outliers.png')
    return outliers


# get training data
from sklearn.linear_model import Ridge

X_train, X_valid, y_train, y_valid = get_training_data()
test = get_test_data()

# find and remove outliers using a Ridge model
outliers = find_outliers(Ridge(), X_train, y_train)

# permanently remove these outliers from the data
# df_train = data_all[data_all["oringin"]=="train"]
# df_train["label"]=data_train.target1
# df_train=df_train.drop(outliers)
X_outliers = X_train.loc[outliers]
y_outliers = y_train.loc[outliers]
X_t = X_train.drop(outliers)
y_t = y_train.drop(outliers)


def get_trainning_data_omitoutliers():
    y1 = y_t.copy()
    X1 = X_t.copy()
    return X1, y1


def train_model(model, param_grid, X=[], y=[], splits=5, repeats=5):
    # get unmodified training data, unless data to use already specified
    if len(y) == 0:
        X, y = get_trainning_data_omitoutliers()
        # poly_trans=PolynomialFeatures(degree=2)
        # X=poly_trans.fit_transform(X)
        # X=MinMaxScaler().fit_transform(X)

    # create cross-validation method
    rkfold = RepeatedKFold(n_splits=splits, n_repeats=repeats)

    # perform a grid search if param_grid given
    if len(param_grid) > 0:
        # setup grid search parameters
        gsearch = GridSearchCV(model, param_grid, cv=rkfold,
                               scoring="neg_mean_squared_error",
                               verbose=1, return_train_score=True)
        # search the grid
        gsearch.fit(X, y)
        # extract best model from the grid
        model = gsearch.best_estimator_
        best_idx = gsearch.best_index_
        # get cv-scores for best model
        grid_results = pd.DataFrame(gsearch.cv_results_)
        cv_mean = abs(grid_results.loc[best_idx, 'mean_test_score'])
        cv_std = grid_results.loc[best_idx, 'std_test_score']
    # no grid search, just cross-val score for given model
    else:
        grid_results = []
        cv_results = cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=rkfold)
        cv_mean = abs(np.mean(cv_results))
        cv_std = np.std(cv_results)

    # combine mean and std cv-score into a pandas series
    cv_score = pd.Series({'mean': cv_mean, 'std': cv_std})

    # predict y using the fitted model
    y_pred = model.predict(X)

    # print stats on model performance
    print('----------------------')
    print(model)
    print('----------------------')
    print('score=', model.score(X, y))
    print('rmse=', rmse(y, y_pred))
    print('mse=', mse(y, y_pred))
    print('cross_val: mean=', cv_mean, ', std=', cv_std)

    # residual plots
    y_pred = pd.Series(y_pred, index=y.index)
    resid = y - y_pred
    mean_resid = resid.mean()
    std_resid = resid.std()
    z = (resid - mean_resid) / std_resid
    n_outliers = sum(abs(z) > 3)

    plt.figure(figsize=(15, 5))
    ax_131 = plt.subplot(1, 3, 1)
    plt.plot(y, y_pred, '.')
    plt.xlabel('y')
    plt.ylabel('y_pred')
    plt.title('corr = {:.3f}'.format(np.corrcoef(y, y_pred)[0][1]))
    ax_132 = plt.subplot(1, 3, 2)
    plt.plot(y, y - y_pred, '.')
    plt.xlabel('y')
    plt.ylabel('y - y_pred')
    plt.title('std resid = {:.3f}'.format(std_resid))
    ax_133 = plt.subplot(1, 3, 3)
    z.plot.hist(bins=50, ax=ax_133)
    plt.xlabel('z')
    plt.title('{:.0f} samples with z>3'.format(n_outliers))

    return model, cv_score, grid_results


# places to store optimal models and scores
opt_models = dict()
score_models = pd.DataFrame(columns=['mean', 'std'])
# no. k-fold splits
splits = 5
# no. k-fold iterations
repeats = 5

model = 'GradientBoosting'
opt_models[model] = GradientBoostingRegressor()
param_grid = {'learning_rate': [0.75],
              'max_depth': range(22, 30, 9),
              'n_estimators': range(66, 85, 9)}
opt_models[model], cv_score, grid_results = train_model(opt_models[model], param_grid=param_grid,
                                                        splits=splits, repeats=1)
print(grid_results)
cv_score.name = model
score_models = score_models.append(cv_score)

model = 'ExtraTreesRegressor'
opt_models[model] = ExtraTreesRegressor()
param_grid = {'max_depth': range(22, 39, 9),
              'n_estimators': range(88, 99, 9)}
opt_models[model], cv_score, grid_results = train_model(opt_models[model], param_grid=param_grid,
                                                        splits=splits, repeats=1)
print(grid_results)
cv_score.name = model
score_models = score_models.append(cv_score)

model = 'RandomForest'
opt_models[model] = RandomForestRegressor()
param_grid = {'max_depth': range(33, 35, 9),
              'n_estimators': range(73, 77, 9)}
opt_models[model], cv_score, grid_results = train_model(opt_models[model], param_grid=param_grid,
                                                        splits=5, repeats=1)
print(grid_results)
cv_score.name = model
score_models = score_models.append(cv_score)


def model_predict(test_data, test_y=[], stack=False):
    # poly_trans=PolynomialFeatures(degree=2)
    # test_data1=poly_trans.fit_transform(test_data)
    # test_data=MinMaxScaler().fit_transform(test_data)
    i = 0
    y_predict_total = np.zeros((test_data.shape[0],))
    if stack:
        for model in metal_models.keys():
            y_predict = metal_models[model].predict(test_data)
            y_predict_total += y_predict
            i += 1
            if len(test_y) > 0:
                print("{}_mse:".format(model), mean_squared_error(y_predict, test_y))
        # average the meta-level predictions
        y_predict_mean = y_predict_total / i
        for i in range(0, y_predict_mean.shape[0]):
            y_predict_mean[i] = round(y_predict_mean[i], 6)
            print(y_predict_mean[i])
            writeOneCsv([y_predict_mean[i]], 'tianchi.txt')
        if len(test_y) > 0:
            print("mean_mse:", mean_squared_error(y_predict_mean, test_y))
        else:
            y_metal_mean = pd.Series(y_predict_mean)
            return y_metal_mean
    else:
        for model in opt_models.keys():
            y_predict = opt_models[model].predict(test_data)
            y_predict_total += y_predict
            i += 1
            if len(test_y) > 0:
                print("{}_mse:".format(model), mean_squared_error(y_predict, test_y))
        y_predict_mean = np.round(y_predict_total / i, 3)
        if len(test_y) > 0:
            print("mean_mse:", mean_squared_error(y_predict_mean, test_y))
        else:
            y_predict_mean = pd.Series(y_predict_mean)
            return y_predict_mean


model_predict(X_valid, y_valid)


# first-level predictions become the features of the second (meta) level
def create_stack_features(test_data):
    features = {}
    columns = []
    for model in opt_models.keys():
        columns.append(model)
        features[model] = opt_models[model].predict(test_data)
    stack_feature = pd.DataFrame(features, columns=columns)
    return stack_feature


# build the meta-level ("metal") data
metal_x_train = create_stack_features(X_t)
metal_y_train = pd.Series(y_t.values)
metal_x_valid = create_stack_features(X_valid)
metal_y_valid = pd.Series(y_valid.values)
metal_x_test = create_stack_features(test)

# places to store meta-level models and scores
metal_models = dict()
# no. k-fold splits
splits = 5

model = 'ExtraTreesRegressor'
metal_models[model] = ExtraTreesRegressor()
param_grid = {'max_depth': range(22, 39, 9),
              'n_estimators': range(88, 99, 9)}
metal_models[model], cv_score, grid_results = train_model(metal_models[model], param_grid=param_grid,
                                                          X=metal_x_train, y=metal_y_train,
                                                          splits=splits, repeats=1)
print(grid_results)
cv_score.name = model
score_models = score_models.append(cv_score)

model = 'GradientBoosting'
metal_models[model] = GradientBoostingRegressor()
param_grid = {'learning_rate': [0.75],
              'max_depth': range(25, 30, 5),
              'n_estimators': range(80, 85, 5)}
metal_models[model], cv_score, grid_results = train_model(metal_models[model], param_grid=param_grid,
                                                          X=metal_x_train, y=metal_y_train,
                                                          splits=splits, repeats=1)
print(grid_results)
cv_score.name = model
score_models = score_models.append(cv_score)

model = 'RandomForest'
metal_models[model] = RandomForestRegressor()
param_grid = {'max_depth': range(22, 35, 9),
              'n_estimators': range(66, 77, 9)}
metal_models[model], cv_score, grid_results = train_model(metal_models[model], param_grid=param_grid,
                                                          X=metal_x_train, y=metal_y_train,
                                                          splits=5, repeats=1)
print(grid_results)
cv_score.name = model
score_models = score_models.append(cv_score)

model_predict(metal_x_valid, metal_y_valid.tolist(), stack=True)
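
One maintenance note of mine (not the author's): `DataFrame.append`, used above to accumulate `score_models`, was deprecated in pandas 1.4 and removed in 2.0. On a current pandas, an equivalent is to collect the score Series and build the frame once:

import pandas as pd

# stands in for the repeated score_models = score_models.append(cv_score);
# the numbers here are illustrative only
cv_scores = {
    'GradientBoosting': pd.Series({'mean': 0.113, 'std': 0.006}),
    'RandomForest': pd.Series({'mean': 0.118, 'std': 0.007}),
}
score_models = pd.DataFrame(cv_scores).T  # one row per model, columns: mean/std
print(score_models)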

Model 2: score 0.1146

This model stacks the following base learners:
svr, line, lasso, ENet, KRR1, KRR2, lgbm, xgb, nn

# -*- coding: utf-8 -*-
# steam prediction
import math
import os
from datetime import datetime

import lightgbm
import numpy as np
import pandas as pd
import xgboost
from keras.layers import Dense
from keras.models import Sequential
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import ElasticNet, Lasso, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR

seed = 2018
os.chdir(r'E:\项目文件\工业蒸汽量预测\\')


# Stacking
class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds

    # fit clones of the original base models
    def fit(self, X, y):
        self.base_models_ = [list() for x in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True)
        # train the cloned base models, then build the out-of-fold
        # predictions needed to train the cloned meta-model
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        for i, clf in enumerate(self.base_models):
            for train_index, holdout_index in kfold.split(X, y):
                instance = clone(clf)
                self.base_models_[i].append(instance)
                instance.fit(X[train_index], y[train_index])
                y_pred = instance.predict(X[holdout_index])
                out_of_fold_predictions[holdout_index, i] = y_pred
        # now train the cloned meta-model on the out-of-fold predictions
        print(out_of_fold_predictions.shape)
        self.meta_model_.fit(out_of_fold_predictions, y)
        return self

    def predict(self, X):
        meta_features = np.column_stack([
            np.column_stack([model.predict(X) for model in base_models]).mean(axis=1)
            for base_models in self.base_models_])
        return self.meta_model_.predict(meta_features)


# simple model averaging
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models

    # fit a clone of every model
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]
        for model in self.models_:
            model.fit(X, y)
        return self

    # predict with every model and average the results
    def predict(self, X):
        predictions = np.column_stack([
            model.predict(X) for model in self.models_
        ])
        return np.mean(predictions, axis=1)


def load_train_data():
    df = pd.read_csv("zhengqi_train.txt", header=0, sep="\s+")
    X = df.drop(columns=["target"])
    y = df["target"]
    print("X shape:", X.shape)
    print("y shape", y.shape)
    return X, y


def load_test_data():
    df = pd.read_csv("zhengqi_test.txt", header=0, sep="\s+")
    X_test = df
    return X_test


def build_nn():
    model = Sequential()
    model.add(Dense(units=128, activation='linear', input_dim=18))
    model.add(Dense(units=32, activation='linear'))
    model.add(Dense(units=8, activation='linear'))
    model.add(Dense(units=1, activation='linear'))
    model.compile(loss='mse', optimizer='adam')
    return model


def build_model():
    svr = make_pipeline(SVR(kernel='linear'))
    line = make_pipeline(LinearRegression())
    lasso = make_pipeline(Lasso(alpha=0.0005, random_state=1))
    ENet = make_pipeline(ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))
    KRR1 = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
    # KRR1 = LinearSVR(C=2)
    KRR2 = KernelRidge(alpha=1.5, kernel='linear', degree=2, coef0=2.5)
    lgbm = lightgbm.LGBMRegressor(learning_rate=0.01, n_estimators=500, num_leaves=31)
    # lgbm = ExtraTreesRegressor(criterion='mse', n_estimators=500, max_depth=38)
    xgb = xgboost.XGBRegressor(booster='gbtree', colsample_bytree=0.8, gamma=0.1,
                               learning_rate=0.02, max_depth=5,
                               n_estimators=500, min_child_weight=0.8,
                               reg_alpha=0, reg_lambda=1, subsample=0.8,
                               random_state=seed, nthread=2)
    nn = KerasRegressor(build_fn=build_nn, nb_epoch=500, batch_size=32, verbose=2)
    return svr, line, lasso, ENet, KRR1, KRR2, lgbm, xgb, nn


def rmsle_cv(model=None, X_train_head=None, y_train=None):
    n_folds = 5
    # note: get_n_splits just returns the integer 5, and the scores below
    # are per-fold MSE (no square root), despite the function's name
    kf = KFold(n_folds, shuffle=True, random_state=seed).get_n_splits(X_train_head)
    rmse = -cross_val_score(model, X_train_head, y_train, scoring="neg_mean_squared_error", cv=kf)
    return (rmse)


def main():
    print("Load data from file......")
    X_train, y_train = load_train_data()
    X_test = load_test_data()
    print("X_train shape", X_train.shape)
    print("X_test shape", X_test.shape)
    print("y_train shape", y_train.shape)

    all_data = pd.concat([X_train, X_test])
    print(all_data.shape)
    print("Load done.")
    all_data = all_data.drop(["V5", "V9", "V11", "V17", "V22", "V28"], axis=1)
    print(all_data.shape)
    print("Drop done.")

    # scaling
    from sklearn import preprocessing
    scaler = MinMaxScaler(feature_range=(0, 1))
    all_data = pd.DataFrame(scaler.fit_transform(all_data), columns=all_data.columns)
    print("Scale done.")
    all_data['V0'] = all_data['V0'].apply(lambda x: math.exp(x))
    all_data['V1'] = all_data['V1'].apply(lambda x: math.exp(x))
    all_data['V6'] = all_data['V6'].apply(lambda x: math.exp(x))
    all_data['V7'] = all_data['V7'].apply(lambda x: math.exp(x))
    all_data['V8'] = all_data['V8'].apply(lambda x: math.exp(x))
    all_data["V30"] = np.log1p(all_data["V30"])
    scaled = pd.DataFrame(preprocessing.scale(all_data), columns=all_data.columns)
    X_train = scaled.loc[0:len(X_train) - 1]
    X_test = scaled.loc[len(X_train):]

    # feature selection
    from sklearn.feature_selection import VarianceThreshold
    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import f_regression
    # variance filtering
    threshold = 0.85
    vt = VarianceThreshold().fit(X_train)
    feat_var_threshold = X_train.columns[vt.variances_ > threshold * (1 - threshold)]
    X_train = X_train[feat_var_threshold]
    X_test = X_test[feat_var_threshold]
    all_data = pd.concat([X_train, X_test])
    print("shape after variance filtering:", all_data.shape)

    # keep the 18 best-scoring features
    X_scored = SelectKBest(score_func=f_regression, k='all').fit(X_train, y_train)
    feature_scoring = pd.DataFrame({
        'feature': X_train.columns,
        'score': X_scored.scores_
    })
    head_feature_num = 18
    feat_scored_headnum = feature_scoring.sort_values('score', ascending=False).head(head_feature_num)['feature']
    X_train_head = X_train[X_train.columns[X_train.columns.isin(feat_scored_headnum)]]
    X_test_head = X_test[X_test.columns[X_test.columns.isin(feat_scored_headnum)]]
    print(X_train_head.shape)
    print(y_train.shape)
    print(X_test_head.shape)

    print("Start training......")
    svr, line, lasso, ENet, KRR1, KRR2, lgbm, xgb, nn = build_model()
    train_start = datetime.now()

    score = rmsle_cv(svr, X_train_head, y_train)
    print("SVR rmse: {:.4f} std: {:.4f}\n".format(score.mean(), score.std()))
    svr.fit(X_train_head, y_train)

    score = rmsle_cv(line, X_train_head, y_train)
    print("Line rmse: {:.4f} std: {:.4f}\n".format(score.mean(), score.std()))
    line.fit(X_train_head, y_train)

    score = rmsle_cv(lasso, X_train_head, y_train)
    print("Lasso rmse: {:.4f} std: {:.4f}\n".format(score.mean(), score.std()))
    lasso.fit(X_train_head, y_train)

    score = rmsle_cv(ENet, X_train_head, y_train)
    print("ElasticNet rmse: {:.4f} std: {:.4f}\n".format(score.mean(), score.std()))
    ENet.fit(X_train_head, y_train)

    # =============================================================================
    score = rmsle_cv(KRR1, X_train_head, y_train)
    print("Kernel Ridge1 rmse: {:.4f} std: {:.4f}\n".format(score.mean(), score.std()))
    KRR1.fit(X_train_head, y_train)

    score = rmsle_cv(KRR2, X_train_head, y_train)
    print("Kernel Ridge2 rmse: {:.4f} std: {:.4f}\n".format(score.mean(), score.std()))
    KRR2.fit(X_train_head, y_train)

    # =============================================================================
    head_feature_num = 22
    feat_scored_headnum = feature_scoring.sort_values('score', ascending=False).head(head_feature_num)['feature']
    X_train_head3 = X_train[X_train.columns[X_train.columns.isin(feat_scored_headnum)]]
    score = rmsle_cv(xgb, X_train_head3, y_train)
    print("Xgboost rmse: {:.4f} std: {:.4f}\n".format(score.mean(), score.std()))
    xgb.fit(X_train_head, y_train)

    # =============================================================================
    head_feature_num = 22
    feat_scored_headnum = feature_scoring.sort_values('score', ascending=False).head(head_feature_num)['feature']
    X_train_head4 = X_train[X_train.columns[X_train.columns.isin(feat_scored_headnum)]]
    score = rmsle_cv(lgbm, X_train_head4, y_train)
    print("LGBM score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
    lgbm.fit(X_train_head, y_train)

    # =============================================================================
    head_feature_num = 18
    feat_scored_headnum = feature_scoring.sort_values('score', ascending=False).head(head_feature_num)['feature']
    X_train_head5 = X_train[X_train.columns[X_train.columns.isin(feat_scored_headnum)]]
    score = rmsle_cv(nn, X_train_head5, y_train)
    print("NN score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
    nn.fit(X_train_head, y_train)

    # =============================================================================
    averaged_models = AveragingModels(models=(svr, KRR2, lgbm, nn))
    score = rmsle_cv(averaged_models, X_train_head, y_train)
    print("Averaged base models score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
    averaged_models.fit(X_train_head, y_train)

    stacking_models = StackingAveragedModels(base_models=(svr, KRR2, lgbm, nn), meta_model=xgb)
    stacking_models.fit(X_train_head.values, y_train.values)
    stacked_train_pred = stacking_models.predict(X_train_head)
    score = mean_squared_error(y_train.values, stacked_train_pred)
    # 0.0718
    print("Stacking Averaged models predict score: {:.4f}".format(score))
    train_end = datetime.now()
    print('spend time:' + str((train_end - train_start).seconds) + '(s)')

    print("Predict......")
    # the submission uses the simple average ensemble
    y_pred = averaged_models.predict(X_test_head)
    result = pd.DataFrame(y_pred)
    result.to_csv("result.txt", index=False, header=False)
    print("Predict Done.")
    print(datetime.now())


main()

Model 3: stacking of lgb, etr, and mlpr; score 0.1446

#!/usr/bin/env python
# coding=utf-8
import os
import warnings

from sklearn.model_selection import train_test_split

from utils.read_write import writeOneCsv
from tianchi.zhengqi.data_model import get_train, build_model_lgb, build_model_etr, write_mse, \
    score_model, get_test, build_model_mlpr

warnings.filterwarnings("ignore", "(?s).*MATPLOTLIBDATA.*", category=UserWarning)
import pandas as pd
from sklearn.metrics import mean_squared_error

os.chdir(r'E:\项目文件\工业蒸汽量预测\\')

X_data, Y_data = get_train()
test = get_test()
x_train, x_val, y_train, y_val = train_test_split(X_data, Y_data, test_size=0.01, random_state=0)

# first level: fit the three base models and collect their predictions
model_lgb = build_model_lgb(x_train, y_train)
val_lgb = model_lgb.predict(x_val)
train_lgb_pred = model_lgb.predict(x_train)

model_etr = build_model_etr(x_train, y_train)
val_etr = model_etr.predict(x_val)
train_etr_pred = model_etr.predict(x_train)

model_mlp = build_model_mlpr(x_train, y_train)
val_mlp = model_mlp.predict(x_val)
train_mlp_pred = model_mlp.predict(x_train)

print('etr train mse:', mean_squared_error(y_train, train_etr_pred))
write_mse('etr', '训练集', mean_squared_error(y_train, train_etr_pred))
print('lgb train mse:', mean_squared_error(y_train, train_lgb_pred))
write_mse('lgb', '训练集', mean_squared_error(y_train, train_lgb_pred))
print('mlp train mse:', mean_squared_error(y_train, train_mlp_pred))
write_mse('mlp', '训练集', mean_squared_error(y_train, train_mlp_pred))

Stacking_X_train = pd.DataFrame()
Stacking_X_train['Method_1'] = train_mlp_pred
Stacking_X_train['Method_2'] = train_lgb_pred
Stacking_X_train['Method_3'] = train_etr_pred

Stacking_X_val = pd.DataFrame()
Stacking_X_val['Method_1'] = val_mlp
Stacking_X_val['Method_2'] = val_lgb
Stacking_X_val['Method_3'] = val_etr

# second level
model_Stacking = build_model_etr(Stacking_X_train, y_train)
train_pre_Stacking = model_Stacking.predict(Stacking_X_train)
score_model(Stacking_X_train, y_train, train_pre_Stacking, model_Stacking, '训练集')
# 0.10836028374302602
val_pre_Stacking = model_Stacking.predict(Stacking_X_val)
score_model(Stacking_X_val, y_val, val_pre_Stacking, model_Stacking, '验证集')

# test-set predictions from each first-level model; the column order
# must match Stacking_X_train (Method_1=mlp, Method_2=lgb, Method_3=etr)
sub_etr = model_etr.predict(test)
sub_lgb = model_lgb.predict(test)
sub_mlp = model_mlp.predict(test)
Stacking_X_test = pd.DataFrame()
Stacking_X_test['Method_1'] = sub_mlp
Stacking_X_test['Method_2'] = sub_lgb
Stacking_X_test['Method_3'] = sub_etr

pred = model_Stacking.predict(Stacking_X_test)
for i in range(0, pred.shape[0]):
    pred[i] = round(pred[i], 7)
    writeOneCsv([pred[i]], 'predict_mine.txt')
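
A caveat worth flagging (my observation, not the author's): the second level above is trained on the base models' in-sample training predictions, so the training MSE (~0.108) is optimistic. scikit-learn's StackingRegressor does the out-of-fold bookkeeping for you; a hedged sketch with illustrative estimator parameters:

from lightgbm import LGBMRegressor
from sklearn.ensemble import StackingRegressor, ExtraTreesRegressor
from sklearn.neural_network import MLPRegressor

# StackingRegressor fits the base models on CV folds and trains the
# final estimator on out-of-fold predictions, avoiding the leakage above
stack = StackingRegressor(
    estimators=[
        ('lgb', LGBMRegressor()),                        # parameters illustrative
        ('etr', ExtraTreesRegressor(n_estimators=100)),
        ('mlp', MLPRegressor(activation='relu', max_iter=500)),
    ],
    final_estimator=ExtraTreesRegressor(n_estimators=100),
    cv=5,
)
# usage: stack.fit(X_data, Y_data); pred = stack.predict(test)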

Model helper file data_model.py

import os

import numpy as np
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV

from utils.read_write import writeOneCsv, pdReadCsv

os.chdir(r'E:\项目文件\工业蒸汽量预测\\')


def get_train():
    file = 'zhengqi_train.txt'
    train = pdReadCsv(file, '\t')
    train.drop(["V5", "V9", "V11", "V17", "V22", "V28"], axis=1, inplace=True)
    return train.values[:, 0:-1], train.values[:, -1:].ravel()


def get_test():
    file = 'zhengqi_test.txt'
    test = pdReadCsv(file, '\t')
    test.drop(["V5", "V9", "V11", "V17", "V22", "V28"], axis=1, inplace=True)
    return test.values


def build_model_rf(x_train, y_train):
    estimator = RandomForestRegressor(criterion='mse')
    param_grid = {
        'max_depth': range(33, 35, 9),
        'n_estimators': range(73, 77, 9),
    }
    model = GridSearchCV(estimator, param_grid, cv=3)
    model.fit(x_train, y_train)
    print('rf')
    print(model.best_params_)
    writeParams('rf', model.best_params_)
    return model


def build_model_mlpr(x_train, y_train):
    from sklearn.neural_network import MLPRegressor
    # relu activation with the lbfgs solver gave the best results
    mlp = MLPRegressor(activation='relu')
    param_grid = {
        'alpha': [0.003],
        'hidden_layer_sizes': [(77, 38), (88, 44), (66, 33)],
        'max_iter': range(60, 63, 5),
        'solver': ['adam', 'sgd'],
    }
    model = GridSearchCV(mlp, param_grid, cv=3)
    model.fit(x_train, y_train.ravel())
    print('mlpr')
    print(model.best_params_)
    writeParams('mlpr', model.best_params_)
    return model


def build_model_etr(x_train, y_train):
    # extremely randomized trees regression; n_estimators is the
    # maximum number of trees in the ExtraTreesRegressor ensemble
    estimator = ExtraTreesRegressor(criterion='mse')
    param_grid = {
        'max_depth': range(31, 33, 9),
        'n_estimators': range(99, 111, 9),
    }
    model = GridSearchCV(estimator, param_grid)
    model.fit(x_train, y_train)
    print('etr')
    print(model.best_params_)
    writeParams('etr', model.best_params_)
    return model


def build_model_lgb(x_train, y_train):
    estimator = LGBMRegressor()
    param_grid = {
        'learning_rate': np.arange(0.1, 0.15, 0.05),
        'n_estimators': range(111, 122, 9),
        'num_leaves': range(33, 44, 9)
    }
    gbm = GridSearchCV(estimator, param_grid)
    gbm.fit(x_train, y_train.ravel())
    print('lgb')
    print(gbm.best_params_)
    writeParams('lgb', gbm.best_params_)
    return gbm


def scatter_line(y_val, y_pre):
    import matplotlib.pyplot as plt
    xx = range(0, len(y_val))
    plt.scatter(xx, y_val, color="red", label="Sample Point", linewidth=3)
    plt.plot(xx, y_pre, color="orange", label="Fitting Line", linewidth=2)
    plt.legend()
    plt.show()


def score_model(train, test, predict, model, data_type):
    score = model.score(train, test)
    print(data_type + ",R^2,", round(score, 6))
    writeOneCsv(['staking', data_type, 'R^2', round(score, 6)], '调参记录.csv')
    mae = mean_absolute_error(test, predict)
    print(data_type + ',MAE,', mae)
    writeOneCsv(['staking', data_type, 'MAE', mae], '调参记录.csv')
    mse = mean_squared_error(test, predict)
    print(data_type + ",MSE,", mse)
    writeOneCsv(['staking', data_type, 'MSE', mse], '调参记录.csv')


def writeParams(model, best):
    if model == 'lgb':
        writeOneCsv([model, best['num_leaves'], best['n_estimators'], best['learning_rate']], '调参记录.csv')
    elif model == 'mlpr':
        writeOneCsv([model, best['hidden_layer_sizes'], best['max_iter'], best['alpha']], '调参记录.csv')
    else:
        writeOneCsv([model, best['max_depth'], best['n_estimators'], 0], '调参记录.csv')


def write_mse(model, data_type, mse):
    writeOneCsv([model, data_type, 'mse', mse], '调参记录.csv')

File-writing helpers

import csv

import pandas as pd


# write one row to a CSV file per call, appending with a newline
def writeOneCsv(relate_record, src):
    try:
        with open(src, 'a', newline='\n') as csvFile:
            writer = csv.writer(csvFile)
            writer.writerow(relate_record)
    except Exception as e:
        print(e)
        print(relate_record)


# read a CSV, falling back from utf-8 to gbk encoding
def pdReadCsv(file, sep):
    try:
        data = pd.read_csv(file, sep=sep, encoding='utf-8', error_bad_lines=False, engine='python')
        return data
    except:
        data = pd.read_csv(file, sep=sep, encoding='gbk', error_bad_lines=False, engine='python')
        return data
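
Note that `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0. On a recent pandas, `pdReadCsv` can be written as follows (a sketch of mine, assuming pandas >= 1.3):

import pandas as pd

def pdReadCsv(file, sep):
    # on_bad_lines='skip' replaces the removed error_bad_lines=False
    try:
        return pd.read_csv(file, sep=sep, encoding='utf-8', on_bad_lines='skip', engine='python')
    except UnicodeDecodeError:
        return pd.read_csv(file, sep=sep, encoding='gbk', on_bad_lines='skip', engine='python')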

If you found this helpful, likes, favorites, and follows are all appreciated.

Reposted from: https://data-mining.blog.csdn.net/article/details/109557396 If this infringes your copyright, please leave a comment with the address of the original article and we will delete this post; apologies for any inconvenience.

