在jupyter notebook使用GBDT+DeepFM预测PM2.5 出现了错误
数据集大概这样，代码是我在网上找的、边改边试，实在解决不了 DeepFM 模型训练这个问题。希望高手指点指点！
year | month | day | hour | AOI | CO | NO2 | O3 | PM10 | PM2.5 | SO2 |
---|---|---|---|---|---|---|---|---|---|---|
2023 | 2 | 23 | 0 | 66 | 0.57 | 64 | 59 | 81 | 37 | 9 |
2023 | 2 | 23 | 1 | 61 | 0.58 | 72 | 37 | 71 | 30 | 9 |
2023 | 2 | 23 | 2 | 53 | 0.53 | 47 | 52 | 56 | 24 | 8 |
2023 | 2 | 23 | 3 | 48 | 0.48 | 35 | 61 | 48 | 21 | 8 |
2023 | 2 | 23 | 4 | 45 | 0.47 | 30 | 61 | 45 | 20 | 8 |
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from deepctr.models import DeepFM
#from deepctr.inputs import SparseFeat, DenseFeat, get_feature_names
from deepctr.feature_column import SparseFeat, get_feature_names, DenseFeat
# ---- Load the data ----
# Note: the original paste used typographic quotes (‘…’), which is a
# SyntaxError in Python; all string literals below use ASCII quotes.
data = pd.read_csv('444.csv')

# Categorical (sparse) vs. numeric (dense) feature columns.
sparse_features = ['year', 'month', 'day', 'hour']
# NOTE(review): the sample table header shows 'AOI', the code reads 'AQI' —
# confirm the actual CSV header; a mismatch raises KeyError on the line below.
dense_features = ['AQI', 'CO', 'NO2', 'O3', 'SO2', 'PM10']

# ---- Preprocess ----
# Fill missing values: sentinel category for sparse, 0 for dense.
for feat in sparse_features:
    data[feat] = data[feat].fillna('-1')
for feat in dense_features:
    data[feat] = data[feat].fillna(0)

# Label-encode the sparse features so their values are contiguous integers in
# [0, nunique). SparseFeat uses these values as embedding-table indices, so
# raw values such as year=2023 or day=23 overflow vocabulary_size=nunique()
# and make DeepFM training fail — this is the fix for the reported error.
for feat in sparse_features:
    data[feat] = LabelEncoder().fit_transform(data[feat])

# ---- Train/test split ----
train, test = train_test_split(data, test_size=0.2, random_state=2022)
# ---- Define DeepCTR feature columns ----
# One 4-dim embedding column per sparse feature, one 1-d dense column per
# numeric feature. (The unused `enumerate` index from the original is dropped.)
fixlen_feature_columns = (
    [SparseFeat(feat, vocabulary_size=data[feat].nunique(), embedding_dim=4)
     for feat in sparse_features]
    + [DenseFeat(feat, 1) for feat in dense_features]
)

# Linear and DNN parts of DeepFM share the same feature set here.
dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns
# get_feature_names deduplicates, so passing linear + dnn is safe.
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

# ---- Build model inputs ----
# DeepCTR expects a {feature_name: ndarray} dict; .values converts each
# pandas Series to a plain numpy array.
train_model_input = {name: train[name].values for name in feature_names}
test_model_input = {name: test[name].values for name in feature_names}
# ---- Train the GBDT (LightGBM) baseline on the dense features ----
train_gbdt = lgb.Dataset(train[dense_features], label=train['PM2.5'])
test_gbdt = lgb.Dataset(test[dense_features], label=test['PM2.5'])
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'random_state': 2022,
}
num_round = 10000
early_stopping_rounds = 100
# LightGBM >= 4.0 removed the `early_stopping_rounds` keyword from
# lgb.train(); early stopping must go through the callbacks API.
gbdt_model = lgb.train(
    params,
    train_gbdt,
    num_boost_round=num_round,
    valid_sets=[train_gbdt, test_gbdt],
    callbacks=[lgb.early_stopping(stopping_rounds=early_stopping_rounds)],
)
# ---- Train the DeepFM model ----
# DeepFM's signature is DeepFM(linear_feature_columns, dnn_feature_columns,
# ...); pass the linear columns first. task='regression' for the continuous
# PM2.5 target (default is binary classification).
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression')
model.compile('adam', 'mse', metrics=['mse'])
history = model.fit(
    train_model_input,
    train['PM2.5'].values,
    batch_size=256,
    epochs=10,
    verbose=2,
    validation_split=0.2,
)
# predict() returns shape (n, 1); flatten to a 1-d vector for metrics.
pred = model.predict(test_model_input, batch_size=256)
deepfm_pred = pred.flatten()
# ---- Predict with both models and compute RMSE ----
gbdt_pred = gbdt_model.predict(test[dense_features])
# Average the two models' predictions. The original summed them, which
# roughly doubles the target scale and inflates the ensemble RMSE.
ensemble_pred = (gbdt_pred + deepfm_pred) / 2.0
y_true = test['PM2.5']
rmse_gbdt = np.sqrt(mean_squared_error(y_true, gbdt_pred))
rmse_deepfm = np.sqrt(mean_squared_error(y_true, deepfm_pred))
rmse_ensemble = np.sqrt(mean_squared_error(y_true, ensemble_pred))
print("GBDT RMSE:", rmse_gbdt)
print("DeepFM RMSE:", rmse_deepfm)
print("Ensemble RMSE:", rmse_ensemble)
# ---- Report GBDT feature importances, highest first ----
importance = gbdt_model.feature_importance()
feature_name = gbdt_model.feature_name()
importance_dict = dict(zip(feature_name, importance))
# Keep the sorted (name, score) pairs in a separate list instead of rebinding
# importance_dict to a list, and avoid shadowing `importance` in the loop.
importance_pairs = sorted(importance_dict.items(), key=lambda x: x[1], reverse=True)
for feat, score in importance_pairs:
    print('{} importance: {}'.format(feat, score))