在jupyter notebook使用GBDT+DeepFM预测PM2.5 出现了错误

数据集大概这样，代码是我在网上找的、边改边查，但实在解决不了 DeepFM 模型训练报错这个问题。希望高手指点指点！

year month day hour AQI CO NO2 O3 PM10 PM2.5 SO2
2023 2 23 0 66 0.57 64 59 81 37 9
2023 2 23 1 61 0.58 72 37 71 30 9
2023 2 23 2 53 0.53 47 52 56 24 8
2023 2 23 3 48 0.48 35 61 48 21 8
2023 2 23 4 45 0.47 30 61 45 20 8

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from deepctr.models import DeepFM

#from deepctr.inputs import SparseFeat, DenseFeat, get_feature_names
from deepctr.feature_column import SparseFeat,get_feature_names, DenseFeat
import lightgbm as lgb

读取数据

data = pd.read_csv(‘444.csv’)

定义稀疏特征和密集特征

# Categorical (sparse) fields get embedding columns; continuous (dense)
# fields are fed to the model as raw values. ASCII quotes — the original
# curly quotes were a SyntaxError.
sparse_features = ['year', 'month', 'day', 'hour']
# NOTE(review): the sample data header shows a column named "AOI" — if the
# CSV really spells it AOI, this list must match it; verify against 444.csv,
# otherwise pandas will raise a KeyError on 'AQI'.
dense_features = ['AQI', 'CO', 'NO2', 'O3', 'SO2', 'PM10']

对数据进行预处理

# Impute missing values, then label-encode every sparse feature so its
# values become integers in [0, nunique). DeepFM's embedding layers index
# by these codes and are sized with vocabulary_size = nunique; feeding raw
# values such as year=2023 produces out-of-range embedding lookups — the
# typical cause of the DeepFM training error reported here.
for feat in sparse_features:
    data[feat] = data[feat].fillna(-1)
    data[feat] = pd.factorize(data[feat])[0]

# Dense features only need missing values replaced with a neutral 0.
for feat in dense_features:
    data[feat] = data[feat].fillna(0)

划分训练集和测试集

train, test = train_test_split(data, test_size=0.2, random_state=2022)

定义特征列

# One embedding column per sparse feature (vocabulary sized to the observed
# cardinality, 4-dim embeddings) plus a 1-d dense column per continuous
# feature. The original looped with enumerate() but never used the index.
fixlen_feature_columns = (
    [SparseFeat(feat, vocabulary_size=data[feat].nunique(), embedding_dim=4)
     for feat in sparse_features]
    + [DenseFeat(feat, 1) for feat in dense_features]
)

获取特征名

# DeepFM uses the same column list for both its linear (wide) part and its
# DNN (deep) part, so all of these aliases point at one list.
dnn_feature_names = get_feature_names(fixlen_feature_columns)

dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

# get_feature_names de-duplicates, so this equals dnn_feature_names.
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

将数据转换成字典形式

# Keras-style model inputs: one {feature_name: column} dict per split,
# which is the input format DeepFM expects.
train_model_input = {feat: train[feat] for feat in dnn_feature_names}
test_model_input = {feat: test[feat] for feat in dnn_feature_names}

训练GBDT模型

# --- GBDT (LightGBM) baseline on the dense features only ---
# ASCII quotes throughout — the original curly quotes were a SyntaxError.
train_gbdt = lgb.Dataset(train[dense_features], label=train['PM2.5'])
test_gbdt = lgb.Dataset(test[dense_features], label=test['PM2.5'])
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'random_state': 2022,
}
num_round = 10000
early_stopping_rounds = 100
# lightgbm >= 4 removed the early_stopping_rounds= keyword from lgb.train()
# (a TypeError on recent installs); the early_stopping callback is the
# portable spelling and also works on the 3.x series.
gbdt_model = lgb.train(
    params,
    train_gbdt,
    num_round,
    valid_sets=[train_gbdt, test_gbdt],
    callbacks=[lgb.early_stopping(early_stopping_rounds)],
)

训练DeepFM模型

# --- DeepFM ---
# Pass the linear and DNN column lists explicitly: the original passed
# fixlen_feature_columns as the first positional argument, which only works
# because it happens to be the same object as linear_feature_columns.
# ASCII quotes — curly quotes in 'regression'/'adam'/'mse' were a SyntaxError.
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression')
model.compile('adam', 'mse', metrics=['mse'])
history = model.fit(train_model_input, train['PM2.5'].values,
                    batch_size=256, epochs=10, verbose=2, validation_split=0.2)
pred = model.predict(test_model_input, batch_size=256)
deepfm_pred = pred.flatten()  # (n, 1) -> (n,) so it aligns with gbdt_pred

对GBDT和DeepFM模型进行预测并求均方根误差

# --- Evaluate both models and a simple ensemble on the held-out test set ---
gbdt_pred = gbdt_model.predict(test[dense_features])
# Average (not sum) the two predictions: both estimate the same PM2.5
# target, so adding them roughly doubles the scale and ruins the RMSE.
ensemble_pred = (gbdt_pred + deepfm_pred) / 2
y_true = test['PM2.5']
rmse_gbdt = np.sqrt(mean_squared_error(y_true, gbdt_pred))
rmse_deepfm = np.sqrt(mean_squared_error(y_true, deepfm_pred))
rmse_ensemble = np.sqrt(mean_squared_error(y_true, ensemble_pred))
print("GBDT RMSE:", rmse_gbdt)
print("DeepFM RMSE:", rmse_deepfm)
print("Ensemble RMSE:", rmse_ensemble)

输出特征重要性
# --- GBDT feature importances, most important first ---
importance = gbdt_model.feature_importance()
feature_name = gbdt_model.feature_name()
importance_dict = dict(zip(feature_name, importance))
# sorted(...) on items() yields a list of (name, score) pairs, highest first.
importance_dict = sorted(importance_dict.items(), key=lambda kv: kv[1], reverse=True)
# Loop variable renamed from `importance`, which shadowed the array above.
for feat, score in importance_dict:
    print('{} importance: {}'.format(feat, score))

讨论数量: 0
(= ̄ω ̄=)··· 暂无内容!

讨论应以学习和精进为目的。请勿发布不友善或者负能量的内容,与人为善,比聪明更重要!