# 分类数据权重测算模型
# 夜阑专用捏
# 开发时间:2024/2/22 10:52
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
# Workbook holding the multi-dimensional salary data.
DATA_PATH = r'C:\Users\86191\Desktop\龙圆芝\薪酬报告推文\薪酬多维数据.xlsx'

# Feature columns (dimensions) whose relative weight is being estimated.
DIMENSIONS = ['层级', '工作年限', '部门', '区域', '性质']

# Classification target column.
TARGET_COLUMN = '薪资区间'

# Departments kept as their own category; every other value is merged.
KEPT_DEPARTMENTS = [
    '财务融资', '采购部门', '成本管理', '工程设计', '管理层', '品牌宣传',
    '投资拓展', '行业研究', '行政管理', '运营服务', '招商营销',
]

# Label given to every department that is not in KEPT_DEPARTMENTS.
OTHER_DEPARTMENT = '其他'


def merge_rare_departments(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with unlisted departments relabelled.

    Any value of the '部门' column that is not in ``KEPT_DEPARTMENTS``
    (missing values included) becomes ``OTHER_DEPARTMENT``. The input
    frame is left untouched.
    """
    merged = df.copy()
    keep_mask = merged['部门'].isin(KEPT_DEPARTMENTS)
    merged['部门'] = merged['部门'].where(keep_mask, OTHER_DEPARTMENT)
    return merged


def encode_features(df: pd.DataFrame) -> pd.DataFrame:
    """One-hot encode the ``DIMENSIONS`` columns of *df*.

    Must be called on the frame returned by ``merge_rare_departments``;
    the original script selected the features before merging, so the
    merge never reached the model.
    """
    return pd.get_dummies(df[DIMENSIONS])


def dimension_columns(encoded_columns, dimension: str) -> list:
    """Return the encoded column names that belong to *dimension*.

    A column belongs to a dimension when it is the dimension itself (a
    column ``get_dummies`` passed through unchanged) or starts with
    ``'<dimension>_'`` (a one-hot column). Plain substring matching is
    avoided because a category value may contain another dimension's name.
    """
    prefix = f'{dimension}_'
    return [
        col for col in encoded_columns
        if col == dimension or str(col).startswith(prefix)
    ]


def average_importance_by_dimension(feature_importance: pd.DataFrame,
                                    dimensions) -> dict:
    """Average the per-column importances within each dimension.

    *feature_importance* needs a 'Feature' and an 'Importance' column.
    A dimension with no matching encoded column keeps an importance of
    0.0 instead of NaN, so it cannot poison the later normalisation.
    """
    averages = {dimension: 0.0 for dimension in dimensions}
    for dimension in dimensions:
        columns = dimension_columns(feature_importance['Feature'], dimension)
        if not columns:
            continue
        mask = feature_importance['Feature'].isin(columns)
        averages[dimension] = float(
            feature_importance.loc[mask, 'Importance'].mean()
        )
    return averages


def normalize_importance(importance_by_dimension: dict) -> dict:
    """Scale the importances so that they sum to 1.

    When the total is zero every dimension is reported as 0.0 rather
    than dividing by zero.
    """
    total = sum(importance_by_dimension.values())
    if not total:
        return {dimension: 0.0 for dimension in importance_by_dimension}
    return {
        dimension: importance / total
        for dimension, importance in importance_by_dimension.items()
    }


def main() -> None:
    """Train the classifier and print accuracy and dimension weights."""
    # Merge departments BEFORE selecting/encoding the features.
    df = merge_rare_departments(pd.read_excel(DATA_PATH))
    features_encoded = encode_features(df)
    target = df[TARGET_COLUMN]

    # Fixed random_state keeps the split and the forest reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        features_encoded, target, test_size=0.2, random_state=42
    )
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Evaluate on the held-out split.
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    print(f'Accuracy: {accuracy}')
    print('Classification Report:')
    print(report)

    # Importance of every encoded column, most important first.
    feature_importance = pd.DataFrame({
        'Feature': features_encoded.columns,
        'Importance': model.feature_importances_,
    })
    feature_importance = feature_importance.sort_values(
        by='Importance', ascending=False
    )
    print('Feature Importance:')
    print(feature_importance)

    # Collapse the encoded columns back to one figure per dimension.
    total_importance = average_importance_by_dimension(
        feature_importance, DIMENSIONS
    )
    print('Average Importance for Each Dimension:')
    for dimension, importance in total_importance.items():
        print(f'{dimension}: {importance}')

    normalized_importance = normalize_importance(total_importance)
    print('\nNormalized Importance for Each Dimension:')
    for dimension, importance in normalized_importance.items():
        print(f'{dimension}: {importance:.2%}')


if __name__ == '__main__':
    main()
# 版权声明:
# 作者:夜阑
# 链接:http://yelan.xyz/index.php/2024/02/23/%e5%88%86%e7%b1%bb%e6%95%b0%e6%8d%ae%e6%9d%83%e9%87%8d%e6%b5%8b%e7%ae%97%e6%a8%a1%e5%9e%8b/
# 来源:夜阑的小站
# 文章版权归作者所有,未经允许请勿转载。
# THE END
# 二维码
# 共有 0 条评论