Workflow
Determine whether the task is classification or regression
Define the dataset: target Y and features X
Split into training and validation sets (test_size=0.33, random_state=42)
Choose an algorithm and an evaluation metric: MSE for regression, accuracy for classification (see the metric sketch below)
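As a minimal sketch of the metric choice in scikit-learn (the toy arrays below are illustrative, not from the competition data):

import numpy as np
from sklearn.metrics import mean_squared_error, accuracy_score

# Regression: mean squared error between true and predicted targets
y_true = np.array([1.0, 2.0, 3.0])
y_pred = np.array([1.1, 1.9, 3.2])
print(mean_squared_error(y_true, y_pred))

# Classification: fraction of correctly predicted labels
labels_true = np.array([0, 1, 1])
labels_pred = np.array([0, 1, 0])
print(accuracy_score(labels_true, labels_pred))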
Example
import numpy as np
import pandas as pd

# Load the training data and preview the first 10 rows
train = pd.read_csv(r"C:\Users\Administrator\Desktop\train.csv")
train[:10]
train.info()

# Check the distribution of each categorical feature
train['feature_1'].value_counts()
train['feature_2'].value_counts()
train['feature_3'].value_counts()

# One-hot encode feature_1 and feature_2
train = pd.get_dummies(train, columns=['feature_1', 'feature_2'])
Y = train['target']
ts = ['feature_3', 'feature_1_1',
      'feature_1_2', 'feature_1_3', 'feature_1_4', 'feature_1_5',
      'feature_2_1', 'feature_2_2', 'feature_2_3']
X = train[ts]
from sklearn.model_selection import train_test_split
trainX, valX, trainy, valy = train_test_split(X, Y, test_size=0.33, random_state=42)
from sklearn.linear_model import LinearRegression
lrModel = LinearRegression().fit(trainX, trainy)
from sklearn.metrics import mean_squared_error
mean_squared_error(valy, lrModel.predict(valX))
# Reload the raw training data and keep only the target and the three features
data = pd.read_csv(r"C:\Users\Administrator\Desktop\train.csv")
data = data[['target', 'feature_1', 'feature_2', 'feature_3']]
import seaborn as sns
import matplotlib.pyplot as plt

# Correlation heatmap of the features and the target
sns.heatmap(data.corr(), vmin=-1, vmax=1, square=True, annot=True)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Violin plots of the target against each categorical feature
fig, ax = plt.subplots(1, 3, figsize=(16, 6))
plt.suptitle('Violin plots for features and target')
sns.violinplot(x="feature_1", y="target", data=data, ax=ax[0])
ax[0].set_title('feature_1')
sns.violinplot(x="feature_2", y="target", data=data, ax=ax[1])
ax[1].set_title('feature_2')
sns.violinplot(x="feature_3", y="target", data=data, ax=ax[2])
ax[2].set_title('feature_3')
# Load the historical transactions and preview the first 5 rows
historical_transactions = pd.read_csv(r"C:\Users\Administrator\Desktop\historical_transactions.csv")
historical_transactions[:5]
# Keep only the most recent 3 months of transactions (month_lag > -4)
historical_transactions_3month = historical_transactions.loc[historical_transactions['month_lag'] > -4]
historical_transactions_3month.info()
# Free the memory held by the full transaction table
del historical_transactions
import gc
gc.collect()
# Treat the ID-like numeric columns as categorical (object) before describing them
col_list = ['city_id', 'installments', 'merchant_category_id', 'category_2', 'state_id', 'subsector_id']
for col in col_list:
    historical_transactions_3month[col] = historical_transactions_3month[col].astype(object)
historical_transactions_3month[col_list].info()
# Summary statistics for the categorical columns
categories = ['authorized_flag', 'city_id', 'installments', 'category_1', 'category_2', 'category_3', 'merchant_category_id', 'merchant_id', 'state_id', 'subsector_id']
historical_transactions_3month[categories].describe()
## Reference code
# Summary statistics of purchase_amount
historical_transactions_3month['purchase_amount'].describe()
## Reference code
# Histogram of purchase_amount
historical_transactions_3month['purchase_amount'].hist()
def aggregate_historical_transactions_3month(trans, prefix):
    """
    Input:
        trans: dataset to extract features from, e.g. historical_transactions_3month;
        prefix: prefix for the generated feature names;
    Return:
        agg_trans: feature dataset aggregated by card_id, ready to join with train.csv for modeling
    """
    # Convert authorized_flag to numeric: "Y" -> 1, "N" -> 0
    trans['authorized_flag'] = trans['authorized_flag'].apply(lambda x: 1 if x == 'Y' else 0)
    # Convert category_1 to numeric: "Y" -> 1, "N" -> 0
    trans['category_1'] = trans['category_1'].apply(lambda x: 1 if x == 'Y' else 0)
    # Treat missing values in category_2 as a separate category, encoded as 6
    trans['category_2'] = trans['category_2'].fillna(6)
    # Map the letters in category_3 to numbers; missing values form a separate category, encoded as 3
    map_dict = {'A': 0, 'B': 1, 'C': 2, 'nan': 3}
    trans['category_3'] = trans['category_3'].apply(lambda x: map_dict[str(x)])
    # One-hot encode installments, category_2, and category_3
    trans = pd.get_dummies(trans, columns=['installments', 'category_2', 'category_3'])
    # Define the aggregation dictionary
    agg_func = {
        'authorized_flag': ['sum', 'mean'],
        'category_1': ['sum', 'mean'],
        'category_2_1.0': ['mean', 'sum'],
        'category_2_2.0': ['mean', 'sum'],
        'category_2_3.0': ['mean', 'sum'],
        'category_2_4.0': ['mean', 'sum'],
        'category_2_5.0': ['mean', 'sum'],
        'category_2_6.0': ['mean', 'sum'],
        'category_3_1': ['sum', 'mean'],
        'category_3_2': ['sum', 'mean'],
        'category_3_3': ['sum', 'mean'],
        'installments_0': ['sum', 'mean'],
        'installments_1': ['sum', 'mean'],
        'installments_2': ['sum', 'mean'],
        'installments_3': ['sum', 'mean'],
        'installments_4': ['sum', 'mean'],
        'installments_5': ['sum', 'mean'],
        'installments_6': ['sum', 'mean'],
        'installments_7': ['sum', 'mean'],
        'installments_8': ['sum', 'mean'],
        'installments_9': ['sum', 'mean'],
        'installments_10': ['sum', 'mean'],
        'installments_11': ['sum', 'mean'],
        'installments_12': ['sum', 'mean'],
        'installments_-1': ['sum', 'mean'],
        'installments_999': ['sum', 'mean'],
        'merchant_id': ['nunique'],
        'purchase_amount': ['sum', 'mean', 'max', 'min'],
        'merchant_category_id': ['nunique'],
        'state_id': ['nunique'],
        'subsector_id': ['nunique'],
        'city_id': ['nunique']
    }
    # Aggregate features by card_id according to agg_func
    agg_trans = trans.groupby(['card_id']).agg(agg_func)
    # Prefix the new feature names
    agg_trans.columns = [prefix + '_'.join(col).strip() for col in agg_trans.columns.values]
    agg_trans.reset_index(inplace=True)
    # Count the number of transactions per card_id
    df = (trans.groupby('card_id')
          .size()
          .reset_index(name='{}transactions_count'.format(prefix)))
    # Merge df and agg_trans on card_id into the final agg_trans dataset
    agg_trans = pd.merge(df, agg_trans, on='card_id', how='left')
    return agg_trans

# Run the feature-extraction function on historical_transactions_3month
history_3month = aggregate_historical_transactions_3month(historical_transactions_3month, prefix='hist_')
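An illustrative sanity check (not in the original notes): since the function aggregates by card_id, the result should contain exactly one row per card_id.

# Quick inspection of the aggregated feature set (assumed to run right after the call above)
print(history_3month.shape)
print(history_3month.columns.tolist()[:5])
assert history_3month['card_id'].is_unique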
Merge the newly engineered feature dataset into train.csv to obtain the new training data.
Some cardholders may have had no transactions in the last 3 months, so the merge can produce missing values; fill the missing values in the merged dataset with 0.
# Left-join the aggregated features onto train, then fill missing values with 0
train_add_history_3month = pd.merge(train, history_3month, on='card_id', how='left')
train_add_history_3month.fillna(0, inplace=True)
train_add_history_3month.info()
Y = train_add_history_3month['target']
feature = [col for col in train_add_history_3month.columns.values if 'hist' in col]
X = train_add_history_3month[feature]
from sklearn.model_selection import train_test_split
train_X, val_X, train_y, val_y = train_test_split(X, Y, test_size=0.20, random_state=42)
Analyze the model's feature importance by plotting the feature-importance distribution.
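The plotting code below reads importances from a fitted model named est, which the original notes never define. As a minimal sketch, one way to obtain it (the choice of GradientBoostingRegressor with default hyperparameters is an assumption, not from the source):

# Assumption: fit a gradient-boosting regressor on the training split so that
# `est` exists for the feature-importance plot below; the model choice is
# illustrative, not from the original notes.
from sklearn.ensemble import GradientBoostingRegressor
est = GradientBoostingRegressor(random_state=42)
est.fit(train_X, train_y)
mean_squared_error(val_y, est.predict(val_X))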
# #############################################################################
# Plot feature importance
import numpy as np
feature_importance = est.feature_importances_
# make importances relative to max importance
feature_importance = 100.0 * (feature_importance / feature_importance.max())
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5
plt.figure(figsize=(16, 16))
plt.barh(pos, feature_importance[sorted_idx], align='center')
plt.yticks(pos, train_X.columns[sorted_idx])
plt.xlabel('Relative Importance')
plt.title('Variable Importance')
plt.show()
Save
train_add_history_3month.to_csv("./data/loyalty/train_add_hist_3month.csv", index=False)