
大家好啊,我是小愛同學,
先上一個比賽鏈接:
鏈接: link.
這是一個風險用戶識別的比賽,如果大家感興趣的話,可以閱讀本文。
一、賽題理解

比賽提供三個資料表格,分別是用戶基礎資訊、用戶操作行為記錄、用戶交易行為記錄。評價指標是AUC,而AUC只取決於預測結果的排序,因此我們可以不考慮樣本不均衡對模型產生的影響。因為是用戶信用風險識別,所以時間、金額、地域是我們構造特徵的關鍵。
二、資料預處理
1、缺失值處理
因為賽題的特殊性,我們不對缺失值進行常規填充,而是將「缺失」本身作為一種單獨的特徵取值:類別型特徵的缺失值填為 '\N',數值型特徵的缺失值填為 -1。
下面展示缺失值處理的代碼片段:
# Missing-value handling: missing entries are kept as their own category rather
# than imputed. Every listed column first gets the placeholder token r'\N'; the
# numeric columns then have that token mapped to -1.
cols = ['sex', 'balance_avg', 'balance1_avg', 'provider', 'province', 'city', 'level']
for col in cols:
    # Assign the result back: fillna(inplace=True) on data[col] operates on an
    # intermediate object and is not guaranteed to update `data` itself.
    data[col] = data[col].fillna(r'\N')
cols = ['balance_avg', 'balance1_avg', 'level']
for col in cols:
    data[col] = data[col].replace({r'\N': -1})
# Missing-value handling: missing entries are kept as their own category rather
# than imputed. Every listed column first gets the placeholder token r'\N'; the
# numeric columns then have that token mapped to -1.
cols = ['sex', 'balance_avg', 'balance1_avg', 'provider', 'province', 'city', 'level']
for col in cols:
    # Assign the result back: fillna(inplace=True) on data[col] operates on an
    # intermediate object and is not guaranteed to update `data` itself.
    data[col] = data[col].fillna(r'\N')
cols = ['balance_avg', 'balance1_avg', 'level']
for col in cols:
    data[col] = data[col].replace({r'\N': -1})
2、編碼
(1)無序低基數類別特徵(例如性別):我們用Label Encoder進行編碼。
下面展示相關代碼:
# Label-encode the unordered, low-cardinality categorical columns.
cols = ['sex', 'provider', 'verified', 'regist_type', 'agreement1', 'agreement2',
        'agreement3', 'agreement4', 'province', 'city', 'service3']
for col in cols:
    # Cast object columns to str before they are handed to the encoder
    if data[col].dtype == 'object':
        data[col] = data[col].astype(str)
# labelEncoder_df is defined elsewhere; presumably it encodes `cols` of `data`
# in place -- verify against its definition.
labelEncoder_df(data, cols)
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line.
data.info()
# Unordered, low-cardinality categorical features: label-encode them.
cols = ['sex', 'provider', 'verified', 'regist_type', 'agreement1', 'agreement2',
        'agreement3', 'agreement4', 'province', 'city', 'service3']
for col in cols:
    # Cast object columns to str before they are handed to the encoder
    if data[col].dtype == 'object':
        data[col] = data[col].astype(str)
# labelEncoder_df is defined elsewhere; presumably it encodes `cols` of `data`
# in place -- verify against its definition.
labelEncoder_df(data, cols)
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line.
data.info()
(2)無序高基數類別特徵(例如城市、省份):我們用目標編碼。為了減輕過擬合,採用5折交叉驗證的思路:每一折的特徵值只用其餘各折的標籤均值來計算,見下圖。

下面展示一些 行內代碼片,
# A code block
def kfold_stats_feature(train, test, feats, k):
    """Add out-of-fold target-mean encodings of ``feats`` to ``train`` and ``test``.

    For every feature in ``feats`` a float column ``<feat>_label_kfold_mean`` is
    added. In ``train`` each row receives the mean of ``label`` per feature
    value computed on the other folds only; in ``test`` the mean is computed on
    all of ``train``. Feature values that have no mean in the fitting data are
    filled with the overall mean of ``train['label']``.

    Args:
        train: DataFrame holding the columns in ``feats`` and a ``label``
            column. Modified in place.
        test: DataFrame holding the columns in ``feats``. Modified in place.
        feats: names of the columns to encode.
        k: number of stratified folds.

    Returns:
        The ``(train, test)`` pair, i.e. the same objects that were passed in.
    """
    # Ideally keep this identical to the K-fold setup of the downstream model
    folds = StratifiedKFold(n_splits=k, shuffle=True, random_state=44)

    # Record each row's validation fold once. split() yields positional
    # indices, so they are written with iloc; loc would interpret them as index
    # labels and mis-assign rows whenever train lacks a default RangeIndex.
    train['fold'] = -1
    fold_pos = train.columns.get_loc('fold')
    for fold_, (_, val_idx) in enumerate(folds.split(train, train['label'])):
        train.iloc[val_idx, fold_pos] = fold_

    nums_columns = ['label']
    for feat in feats:
        for f in nums_columns:
            colname = feat + '_' + f + '_kfold_mean'
            # Fallback for feature values unseen in the fitting data
            global_mean = train[f].mean()

            # Train: encode every fold with statistics from the other folds
            train[colname] = float('nan')
            for fold_ in range(k):
                in_fold = (train['fold'] == fold_).values
                fold_means = train.loc[~in_fold].groupby(feat)[f].mean()
                encoded = train.loc[in_fold, feat].map(fold_means).fillna(global_mean)
                # .values avoids index alignment on assignment
                train.loc[in_fold, colname] = encoded.values
            train[colname] = train[colname].astype(float)

            # Test: encode with statistics from the whole training set
            full_means = train.groupby(feat)[f].mean()
            test[colname] = test[feat].map(full_means).fillna(global_mean).astype(float)

    del train['fold']
    return train, test
# Target encoding
def kfold_stats_feature(train, test, feats, k):
    """Add out-of-fold target-mean encodings of ``feats`` to ``train`` and ``test``.

    For every feature in ``feats`` a float column ``<feat>_label_kfold_mean`` is
    added. In ``train`` each row receives the mean of ``label`` per feature
    value computed on the other folds only; in ``test`` the mean is computed on
    all of ``train``. Feature values that have no mean in the fitting data are
    filled with the overall mean of ``train['label']``.

    Args:
        train: DataFrame holding the columns in ``feats`` and a ``label``
            column. Modified in place.
        test: DataFrame holding the columns in ``feats``. Modified in place.
        feats: names of the columns to encode.
        k: number of stratified folds.

    Returns:
        The ``(train, test)`` pair, i.e. the same objects that were passed in.
    """
    # Ideally keep this identical to the K-fold setup of the downstream model
    folds = StratifiedKFold(n_splits=k, shuffle=True, random_state=44)

    # Record each row's validation fold once. split() yields positional
    # indices, so they are written with iloc; loc would interpret them as index
    # labels and mis-assign rows whenever train lacks a default RangeIndex.
    train['fold'] = -1
    fold_pos = train.columns.get_loc('fold')
    for fold_, (_, val_idx) in enumerate(folds.split(train, train['label'])):
        train.iloc[val_idx, fold_pos] = fold_

    nums_columns = ['label']
    for feat in feats:
        for f in nums_columns:
            colname = feat + '_' + f + '_kfold_mean'
            # Fallback for feature values unseen in the fitting data
            global_mean = train[f].mean()

            # Train: encode every fold with statistics from the other folds
            train[colname] = float('nan')
            for fold_ in range(k):
                in_fold = (train['fold'] == fold_).values
                fold_means = train.loc[~in_fold].groupby(feat)[f].mean()
                encoded = train.loc[in_fold, feat].map(fold_means).fillna(global_mean)
                # .values avoids index alignment on assignment
                train.loc[in_fold, colname] = encoded.values
            train[colname] = train[colname].astype(float)

            # Test: encode with statistics from the whole training set
            full_means = train.groupby(feat)[f].mean()
            test[colname] = test[feat].map(full_means).fillna(global_mean).astype(float)

    del train['fold']
    return train, test
(3)等級和連續型:轉為整型數值
下面展示一些 行內代碼片,
# Cast object columns to str, then label-encode the listed columns.
cols = ['sex', 'provider', 'verified', 'regist_type', 'agreement1', 'agreement2',
        'agreement3', 'agreement4', 'province', 'city', 'service3']
for col in cols:
    if data[col].dtype == 'object':
        data[col] = data[col].astype(str)
# labelEncoder_df is defined elsewhere; presumably it encodes `cols` of `data`
# in place -- verify against its definition.
labelEncoder_df(data, cols)
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line.
data.info()
# Cast object columns to str, then label-encode the listed columns.
cols = ['sex', 'provider', 'verified', 'regist_type', 'agreement1', 'agreement2',
        'agreement3', 'agreement4', 'province', 'city', 'service3']
for col in cols:
    if data[col].dtype == 'object':
        data[col] = data[col].astype(str)
# labelEncoder_df is defined elsewhere; presumably it encodes `cols` of `data`
# in place -- verify against its definition.
labelEncoder_df(data, cols)
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line.
data.info()
三、特征工程
1、時間特征分析



對比train和test的操作量的時間分布,可以假定兩者的起始時間點相同。根據正負樣本的每小時交易量分布差異,我們可以放心大膽地構造時間視窗特徵。
例如:用戶在星期n的交易金額的統計特徵、用戶在交易n天之後的交易金額的統計特徵、用戶在每天n點之後的交易金額的統計特徵。
下面展示相關代碼:
def gen_user_window_amount_features(df, window):
    """Per-user statistics of ``amount`` over rows with ``days_diff > window``.

    Args:
        df: DataFrame with ``user``, ``days_diff`` and ``amount`` columns.
        window: threshold compared against ``days_diff``; also embedded in the
            output column names.

    Returns:
        DataFrame with one row per user that has at least one matching row:
        a ``user`` column followed by
        ``user_amount_{mean,std,max,min,sum,med,cnt}_{window}d``.
    """
    stats = {
        'user_amount_mean_{}d'.format(window): 'mean',
        'user_amount_std_{}d'.format(window): 'std',
        'user_amount_max_{}d'.format(window): 'max',
        'user_amount_min_{}d'.format(window): 'min',
        'user_amount_sum_{}d'.format(window): 'sum',
        'user_amount_med_{}d'.format(window): 'median',
        'user_amount_cnt_{}d'.format(window): 'count',
    }
    selected = df[df['days_diff'] > window]
    # Named aggregation: passing a renaming dict to SeriesGroupBy.agg() raises
    # SpecificationError on pandas >= 1.0.
    group_df = selected.groupby('user')['amount'].agg(**stats).reset_index()
    return group_df
def gen_user_window_amount_hour_features(df, window):
    """Per-user statistics of ``amount`` over rows with ``hour > window``.

    Args:
        df: DataFrame with ``user``, ``hour`` and ``amount`` columns.
        window: threshold compared against ``hour``; also embedded in the
            output column names.

    Returns:
        DataFrame with one row per user that has at least one matching row:
        a ``user`` column followed by
        ``user_amount_{mean,std,max,min,sum,med,cnt}_{window}h``.
    """
    stats = {
        'user_amount_mean_{}h'.format(window): 'mean',
        'user_amount_std_{}h'.format(window): 'std',
        'user_amount_max_{}h'.format(window): 'max',
        'user_amount_min_{}h'.format(window): 'min',
        'user_amount_sum_{}h'.format(window): 'sum',
        'user_amount_med_{}h'.format(window): 'median',
        'user_amount_cnt_{}h'.format(window): 'count',
    }
    selected = df[df['hour'] > window]
    # Named aggregation: passing a renaming dict to SeriesGroupBy.agg() raises
    # SpecificationError on pandas >= 1.0.
    group_df = selected.groupby('user')['amount'].agg(**stats).reset_index()
    return group_df
def gen_user_window_amount_week_features(df, window):
    """Per-user statistics of ``amount`` over rows with ``week == window``.

    Args:
        df: DataFrame with ``user``, ``week`` and ``amount`` columns.
        window: value that ``week`` must equal; also embedded in the output
            column names.

    Returns:
        DataFrame with one row per user that has at least one matching row:
        a ``user`` column followed by
        ``user_amount_{mean,std,max,min,sum,med,cnt}_{window}w``.
    """
    stats = {
        'user_amount_mean_{}w'.format(window): 'mean',
        'user_amount_std_{}w'.format(window): 'std',
        'user_amount_max_{}w'.format(window): 'max',
        'user_amount_min_{}w'.format(window): 'min',
        'user_amount_sum_{}w'.format(window): 'sum',
        'user_amount_med_{}w'.format(window): 'median',
        'user_amount_cnt_{}w'.format(window): 'count',
    }
    selected = df[df['week'] == window]
    # Named aggregation: passing a renaming dict to SeriesGroupBy.agg() raises
    # SpecificationError on pandas >= 1.0.
    group_df = selected.groupby('user')['amount'].agg(**stats).reset_index()
    return group_df
def gen_user_window_amount_features(df, window):
    """Per-user statistics of ``amount`` over rows with ``days_diff > window``.

    Args:
        df: DataFrame with ``user``, ``days_diff`` and ``amount`` columns.
        window: threshold compared against ``days_diff``; also embedded in the
            output column names.

    Returns:
        DataFrame with one row per user that has at least one matching row:
        a ``user`` column followed by
        ``user_amount_{mean,std,max,min,sum,med,cnt}_{window}d``.
    """
    stats = {
        'user_amount_mean_{}d'.format(window): 'mean',
        'user_amount_std_{}d'.format(window): 'std',
        'user_amount_max_{}d'.format(window): 'max',
        'user_amount_min_{}d'.format(window): 'min',
        'user_amount_sum_{}d'.format(window): 'sum',
        'user_amount_med_{}d'.format(window): 'median',
        'user_amount_cnt_{}d'.format(window): 'count',
    }
    selected = df[df['days_diff'] > window]
    # Named aggregation: passing a renaming dict to SeriesGroupBy.agg() raises
    # SpecificationError on pandas >= 1.0.
    group_df = selected.groupby('user')['amount'].agg(**stats).reset_index()
    return group_df
def gen_user_window_amount_hour_features(df, window):
    """Per-user statistics of ``amount`` over rows with ``hour > window``.

    Args:
        df: DataFrame with ``user``, ``hour`` and ``amount`` columns.
        window: threshold compared against ``hour``; also embedded in the
            output column names.

    Returns:
        DataFrame with one row per user that has at least one matching row:
        a ``user`` column followed by
        ``user_amount_{mean,std,max,min,sum,med,cnt}_{window}h``.
    """
    stats = {
        'user_amount_mean_{}h'.format(window): 'mean',
        'user_amount_std_{}h'.format(window): 'std',
        'user_amount_max_{}h'.format(window): 'max',
        'user_amount_min_{}h'.format(window): 'min',
        'user_amount_sum_{}h'.format(window): 'sum',
        'user_amount_med_{}h'.format(window): 'median',
        'user_amount_cnt_{}h'.format(window): 'count',
    }
    selected = df[df['hour'] > window]
    # Named aggregation: passing a renaming dict to SeriesGroupBy.agg() raises
    # SpecificationError on pandas >= 1.0.
    group_df = selected.groupby('user')['amount'].agg(**stats).reset_index()
    return group_df
def gen_user_window_amount_week_features(df, window):
    """Per-user statistics of ``amount`` over rows with ``week == window``.

    Args:
        df: DataFrame with ``user``, ``week`` and ``amount`` columns.
        window: value that ``week`` must equal; also embedded in the output
            column names.

    Returns:
        DataFrame with one row per user that has at least one matching row:
        a ``user`` column followed by
        ``user_amount_{mean,std,max,min,sum,med,cnt}_{window}w``.
    """
    stats = {
        'user_amount_mean_{}w'.format(window): 'mean',
        'user_amount_std_{}w'.format(window): 'std',
        'user_amount_max_{}w'.format(window): 'max',
        'user_amount_min_{}w'.format(window): 'min',
        'user_amount_sum_{}w'.format(window): 'sum',
        'user_amount_med_{}w'.format(window): 'median',
        'user_amount_cnt_{}w'.format(window): 'count',
    }
    selected = df[df['week'] == window]
    # Named aggregation: passing a renaming dict to SeriesGroupBy.agg() raises
    # SpecificationError on pandas >= 1.0.
    group_df = selected.groupby('user')['amount'].agg(**stats).reset_index()
    return group_df
2、RFM特征
通過調查資料,我們了解了RFM模型(Recency、Frequency、Monetary,即最近一次消費、消費頻率、消費金額)。它是衡量客戶價值和客戶創利能力的重要工具和手段,根據這個思路我們構造出了很多有用的特徵,具體見下圖。

3、TF-IDF特征
我們對操作模式和操作型別提取TF-IDF特徵。
下面展示相關代碼:
def gen_user_tfidf_features(df, value):
    """Build 10 SVD-reduced TF-IDF features per user from column ``value``.

    Each user's values of ``value`` are joined with commas into one text
    document; the documents are TF-IDF vectorised and reduced with
    TruncatedSVD.

    Args:
        df: DataFrame with a ``user`` column and the column named by ``value``.
            Side effect: ``df[value]`` is overwritten with its string form,
            missing values becoming ``'-1'``.
        value: name of the column to turn into TF-IDF features.

    Returns:
        DataFrame with one row per user: ``user`` plus the columns
        ``svd_tfidf_{value}_0`` .. ``svd_tfidf_{value}_9``.
    """
    # Mark missing values explicitly. The mask is taken from the original
    # column because astype(str) turns NaN into the string 'nan', after which
    # a fillna() has nothing left to fill.
    raw = df[value]
    df[value] = raw.astype(str).mask(raw.isna(), '-1')

    # One comma-joined "document" of values per user
    group_df = df.groupby('user')[value].agg(','.join).reset_index()
    group_df.columns = ['user', 'list']

    # NOTE(review): TfidfVectorizer's default token_pattern only keeps tokens of
    # two or more word characters, so single-character codes (and '-1') produce
    # no term -- confirm this is intended.
    enc_vec = TfidfVectorizer()
    tfidf_vec = enc_vec.fit_transform(group_df['list'])

    # Reduce the sparse TF-IDF matrix to a fixed number of dense components
    svd_enc = TruncatedSVD(n_components=10, n_iter=20, random_state=2020)
    vec_svd = pd.DataFrame(svd_enc.fit_transform(tfidf_vec))
    vec_svd.columns = ['svd_tfidf_{}_{}'.format(value, i) for i in range(vec_svd.shape[1])]

    # Both frames carry a default RangeIndex here, so concat aligns row by row
    group_df = pd.concat([group_df, vec_svd], axis=1)
    del group_df['list']
    return group_df
def gen_user_tfidf_features(df, value):
    """Build 10 SVD-reduced TF-IDF features per user from column ``value``.

    Each user's values of ``value`` are joined with commas into one text
    document; the documents are TF-IDF vectorised and reduced with
    TruncatedSVD.

    Args:
        df: DataFrame with a ``user`` column and the column named by ``value``.
            Side effect: ``df[value]`` is overwritten with its string form,
            missing values becoming ``'-1'``.
        value: name of the column to turn into TF-IDF features.

    Returns:
        DataFrame with one row per user: ``user`` plus the columns
        ``svd_tfidf_{value}_0`` .. ``svd_tfidf_{value}_9``.
    """
    # Mark missing values explicitly. The mask is taken from the original
    # column because astype(str) turns NaN into the string 'nan', after which
    # a fillna() has nothing left to fill.
    raw = df[value]
    df[value] = raw.astype(str).mask(raw.isna(), '-1')

    # One comma-joined "document" of values per user
    group_df = df.groupby('user')[value].agg(','.join).reset_index()
    group_df.columns = ['user', 'list']

    # NOTE(review): TfidfVectorizer's default token_pattern only keeps tokens of
    # two or more word characters, so single-character codes (and '-1') produce
    # no term -- confirm this is intended.
    enc_vec = TfidfVectorizer()
    tfidf_vec = enc_vec.fit_transform(group_df['list'])

    # Reduce the sparse TF-IDF matrix to a fixed number of dense components
    svd_enc = TruncatedSVD(n_components=10, n_iter=20, random_state=2020)
    vec_svd = pd.DataFrame(svd_enc.fit_transform(tfidf_vec))
    vec_svd.columns = ['svd_tfidf_{}_{}'.format(value, i) for i in range(vec_svd.shape[1])]

    # Both frames carry a default RangeIndex here, so concat aligns row by row
    group_df = pd.concat([group_df, vec_svd], axis=1)
    del group_df['list']
    return group_df
四、模型融合
我們採用了三個模型:LightGBM、XGBoost、CatBoost,並以多組參數進行模型融合。
具體模型相關性分析見下圖:


下面展示一些 行內代碼片,
def lgb_model(train, target, test, k):
    """Train LightGBM with stratified K-fold CV and predict ``test``.

    All columns of ``train`` except ``user`` and ``label`` are used as
    features. Per-feature gain importances are printed and written to
    ``../importance/08_26_452.csv``.

    Args:
        train: training DataFrame.
        target: binary labels, one per row of ``train`` and in the same order
            (Series or array-like).
        test: DataFrame containing the same feature columns as ``train``.
        k: number of stratified folds.

    Returns:
        Tuple ``(output_preds, oof_probs, mean_auc)``: test predictions
        averaged over folds and seeds, out-of-fold predictions for ``train``,
        and the mean of the per-fold validation AUCs.
    """
    feats = [f for f in train.columns if f not in ['user', 'label']]
    print('Current num of features:', len(feats))

    oof_probs = np.zeros(train.shape[0])
    output_preds = 0
    offline_score = []
    fold_importances = []
    parameters = {
        'learning_rate': 0.01,
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'num_leaves': 68,
        'feature_fraction': 0.4,
        'bagging_fraction': 0.8,
        'min_data_in_leaf': 25,
        'verbose': -1,
        'nthread': 8,
        'max_depth': 8,
    }

    # split() yields positional indices. Indexing a plain array keeps the
    # lookup positional even when `target` is a Series with a non-default index.
    labels = np.asarray(target)

    seeds = [2020]
    for seed in seeds:
        folds = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)
        for i, (train_index, test_index) in enumerate(folds.split(train, labels)):
            train_y, test_y = labels[train_index], labels[test_index]
            train_X = train[feats].iloc[train_index, :]
            test_X = train[feats].iloc[test_index, :]

            dtrain = lgb.Dataset(train_X, label=train_y)
            dval = lgb.Dataset(test_X, label=test_y)
            # NOTE(review): early_stopping_rounds / verbose_eval are rejected by
            # LightGBM >= 4.0, which expects
            # callbacks=[lgb.early_stopping(200), lgb.log_evaluation(100)]
            # instead -- switch once the installed version is confirmed.
            # The booster is named `model` so it does not shadow this function.
            model = lgb.train(
                parameters,
                dtrain,
                num_boost_round=5000,
                valid_sets=[dval],
                early_stopping_rounds=200,
                verbose_eval=100,
            )

            # Accumulate so that several seeds are averaged, not overwritten
            oof_probs[test_index] += model.predict(
                test_X, num_iteration=model.best_iteration) / len(seeds)
            offline_score.append(model.best_score['valid_0']['auc'])
            output_preds += model.predict(
                test[feats], num_iteration=model.best_iteration) / folds.n_splits / len(seeds)
            print(offline_score)

            # feature importance of this fold
            fold_importance_df = pd.DataFrame()
            fold_importance_df["feature"] = feats
            fold_importance_df["importance"] = model.feature_importance(importance_type='gain')
            fold_importance_df["fold"] = i + 1
            fold_importances.append(fold_importance_df)

    feature_importance_df = pd.concat(fold_importances, axis=0)
    print('OOF-MEAN-AUC:%.6f, OOF-STD-AUC:%.6f' % (np.mean(offline_score), np.std(offline_score)))
    print('feature importance:')
    mean_importance = (feature_importance_df.groupby(['feature'])['importance']
                       .mean().sort_values(ascending=False))
    print(mean_importance.head(310))
    mean_importance.head(457).to_csv('../importance/08_26_452.csv')

    return output_preds, oof_probs, np.mean(offline_score)
def lgb_model(train, target, test, k):
    """Train LightGBM with stratified K-fold CV and predict ``test``.

    All columns of ``train`` except ``user`` and ``label`` are used as
    features. Per-feature gain importances are printed and written to
    ``../importance/08_26_452.csv``.

    Args:
        train: training DataFrame.
        target: binary labels, one per row of ``train`` and in the same order
            (Series or array-like).
        test: DataFrame containing the same feature columns as ``train``.
        k: number of stratified folds.

    Returns:
        Tuple ``(output_preds, oof_probs, mean_auc)``: test predictions
        averaged over folds and seeds, out-of-fold predictions for ``train``,
        and the mean of the per-fold validation AUCs.
    """
    feats = [f for f in train.columns if f not in ['user', 'label']]
    print('Current num of features:', len(feats))

    oof_probs = np.zeros(train.shape[0])
    output_preds = 0
    offline_score = []
    fold_importances = []
    parameters = {
        'learning_rate': 0.01,
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'num_leaves': 68,
        'feature_fraction': 0.4,
        'bagging_fraction': 0.8,
        'min_data_in_leaf': 25,
        'verbose': -1,
        'nthread': 8,
        'max_depth': 8,
    }

    # split() yields positional indices. Indexing a plain array keeps the
    # lookup positional even when `target` is a Series with a non-default index.
    labels = np.asarray(target)

    seeds = [2020]
    for seed in seeds:
        folds = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)
        for i, (train_index, test_index) in enumerate(folds.split(train, labels)):
            train_y, test_y = labels[train_index], labels[test_index]
            train_X = train[feats].iloc[train_index, :]
            test_X = train[feats].iloc[test_index, :]

            dtrain = lgb.Dataset(train_X, label=train_y)
            dval = lgb.Dataset(test_X, label=test_y)
            # NOTE(review): early_stopping_rounds / verbose_eval are rejected by
            # LightGBM >= 4.0, which expects
            # callbacks=[lgb.early_stopping(200), lgb.log_evaluation(100)]
            # instead -- switch once the installed version is confirmed.
            # The booster is named `model` so it does not shadow this function.
            model = lgb.train(
                parameters,
                dtrain,
                num_boost_round=5000,
                valid_sets=[dval],
                early_stopping_rounds=200,
                verbose_eval=100,
            )

            # Accumulate so that several seeds are averaged, not overwritten
            oof_probs[test_index] += model.predict(
                test_X, num_iteration=model.best_iteration) / len(seeds)
            offline_score.append(model.best_score['valid_0']['auc'])
            output_preds += model.predict(
                test[feats], num_iteration=model.best_iteration) / folds.n_splits / len(seeds)
            print(offline_score)

            # feature importance of this fold
            fold_importance_df = pd.DataFrame()
            fold_importance_df["feature"] = feats
            fold_importance_df["importance"] = model.feature_importance(importance_type='gain')
            fold_importance_df["fold"] = i + 1
            fold_importances.append(fold_importance_df)

    feature_importance_df = pd.concat(fold_importances, axis=0)
    print('OOF-MEAN-AUC:%.6f, OOF-STD-AUC:%.6f' % (np.mean(offline_score), np.std(offline_score)))
    print('feature importance:')
    mean_importance = (feature_importance_df.groupby(['feature'])['importance']
                       .mean().sort_values(ascending=False))
    print(mean_importance.head(310))
    mean_importance.head(457).to_csv('../importance/08_26_452.csv')

    return output_preds, oof_probs, np.mean(offline_score)
五、規則上分
通過分析,我們發現當用戶的餘額等級為1、產品金額等級為21時,用戶風險率極大,我們將其結果置為原來的1/2。

具體代碼檔案在這裡:https://github.com/poplar1hhh/yipay ,希望大家給個star,記得雙擊么么噠!
轉載請註明出處,本文鏈接:https://www.uj5u.com/qita/226182.html
標籤:其他
