我想在管道中對我的資料進行所有自定義轉換。我認為pipe.fit_transform(X)在模型中使用它之前我可以使用它來轉換我的 X,但我也認為我將能夠附加到管道模型本身并簡單地使用它作為一個 using pipe.steps.append(('model', self.model)).
不幸的是,在構建完所有內容之后,我注意到在轉換資料并直接在模型中使用它與在一個管道中執行所有操作時,我得到了不同的結果。有沒有人經歷過這樣的事情?
添加代碼:
# Base pipeline to be used
BASE_PIPE = Pipeline([
('dim_increase_num', data_num_mix()),
('dim_increase_cat', data_cat_mix()),
('start', data_get_dummies()),
('dm_correlation', data_x_corr_()),
('scaler', DFStandardScaler()),
('column_ectraction', ColumnExtractor(columns_catboost)),
])
class base_model_class:
def fit_predict(self, X_train:pd.DataFrame=X_train, y_train:pd.Series=y_train, X_test:pd.DataFrame=X_test):
return self.fit(X_train, y_train).predict(X_test)
def evaluate(self, X:pd.DataFrame=X, y:pd.Series=y):
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=0)
y_pred = self.fit(X_train, y_train).predict(X_test)
result= r2_score(y_test, y_pred)
return result
class model_linear_regression(base_model_class):
def __init__(self, pipe=None, inverse=False):
self.name = 'Linear_Regression'
self.model = LinearRegression()
if pipe==None:
self.pipe = Pipeline([('model', self.model)])
else:
self.pipe = deepcopy(pipe)
self.pipe.steps.append(('model', self.model))
if inverse:
self.pipe = TransformedTargetRegressor( regressor=self.pipe,
func=np.log1p,
inverse_func=np.expm1)
def fit(self, X:pd.DataFrame=X_train, y:pd.Series=y_train):
self.pipe.fit(X, y)
return self
def predict(self, X:pd.DataFrame=X_test):
y_pred = self.pipe.predict(X)
return y_pred
然后,當使用一切時會給出不同的 R2 分數:
Xx=BASE_PIPE.fit_transform(X)
model_linear_regression(inverse=False).evaluate(Xx,y)
>>> 0.7415005607713974
model_linear_regression(BASE_PIPE, inverse=False).evaluate(X,y)
>>> -6.306970505602111e 22
編輯: 提供使用的管道中的所有步驟:
class data_num_mix(BaseEstimator, TransformerMixin):
def __init__(self, columns:list=NUMERIC_FEATURES):
self.columns = columns
def fit(self, X, y = None):
return self
def transform(self, X, y = None):
X_ = X.copy()
self.frames = [X_]
for col in self.columns:
A = pd.DataFrame(X_[col].map(lambda x: np.sqrt(x) if x>0 else -np.sqrt(-x)))
A = A.rename(columns={col:col '^s'})
self.frames = [A]
B = pd.DataFrame(X_[col] * X_[col])
B = B.rename(columns={col:col '^2'})
self.frames = [B]
return pd.concat(self.frames, axis=1)
class data_cat_mix(BaseEstimator, TransformerMixin):
def __init__(self, columns:list=CATEGORICAL_FEATURES):
self.columns = columns
def fit(self, X, y = None):
return self
def transform(self, X, y = None) -> pd.DataFrame:
X_ = X.copy()
for col in self.columns:
df_col_count = X_[col].value_counts().to_frame().reset_index()
df_col_count.columns = ["var_name", "var_count"]
df_col_count["var_freq"] = df_col_count["var_count"] / df_col_count["var_count"].sum()
X_['C_' col] = X_[col].replace(df_col_count.set_index('var_name')['var_count'])
X_['F_' col] = X_[col].replace(df_col_count.set_index('var_name')['var_freq'])
return X_
class data_get_dummies(BaseEstimator, TransformerMixin):
def __init__(self, columns:list = CATEGORICAL_FEATURES):
self.columns = columns
self.encoder = make_column_transformer((OneHotEncoder(handle_unknown="ignore", sparse=False), self.columns),remainder='passthrough')
def fit(self, X, y = None):
self.encoder.fit(X)
return self
def transform(self, X, y = None) -> pd.DataFrame:
X_ = X.copy()
encoder_columns = self.encoder.get_feature_names_out()
fixed_columns = [x.replace('onehotencoder__','').replace('remainder__','') for x in encoder_columns ]
df_temp=pd.DataFrame(self.encoder.transform(X_), columns=fixed_columns)
return df_temp
class data_x_corr(BaseEstimator, TransformerMixin):
def __init__(self, columns:list=NUMERIC_FEATURES_, corr_val:float=0.95):
self.columns = columns
self.corr_val = corr_val
def fit(self, X, y=None):
return self
def transform(self, X, y=None):
# prepare numeric df
X_ = X.copy()
x = X_[self.columns]
corr_matrix = x.corr(method='spearman')
iters = range(len(corr_matrix.columns) - 1)
drop_cols = []
# Iterates through Correlation Matrix Table to find correlated columns
for i in iters:
for j in range(i):
item = corr_matrix.iloc[j:(j 1), (i 1):(i 2)]
col = item.columns
row = item.index
val = item.values
if val >= self.corr_val:
drop_cols.append(i)
drops = sorted(set(drop_cols))[::-1]
# Drops the correlated columns
for i in drops:
col = x.iloc[:, (i 1):(i 2)].columns.values
X_ = X_.drop(col, axis=1)
return X_
class DFStandardScaler(TransformerMixin):
# StandardScaler but for pandas DataFrames
def __init__(self):
self.ss = None
self.mean_ = None
self.scale_ = None
def fit(self, X, y=None):
self.ss = StandardScaler()
self.ss.fit(X)
self.mean_ = pd.Series(self.ss.mean_, index=X.columns)
self.scale_ = pd.Series(self.ss.scale_, index=X.columns)
return self
def transform(self, X) -> pd.DataFrame:
# assumes X is a DataFrame
Xss = self.ss.transform(X)
Xscaled = pd.DataFrame(Xss, index=X.index, columns=X.columns)
return Xscaled
def __str__(self):
return "DF_StandardScaler"
def __repr__(self):
return "DF_StandardScaler"
class ColumnExtractor(TransformerMixin, BaseEstimator):
def __init__(self, cols):
self.cols = cols
def fit(self, X, y=None):
# stateless transformer
return self
def transform(self, X):
# assumes X is a DataFrame
Xcols = X[self.cols]
return Xcols
uj5u.com熱心網友回復:
對我來說突出的一個轉換器是data_cat_mix,特別是計數列。當應用于 train test 時,這些是一致的(但會泄漏測驗資訊);當單獨應用時,train 中的值通常會高得多(只是因為它的大小大了三倍),所以模型并不真正理解如何在測驗集中處理它們。
轉載請註明出處,本文鏈接:https://www.uj5u.com/yidong/453459.html
上一篇:使用sklearn同時預測欄位
