本文共 5094 字,大约阅读时间需要 16 分钟。
???????????????????????????????????????????????????????????????
??????????????????
# Load the training and test sets; the first CSV column becomes the index.
train = pd.read_csv(
    '../input/house-prices-advanced-regression-techniques/train.csv',
    header=0,
    index_col=0,
)
test = pd.read_csv(
    '../input/house-prices-advanced-regression-techniques/test.csv',
    header=0,
    index_col=0,
)
# NOTE(review): the article text that followed this snippet was lost to an
# encoding error in the source.
# Descriptive statistics of the training set, transposed so that each
# described column becomes a row.
train.describe().T
???????
# Count how many columns there are of each dtype.
train.dtypes.value_counts()
??missingno??????????
# Draw the missing-value matrix of the training set.
# NOTE(review): `mg` is presumably the alias under which missingno was
# imported — confirm against the import cell, which is not shown here.
mg.matrix(train)
????????????
def missing_percentage(df):
    """Summarise the columns of *df* that contain missing values.

    Returns a DataFrame indexed by column name and sorted by missing count
    in descending order, with two columns: ``Total`` (number of nulls) and
    ``Percentage`` (share of rows that are null, rounded to 2 decimals).
    Columns without any missing value are omitted.
    """
    # Count the nulls once instead of recomputing the same Series four times.
    null_counts = df.isnull().sum().sort_values(ascending=False)
    total = null_counts[null_counts != 0]
    # Filtering before dividing also keeps an empty frame from producing
    # NaN percentages (0 / 0) that would slip past a "!= 0" filter.
    percentage = (total * 100 / len(df)).round(2)
    return pd.concat([total, percentage], axis=1, keys=['Total', 'Percentage'])


missing_percentage(train)
????SalePrice??????
def plot_1(df, feature):
    """Draw three stacked diagnostic plots for one column of *df*.

    From top to bottom: a distribution plot, a probability plot and a
    horizontal box plot of ``df[feature]``, on a 10x24 figure.
    """
    style.use('fivethirtyeight')
    _, axes = plt.subplots(3, 1, constrained_layout=True, figsize=(10, 24))
    values = df.loc[:, feature]
    # Distribution plot with a normalised histogram.
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
    # kept as-is so the rendered figure does not change.
    sns.distplot(values, norm_hist=True, ax=axes[0])
    # Probability plot of the values (scipy.stats.probplot).
    stats.probplot(values, plot=axes[1])
    # Horizontal box plot. The data is passed by keyword because the meaning
    # of the first positional argument differs between seaborn versions.
    sns.boxplot(x=values, orient='h', ax=axes[2])


plot_1(train, 'SalePrice')
# NOTE(review): the article text that followed this snippet was lost to an
# encoding error in the source.
# Skewness and kurtosis of SalePrice.
train.SalePrice.skew(), train.SalePrice.kurtosis()
??SalePrice?OverallQual????
def customized_cat_boxplot(y, x):
    """Draw a box plot of *y* for each value of *x* on a 12x8 figure."""
    style.use('fivethirtyeight')
    plt.subplots(figsize=(12, 8))
    sns.boxplot(y=y, x=x)


customized_cat_boxplot(train.SalePrice, train.OverallQual)
# Next section (heading partly lost to an encoding error in the source):
# SalePrice vs. GrLivArea.
def customized_num_scatterplot(y, x):
    """Draw a scatter plot of *y* against *x* on a 12x8 figure."""
    style.use('fivethirtyeight')
    plt.subplots(figsize=(12, 8))
    sns.scatterplot(y=y, x=x)


customized_num_scatterplot(train.SalePrice, train.GrLivArea)
# NOTE(review): the article text that followed this snippet was lost to an
# encoding error in the source.
# Keep only rows with GrLivArea below 4500, then renumber the index from 0
# (the previous index is discarded).
train = train[train.GrLivArea < 4500]
train.reset_index(drop=True, inplace=True)
??????
# Residual plot of SalePrice regressed on GrLivArea, on a 12x8 figure.
plt.subplots(figsize=(12, 8))
# x and y are passed by keyword: current seaborn releases accept them as
# keyword-only arguments, and the keyword form also works on older releases.
sns.residplot(x=train.GrLivArea, y=train.SalePrice)
?????
# Replace SalePrice with log(1 + SalePrice).
train['SalePrice'] = np.log1p(train['SalePrice'])
??????
# Draw the plot_1 diagnostic plots for SalePrice again.
plot_1(train, 'SalePrice')
??????????????
# Categorical columns whose missing entries are filled with the literal
# string 'None'.
missing_val_col = [
    "Alley", "PoolQC", "MiscFeature", "Fence", "FireplaceQu",
    "GarageType", "GarageFinish", "GarageQual", "GarageCond",
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'MasVnrType',
]
for col in missing_val_col:
    all_data[col] = all_data[col].fillna('None')

# Numeric columns whose missing entries are filled with 0.
missing_val_col2 = [
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    'BsmtFullBath', 'BsmtHalfBath',
    'GarageYrBlt', 'GarageArea', 'GarageCars',
    'MasVnrArea',
]
for col in missing_val_col2:
    all_data[col] = all_data[col].fillna(0)

# Fill missing LotFrontage with the mean LotFrontage of the same Neighborhood.
all_data['LotFrontage'] = all_data.groupby('Neighborhood')['LotFrontage'].transform(
    lambda x: x.fillna(x.mean())
)
# NOTE(review): the article text that followed this snippet was lost to an
# encoding error in the source.
# Derived features built by summing existing columns.
all_data['TotalFS'] = (
    all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']
)
all_data['YrBltAndRemod'] = all_data['YearBuilt'] + all_data['YearRemodAdd']
all_data['Total_sqr_footage'] = (
    all_data['BsmtFinSF1']
    + all_data['BsmtFinSF2']
    + all_data['1stFlrSF']
    + all_data['2ndFlrSF']
)
# Half baths are weighted 0.5, full baths 1.
all_data['Total_Bathrooms'] = (
    all_data['FullBath']
    + (0.5 * all_data['HalfBath'])
    + all_data['BsmtFullBath']
    + (0.5 * all_data['BsmtHalfBath'])
)
all_data['Total_porch_sf'] = (
    all_data['OpenPorchSF']
    + all_data['3SsnPorch']
    + all_data['EnclosedPorch']
    + all_data['ScreenPorch']
    + all_data['WoodDeckSF']
)
???????
# 0/1 indicator columns: 1 where the source column is greater than 0, else 0.
# A vectorised comparison replaces the per-row apply(lambda ...); missing
# values still map to 0 because NaN > 0 is False.
all_data['haspool'] = (all_data['PoolArea'] > 0).astype(int)
all_data['has2ndfloor'] = (all_data['2ndFlrSF'] > 0).astype(int)
all_data['hasgarage'] = (all_data['GarageArea'] > 0).astype(int)
all_data['hasbsmt'] = (all_data['TotalBsmtSF'] > 0).astype(int)
all_data['hasfireplace'] = (all_data['Fireplaces'] > 0).astype(int)
????????
def overfit_reducer(df):
    """Return the names of the columns of *df* dominated by a single value.

    A column is reported when its most frequent value accounts for more than
    99.94 % of the rows of *df*. Columns with no countable value are never
    reported.
    """
    overfit = []
    n_rows = len(df)
    for column in df.columns:
        counts = df[column].value_counts()
        # value_counts() leaves out NaN, so an all-NaN column (or an empty
        # frame) has no counts; skip it instead of failing on iloc[0].
        if counts.empty:
            continue
        # value_counts() sorts by frequency, so the first entry is the count
        # of the most common value.
        top_count = counts.iloc[0]
        if top_count / n_rows * 100 > 99.94:
            overfit.append(column)
    return overfit


# Drop the columns flagged on the training features from both feature sets.
overfitted_features = overfit_reducer(X_train)
X_train = X_train.drop(overfitted_features, axis=1)
X_test = X_test.drop(overfitted_features, axis=1)
??one-hot???
# Convert numerically coded columns to strings.
all_data['MSSubClass'] = all_data['MSSubClass'].astype(str)
# Fill missing MSZoning with the most frequent MSZoning of the same MSSubClass.
all_data['MSZoning'] = all_data.groupby('MSSubClass')['MSZoning'].transform(
    lambda x: x.fillna(x.mode()[0])
)
all_data['YrSold'] = all_data['YrSold'].astype(str)
all_data['MoSold'] = all_data['MoSold'].astype(str)

# Fill the remaining categorical gaps with a fixed value or the column mode.
all_data['Functional'] = all_data['Functional'].fillna('Typ')
all_data['Utilities'] = all_data['Utilities'].fillna('AllPub')
all_data['Exterior1st'] = all_data['Exterior1st'].fillna(
    all_data['Exterior1st'].mode()[0]
)
all_data['Exterior2nd'] = all_data['Exterior2nd'].fillna(
    all_data['Exterior2nd'].mode()[0]
)
all_data['KitchenQual'] = all_data['KitchenQual'].fillna("TA")
all_data['SaleType'] = all_data['SaleType'].fillna(all_data['SaleType'].mode()[0])
all_data['Electrical'] = all_data['Electrical'].fillna("SBrkr")
# NOTE(review): the article text that followed this snippet was lost to an
# encoding error in the source.
转载地址:http://xusv.baihongyu.com/