
Data Science 101

focalpoint 2022. 12. 5. 01:49


 

1. Data Type and Null Check

all_df.info()

1.1. Nullity 

#missing data
total = all_df.isnull().sum().sort_values(ascending=False)
percent = (all_df.isnull().sum()/all_df.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data = missing_data.drop(['SalePrice'], axis=0)
missing_data.head(20)

 

1.2. Feature Engineering Example

# Convert sparse amenity columns to binary indicators: 1 if present, 0 if missing
cols = ['PoolQC', 'MiscFeature', 'Alley', 'Fence']
for col in cols:
    all_df.loc[~all_df[col].isnull(), col] = 1
    all_df.loc[all_df[col].isnull(), col] = 0
# Combine three of the indicators into one count feature, then drop the originals (Fence stays as its own flag)
all_df['hasExpensiveFeature'] = all_df['PoolQC'] + all_df['MiscFeature'] + all_df['Alley']
all_df['hasExpensiveFeature'] = all_df['hasExpensiveFeature'].astype(int)
all_df['hasExpensiveFeature'].value_counts()
all_df = all_df.drop(['PoolQC', 'MiscFeature', 'Alley'], axis=1)

 

1.3. Filling in Null Values

# Fill missing categorical entries with the most frequent value (mode)
all_df.loc[all_df.Exterior1st.isnull(), 'Exterior1st'] = all_df.Exterior1st.value_counts().index[0]
all_df.loc[all_df.Exterior2nd.isnull(), 'Exterior2nd'] = all_df.Exterior2nd.value_counts().index[0]
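
The two columns above are filled with their most frequent value. A more general pass over any remaining nulls might look like the following sketch (an addition for illustration, not from the original post): numeric columns get the median, everything else gets the mode.

# Hypothetical general-purpose fill (sketch): median for numeric columns, mode for the rest.
# Skips the target in case all_df still contains it with intentionally missing test rows.
for col in all_df.columns[all_df.isnull().any()]:
    if col == 'SalePrice':
        continue
    if pd.api.types.is_numeric_dtype(all_df[col]):
        all_df[col] = all_df[col].fillna(all_df[col].median())
    else:
        all_df[col] = all_df[col].fillna(all_df[col].value_counts().index[0])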

 

 

2. Feature Types

- Numerical

- Nominal: Categorical without order

- Ordinal: Categorical with order (a sketch for splitting columns into these groups follows below)
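
A minimal sketch for splitting columns into these three groups. The names continuous_variables, nominal_variables, and ordinal_variables match the ones used later in this post; the ordinal list has to be curated by hand from the data description, and the subset shown here is only illustrative.

# Split columns by dtype; ordinal columns must be listed manually (illustrative subset)
ordinal_variables = ['ExterQual', 'KitchenQual', 'HeatingQC']  # curated by hand
continuous_variables = [c for c in all_df.select_dtypes(include='number').columns
                        if c not in ordinal_variables + ['SalePrice']]
nominal_variables = [c for c in all_df.select_dtypes(include='object').columns
                     if c not in ordinal_variables]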

 

2.1. Exploring Numerical Features

2.1.1. Correlations

# Heatmap of the k features most strongly correlated with SalePrice
corrmat = train_df.corr()
k = 10  # number of variables for heatmap
cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index
cm = np.corrcoef(train_df[cols].values.T)
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)
plt.show()

 

2.1.2. Scatter Plot & Removing Outliers

var = 'BsmtFinSF1'
data = pd.concat([all_df['SalePrice'], all_df[var]], axis=1)
data.plot.scatter(x=var, y='SalePrice');

# Drop the extreme outliers revealed by the scatter plot
all_df = all_df[all_df.BsmtFinSF1 < 5000]

 

2.1.3. Distribution Plot & Normalizing

- Taking the log is useful for skewed features (a skewness-based selection sketch follows the plotting code below)

f, axs = plt.subplots(1, len(continuous_variables), figsize=(100,5))
for i, cv in enumerate(continuous_variables):
    # sns.distplot is deprecated in recent seaborn; sns.histplot(..., kde=True) is the replacement
    sns.distplot(all_df[cv], ax=axs[i])
plt.show()
all_df['SalePrice'] = np.log(all_df['SalePrice'])
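
The post only log-transforms SalePrice. A sketch for picking other skewed numeric features automatically (the 0.75 threshold and the log1p choice are assumptions, not from the original post):

# Log1p-transform numeric features whose absolute skewness exceeds a chosen threshold
skewness = all_df[continuous_variables].skew().abs().sort_values(ascending=False)
skewed_cols = skewness[skewness > 0.75].index
all_df[skewed_cols] = np.log1p(all_df[skewed_cols])  # log1p assumes non-negative values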

 

2.1.4. Categorizing Target Value for Visualization

# Bucket SalePrice into quartile classes (1-4) for hue-based visualization
analytic_train_df['SalePriceClass'] = 4
first_quartile = analytic_train_df.SalePrice.describe()['25%']
second_quartile = analytic_train_df.SalePrice.describe()['50%']
third_quartile = analytic_train_df.SalePrice.describe()['75%']
analytic_train_df.loc[analytic_train_df['SalePrice'] < first_quartile, 'SalePriceClass'] = 1
analytic_train_df.loc[(analytic_train_df['SalePrice'] >= first_quartile) &
                    (analytic_train_df['SalePrice'] < second_quartile), 'SalePriceClass'] = 2
analytic_train_df.loc[(analytic_train_df['SalePrice'] >= second_quartile) &
                    (analytic_train_df['SalePrice'] < third_quartile), 'SalePriceClass'] = 3
analytic_train_df
f, ax = plt.subplots(1, 2, figsize=(25, 5))
sns.distplot(analytic_train_df.GrLivArea, ax=ax[0])
sns.histplot(data=analytic_train_df, x='GrLivArea', hue='SalePriceClass', ax=ax[1], element='step')

for i in range(2):
    ax[i].spines['top'].set_visible(False)
    ax[i].spines['right'].set_visible(False)
    ax[i].set_xlabel('GrLivArea', weight='bold', size=15)
    ax[i].set_ylabel('Density', weight='bold', size=15)
    ax[i].set_facecolor('#f6f5f5')
f.suptitle("GrLivArea's distribution", weight='bold', size=20)
plt.show()

def continuous_dist(data, x, y):
    f, ax = plt.subplots(1, 3, figsize=(35, 10))
    sns.violinplot(x=data[x], y=data[y], ax=ax[0], edgecolor='black', linewidth=5)
    sns.boxplot(x=data[x], y=data[y], ax=ax[1])
    sns.stripplot(x=data[x], y=data[y], ax=ax[2])
    for i in range(3):
        ax[i].spines['top'].set_visible(False)
        ax[i].spines['right'].set_visible(False)
        ax[i].set_xlabel(x, weight='bold', size=20)
        ax[i].set_ylabel(y, weight='bold', size=20)
        ax[i].set_facecolor('#f6f5f5')
    f.suptitle(f"{y}'s distribution by {x}", weight='bold', size=25)
    plt.show()
continuous_dist(analytic_train_df, x='SalePriceClass', y='GrLivArea')

 

2.2.1. Handling Nominal Features & Label Encoding

from sklearn.preprocessing import LabelEncoder

for nv in nominal_variables:
    le = LabelEncoder()
    le = le.fit(all_df[nv])
    all_df[nv] = le.transform(all_df[nv])
    all_df[nv] = all_df[nv].astype('category')

If you need to deal with null values:

for cat in categories:
    le = LabelEncoder()
    all_df[cat].fillna("missing", inplace=True)
    le = le.fit(all_df[cat])
    all_df[cat] = le.transform(all_df[cat])
    all_df[cat] = all_df[cat].astype('category')

 

2.2.2. One-hot Encoding

# drop_first=True drops one redundant dummy column (Sex is binary, so a single indicator suffices)
train_df_corr = pd.get_dummies(train_df, columns=['Sex'], drop_first=True)
train_df_corr = pd.get_dummies(train_df_corr, columns=['Embarked'])
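
The two lines above happen to use Titanic columns (Sex, Embarked). As an alternative to the label encoding in 2.2.1, the same idea applied to this post's nominal house-price columns might look like this sketch (assuming nominal_variables holds the nominal column names):

# One-hot encode all nominal columns at once; drop_first drops one redundant dummy per column
all_df = pd.get_dummies(all_df, columns=nominal_variables, drop_first=True)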

 

 

2.3.1. Handling Ordinal Features & Scatter Plot

# Map ordinal categories to integer ranks (the "NA" keys assume nulls were already filled with the string "NA")
all_df = all_df.replace({
                        "HeatingQC" : {"Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                        "LotShape" : {"IR3" : 1, "IR2" : 2, "IR1" : 3, "Reg" : 4},
                        "GarageFinish" : {'NA' : 0, 'Unf' : 1, 'RFn' : 2, 'Fin' : 3},
                        "BsmtExposure" : {"NA" : 0, "No" : 1, "Mn" : 2, "Av": 3, "Gd" : 4},
                        "BsmtQual" : {"NA": 0, "Po" : 1, "Fa" : 2, "TA": 3, "Gd" : 4, "Ex" : 5},
                        "ExterQual" : {"Po" : 1, "Fa" : 2, "TA": 3, "Gd": 4, "Ex" : 5},
                        "BsmtFinType2" : {"NA" : 0, "Unf" : 1, "LwQ": 2, "Rec" : 3, "BLQ" : 4, 
                                         "ALQ" : 5, "GLQ" : 6},
                        "BsmtCond" : {"NA" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                        "GarageQual" : {"NA" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                        "FireplaceQu" : {"NA" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                        "Functional" : {"Sal" : 1, "Sev" : 2, "Maj2" : 3, "Maj1" : 4, "Mod": 5, 
                                       "Min2" : 6, "Min1" : 7, "Typ" : 8},
                        "LandSlope" : {"Sev" : 1, "Mod" : 2, "Gtl" : 3},
                        "ExterCond" : {"Po" : 1, "Fa" : 2, "TA": 3, "Gd": 4, "Ex" : 5},
                        "KitchenQual" : {"Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                        "GarageCond" : {"NA" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                        "PavedDrive" : {"N" : 0, "P" : 1, "Y" : 2},
                        "BsmtFinType1" : {"NA" : 0, "Unf" : 1, "LwQ": 2, "Rec" : 3, "BLQ" : 4, 
                                         "ALQ" : 5, "GLQ" : 6},
                         })
f, axs = plt.subplots(1, len(ordinal_variables), figsize=(100,5))
for i, ov in enumerate(ordinal_variables):
    sns.scatterplot(data=all_df, x=ov, y='SalePrice', ax=axs[i])
plt.show()

 

2.3.2. Visualizing Ordinal Features

def cat_dist(data, var, hue, msg_show=True):
    total_cnt = data[var].count()
    f, ax = plt.subplots(1, 2, figsize=(25, 8))
    hues = [None, hue]
    titles = [f"{var}'s distribution", f"{var}'s distribution by {hue}"]

    for i in range(2):
        sns.countplot(data=data, x=var, hue=hues[i], edgecolor='black', linewidth=4, ax=ax[i])
        ax[i].set_xlabel(var, weight='bold', size=13)
        ax[i].set_ylabel('Count', weight='bold', size=13)
        ax[i].set_facecolor('#f6f5f5')
        ax[i].spines['top'].set_visible(False)
        ax[i].spines['right'].set_visible(False)
        ax[i].set_title(titles[i], size=15, weight='bold')
        for patch in ax[i].patches:
            x, height, width = patch.get_x(), patch.get_height(), patch.get_width()
            if msg_show:
                ax[i].text(x + width / 2, height + 3, f'{height} \n({height / total_cnt * 100:2.2f}%)', va='center', ha='center', size=12, bbox={'facecolor': 'white', 'boxstyle': 'round'})
    plt.show()
cat_dist(analytic_train_df, var='OverallQual', hue='SalePriceClass')

 

3. Prototyping

3.1. Linear Regression

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
LR = LinearRegression()

LR.fit(train_X, train_Y)
# In-sample RMSE (squared=False returns the root of the MSE)
mean_squared_error(train_Y, LR.predict(train_X), squared=False)
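
The RMSE above is measured on the same data the model was fit on. A minimal cross-validated sketch (an addition, not from the original post) gives a less optimistic baseline estimate:

from sklearn.model_selection import cross_val_score

# 5-fold CV RMSE; sklearn negates error metrics, hence the leading minus sign
cv_rmse = -cross_val_score(LinearRegression(), train_X, train_Y,
                           scoring='neg_root_mean_squared_error', cv=5)
print(cv_rmse.mean(), cv_rmse.std())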

 

3.2. LightGBM

import lightgbm as lgb
from sklearn.model_selection import KFold
class LGBM:
    def __init__(self):
        self.lgbm_params = {
            'objective': 'regression',
            'random_seed': 9999,
        }
        self.best_loss = None

    def set_data(self, train_X, train_Y):
        self.train_X = train_X
        self.train_Y = train_Y

    def test(self):
        # 3-fold cross-validation, collecting per-fold models, RMSEs, and out-of-fold predictions
        models = []
        rmses = []
        oof = np.zeros(len(self.train_X))
        for train_index, val_index in KFold(n_splits=3).split(self.train_X):
            X_train = self.train_X.iloc[train_index]
            X_valid = self.train_X.iloc[val_index]
            y_train = self.train_Y.iloc[train_index]
            y_valid = self.train_Y.iloc[val_index]

            lgb_train = lgb.Dataset(X_train, y_train)
            lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

            # Note: LightGBM >= 4.0 removed early_stopping_rounds / verbose_eval from lgb.train;
            # use callbacks=[lgb.early_stopping(20), lgb.log_evaluation(50)] there instead.
            model_lgb = lgb.train(self.lgbm_params,
                                    lgb_train,
                                    valid_sets=lgb_eval,
                                    num_boost_round=100,
                                    early_stopping_rounds=20,
                                    verbose_eval=50,
                                    )
            
            y_pred = model_lgb.predict(X_valid, num_iteration=model_lgb.best_iteration)
            tmp_rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
            
            models.append(model_lgb)
            rmses.append(tmp_rmse)
            oof[val_index] = y_pred
    
        self.current_loss = sum(rmses)/len(rmses)
        print('Current loss: ' + str(self.current_loss) + '(Previous best loss: ' + str(self.best_loss) + ')')
        if self.best_loss is None or self.current_loss < self.best_loss:
            self.best_loss = self.current_loss
        if self.best_loss == self.current_loss:
            self.best_models = models

        actual_pred_df = pd.DataFrame({
            'actual': self.train_Y,
            'pred': oof
        })
        actual_pred_df.plot(figsize=(12, 5))

        lgb.plot_importance(models[0], importance_type="gain", max_num_features=15)
lgbm = LGBM()
lgbm.set_data(train_X, train_Y)
lgbm.test()
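
The class keeps the per-fold models in best_models but never predicts with them. A common follow-up is to average the fold predictions on the test set; a minimal sketch, assuming a prepared test_X with the same columns as train_X (test_X is not defined in the post):

# Average the per-fold model predictions (simple ensembling over the CV folds)
preds = np.mean([m.predict(test_X, num_iteration=m.best_iteration)
                 for m in lgbm.best_models], axis=0)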

 

 

4. Hyperparameter Optimization
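
The code below calls target_sc.inverse_transform, and the final plot applies np.exp, which implies the target passed around as train_Y is the log price further scaled by a fitted scaler. That preparation step is not shown in the post; a minimal sketch of the assumed setup (StandardScaler is an assumption):

# Assumed upstream step (not shown in the post): scale the already log-transformed target
from sklearn.preprocessing import StandardScaler
target_sc = StandardScaler()
train_Y = target_sc.fit_transform(np.asarray(train_Y).reshape(-1, 1)).ravel()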

import optuna
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(train_X, train_Y, test_size=0.2, shuffle=False, stratify=None)
def objective(trial):
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting': 'gbdt',
        'random_seed': 42,

        'num_boost_round': 2500,
        'max_depth': trial.suggest_int('max_depth', 5, 50),
        'n_estimators': trial.suggest_int('n_estimators', 5, 50),
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-2, 1e-1, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-2, 1e-1, log=True),        
        
    }

    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

    model_lgb = lgb.train(params,
                          lgb_train,
                          valid_sets=lgb_eval,
                          early_stopping_rounds=50,
                          verbose_eval=100)
    
    y_pred = model_lgb.predict(X_valid, num_iteration=model_lgb.best_iteration)
    real_y_pred = target_sc.inverse_transform(y_pred[:, np.newaxis])
    real_y_valid = target_sc.inverse_transform(y_valid[:, np.newaxis])

    score = np.sqrt(mean_squared_error(real_y_valid, real_y_pred))
    return score
study = optuna.create_study(sampler=optuna.samplers.RandomSampler(seed=0))
study.optimize(objective, n_trials=100)
study.best_params
lgbm_params = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting': 'gbdt',
        'random_seed': 42,

        'num_boost_round': 2500,
        'max_depth': 26,
        'n_estimators': 10,
        'num_leaves': 61,
        'learning_rate': 0.08903710966970699,
        'subsample': 0.8987580600850132,
        'colsample_bytree': 0.704071490683633,
        'lambda_l1': 0.041961025695416855,
        'lambda_l2': 0.047169321672552555,
    }
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

model_lgb = lgb.train(lgbm_params,
                        lgb_train,
                        valid_sets=lgb_eval,
                        early_stopping_rounds=50,
                        verbose_eval=100)
    
y_pred = model_lgb.predict(X_valid, num_iteration=model_lgb.best_iteration)
real_y_pred = target_sc.inverse_transform(y_pred[:, np.newaxis])
real_y_valid = target_sc.inverse_transform(y_valid[:, np.newaxis])

score = np.sqrt(mean_squared_error(real_y_valid, real_y_pred))
print(score)

# Back-transform from the log scale before comparing actual vs. predicted prices
actual_pred_df = pd.DataFrame({
            'actual': np.squeeze(np.exp(real_y_valid)),
            'pred': np.squeeze(np.exp(real_y_pred)),
        })
actual_pred_df.plot(figsize=(30, 10))