Scribbling

데이터 분석 방법의 기초 - Kaggle 타이타닉 예제 본문

Computer Science/Data Science

데이터 분석 방법의 기초 - Kaggle 타이타닉 예제

focalpoint 2021. 11. 23. 12:11

데이터 분석 방법의 기초, 두 번째 포스트

Kaggle 타이타닉 예제를 다룬다.

https://www.kaggle.com/c/titanic

 

1. 데이터 살펴보기

아래 명령어로 pandas display 옵션을 설정 가능하다.

# Widen pandas' display limits so wide/long frames are not truncated in output.
for display_option, limit in (("display.max_columns", 50), ("display.max_rows", 50)):
    pd.set_option(display_option, limit)

데이터 타입

train_df.dtypes

기초 통계량

train_df.describe()

Null 확인

train_df.isnull().sum()

 

2. 개별 특성 분석하기

특성 개수 세기

# Frequency of each Cabin value in the training set.
train_df['Cabin'].value_counts()

# Pclass distribution as a bar chart.
# NOTE(review): presumably all_df is the concatenated train+test frame — confirm.
all_df.Pclass.value_counts().plot.bar()

개별 특성과 목적 변수 관계 확인하기

# Death/survival counts per embarkation port (PassengerId serves as the count column).
embarked_df = (
    train_df[["Embarked", "Survived", "PassengerId"]]
    .dropna()
    .groupby(["Embarked", "Survived"])
    .count()
    .unstack()
)

embarked_df.plot.bar(stacked=True)

# Column 0 holds the deaths, column 1 the survivals.
death_counts = embarked_df.iloc[:, 0]
survival_counts = embarked_df.iloc[:, 1]
embarked_df["survived_rate"] = survival_counts / (death_counts + survival_counts)
embarked_df

# Stacked age histogram, split by outcome.
age_by_outcome = [train_df.Age[train_df.Survived==0], train_df.Age[train_df.Survived==1]]
plt.hist(x=age_by_outcome, bins=8, histtype='barstacked', label=["Death", "Survived"])
plt.legend()

상관 계수 행렬

# Pearson correlation matrix over the encoded numeric features.
# NOTE(review): train_df_corr is produced by the one-hot encoding step later
# in the post — confirm the execution order in the actual notebook.
train_corr = train_df_corr.corr()
train_corr

Heatmap

# Correlation heatmap pinned to the [-1, 1] range, with annotated cells.
plt.figure(figsize=(9, 9))
sns.heatmap(train_corr, vmax=1, vmin=-1, center=0, annot=True)

특성간의 관계 파악하기

# Heatmap can visualize continuous values (or binary variables) in categories and categories.
# Mean Survived (i.e. survival rate) cross-tabulated: Pclass rows x Sex columns.
# NOTE(review): this uses `train`, not `train_df` — presumably the same raw frame; confirm.
plt.subplots(figsize=(10, 5))
g = sns.heatmap(train.pivot_table(index='Pclass', columns='Sex', values='Survived'), annot=True, cmap="YlGnBu")
g.set_title('Survived ratio by Pclass and Sex', weight='bold', size=15)
g.set_xlabel('Sex', weight='bold', size=13)
g.set_ylabel('Pclass', weight='bold', size=13)
plt.show()

수치 데이터 카테고리화 및 분석하기

import copy

# Bucket Age into decades: bin i covers [10*i, 10*(i+1)).
# Ages 60+ and missing ages keep the default bin 6.
tmp_train = copy.deepcopy(train)
tmp_train['AgeBin'] = 6
for decade in range(6):
    in_decade = (tmp_train.Age >= 10 * decade) & (tmp_train.Age < 10 * (decade + 1))
    tmp_train.loc[in_decade, 'AgeBin'] = decade
tmp_train.head(3)

# Survival rate by age bin, by class x age bin, raw counts, and by sex x age bin.
t0 = pd.pivot_table(index='AgeBin', values='Survived', data=tmp_train).style.background_gradient()
t1 = pd.pivot_table(index='Pclass', columns='AgeBin', values='Survived', data=tmp_train).style.background_gradient()
t2 = pd.crosstab([tmp_train.AgeBin, tmp_train.Pclass], [tmp_train.Sex, tmp_train.Survived], margins=True).style.background_gradient(vmax=100)
t3 = pd.pivot_table(index='Sex', columns='AgeBin', values='Survived', data=tmp_train).style.background_gradient()
multi_table([t2, t0, t1, t3])

 

3. 데이터 전처리

One-Hot Encoding

# One-hot encode: Sex collapses to a single dummy (drop_first), Embarked gets
# one indicator column per port.
sex_encoded = pd.get_dummies(train_df, columns=['Sex'], drop_first=True)
train_df_corr = pd.get_dummies(sex_encoded, columns=['Embarked'])
train_df_corr.head()

평균으로 결측치 메우기

# Per-class mean fare, used to impute the missing Fare values.
Fare_mean = (
    all_df[["Pclass", "Fare"]]
    .groupby("Pclass")
    .mean()
    .reset_index()
    .rename(columns={"Fare": "Fare_mean"})
)
Fare_mean

# Attach each row's class mean, fill the gaps, then drop the helper column.
all_df = pd.merge(all_df, Fare_mean, on="Pclass", how="left")
all_df

missing_fare = all_df["Fare"].isnull()
all_df.loc[missing_fare, "Fare"] = all_df["Fare_mean"]
all_df = all_df.drop("Fare_mean", axis=1)
all_df

String Data 처리하기

# Name format: "Family, Honorific. Given" — split it into three columns.
all_df["Name"].head(20)

# pandas >= 2.0 requires n/regex as keyword arguments (the positional form
# `str.split("[,.]", 2, ...)` was removed), and "[,.]" is a regex character
# class, so pass regex=True explicitly.
name_df = all_df["Name"].str.split(r"[,.]", n=2, expand=True, regex=True)
name_df.columns = ["family_name", "honorific", "name"]
name_df

# Strip the whitespace left around the separators.
name_df["family_name"] = name_df["family_name"].str.strip()
name_df["honorific"] = name_df["honorific"].str.strip()
name_df["name"] = name_df["name"].str.strip()
name_df['honorific'].value_counts()

Honorific으로 Age 결측치 메우기

# Attach the parsed name parts, then inspect the Age spread per honorific.
all_df = pd.concat([all_df, name_df], axis=1)
plt.figure(figsize=(18, 5))
sns.boxplot(x="honorific", y="Age", data=all_df)

# Mean Age per honorific, used to fill the missing ages.
honorific_age_mean = (
    all_df[["honorific", "Age"]]
    .groupby("honorific")
    .mean()
    .reset_index()
    .rename(columns={"Age": "honorific_Age"})
)

all_df = pd.merge(all_df, honorific_age_mean, on='honorific', how='left')
missing_age = all_df['Age'].isnull()
all_df.loc[missing_age, "Age"] = all_df["honorific_Age"]
all_df = all_df.drop(["honorific_Age"], axis=1)

 

라벨 인코딩

from sklearn.preprocessing import LabelEncoder

# fillna(..., inplace=True) on a single column is chained assignment: it
# warns on modern pandas and silently no-ops under copy-on-write (pandas 3.0
# default). Assign the result back instead.
all_df['Embarked'] = all_df['Embarked'].fillna('missing')
all_df.head(5)

# Encode Sex first, then every remaining object-dtype column
# (family_name / honorific / name and any other leftovers).
le = LabelEncoder()
le = le.fit(all_df['Sex'])
all_df['Sex'] = le.transform(all_df["Sex"])
for cat in all_df.columns[all_df.dtypes == 'object']:
  print(cat)
  le = LabelEncoder()
  le = le.fit(all_df[cat])
  all_df[cat] = le.transform(all_df[cat])
all_df.head(5)

# Rows with a Survived label are the train set; the rest are the test set.
train_X = all_df[~all_df["Survived"].isnull()].drop("Survived", axis=1).reset_index(drop=True)
# NOTE(review): train_Y is taken from train_df while train_X comes from all_df;
# this relies on both frames sharing row order — safer to slice Survived from
# the same all_df subset. Confirm before changing.
train_Y = train_df["Survived"]
test_X = all_df[all_df["Survived"].isnull()].drop("Survived", axis=1).reset_index(drop=True)

 

LightGBM 학습하기

1) Hold Out

import lightgbm as lgb
from sklearn.model_selection import train_test_split

# Hold-out validation: 80% train / 20% validation.
# NOTE(review): no random_state, so the split (and the score) changes per run.
X_train, X_valid, y_train, y_valid = train_test_split(train_X, train_Y, test_size=0.2)
categories = ["Embarked", "Pclass", "Sex", "honorific", "alone"]
lgb_train = lgb.Dataset(X_train, y_train, categorical_feature=categories)
lgb_eval = lgb.Dataset(X_valid, y_valid, categorical_feature=categories, reference=lgb_train)
lgbm_params = {
    "objective": "binary",
    "random_seed": 9999,
}
# LightGBM >= 4.0 removed the early_stopping_rounds / verbose_eval keyword
# arguments from lgb.train(); the callback API below works on 3.x and 4.x.
model_lgb = lgb.train(lgbm_params,
                      lgb_train,
                      valid_sets=lgb_eval,
                      num_boost_round=100,
                      callbacks=[lgb.early_stopping(stopping_rounds=10),
                                 lgb.log_evaluation(period=10)])

# Feature importance (split counts), smallest at the bottom of the barh chart.
importance = pd.DataFrame(model_lgb.feature_importance(), index=X_train.columns,
                          columns=["importance"]).sort_values(by="importance", ascending=True)
importance.plot.barh()

# Validation accuracy: round predicted probabilities at 0.5 into 0/1 labels.
y_pred = model_lgb.predict(X_valid, num_iteration=model_lgb.best_iteration)
from sklearn.metrics import accuracy_score
accuracy_score(y_valid, np.round(y_pred))

- Hyper Parameters

# Tuned parameters. NOTE: 'verbose_eval' is not a LightGBM parameter — it was
# a (since removed) lgb.train() keyword argument and is silently ignored when
# placed in the params dict; evaluation logging is driven by the
# log_evaluation callback instead. 'early_stopping_rounds' IS a valid params
# alias, so it stays here.
lgbm_params = {
    'learning_rate': 0.05,
    'objective': 'binary',
    'max_depth': -1,          # no depth limit; num_leaves controls complexity
    'num_leaves': 31,
    'num_iterations': 500,
    'early_stopping_rounds': 20,
    'verbose': 1,
}
lgb_train = lgb.Dataset(X_train, y_train, categorical_feature=categories)
lgb_eval = lgb.Dataset(X_valid, y_valid, categorical_feature=categories, reference=lgb_train)
model_lgb = lgb.train(lgbm_params,
                      lgb_train,
                      valid_sets=lgb_eval,
                      callbacks=[lgb.log_evaluation(period=10)],
                      )
y_pred = model_lgb.predict(X_valid, num_iteration=model_lgb.best_iteration)
accuracy_score(y_valid, np.round(y_pred))

 

2) K-Fold

from sklearn.model_selection import KFold
# 3-fold cross-validation: one LightGBM model per fold, reusing lgbm_params
# and `categories` from the earlier cells.
folds = 3
kf = KFold(n_splits=folds)

models = []
# NOTE(review): KFold defaults to shuffle=False, so folds follow row order.
for train_index, val_index in kf.split(train_X):
  X_train = train_X.iloc[train_index]
  X_valid = train_X.iloc[val_index]
  y_train = train_Y.iloc[train_index]
  y_valid = train_Y.iloc[val_index]

  lgb_train = lgb.Dataset(X_train, y_train, categorical_feature=categories)
  lgb_eval = lgb.Dataset(X_valid, y_valid, categorical_feature=categories, reference=lgb_train)

  model_lgb = lgb.train(lgbm_params,
                      lgb_train,
                      valid_sets=lgb_eval,
                      )
  
  # Per-fold validation accuracy.
  y_pred = model_lgb.predict(X_valid, num_iteration=model_lgb.best_iteration)
  print(accuracy_score(y_valid, np.round(y_pred)))

  models.append(model_lgb)
# Ensemble: average each fold model's predicted probabilities on the test set.
preds = []

for model in models:
  pred = model.predict(test_X)
  preds.append(pred)

preds_array = np.array(preds)
preds_mean = np.mean(preds_array, axis=0)
# Threshold the averaged probability at 0.5 for the final 0/1 labels.
preds_int = (preds_mean > 0.5).astype(int)

# NOTE(review): `submission` is created elsewhere (presumably read from the
# sample submission CSV) — confirm it has the same row order as test_X.
submission["Survived"] = preds_int
submission.head(20)