Pima Indians Diabetes Prediction
Variable | Definition
---|---
Pregnancies | Number of pregnancies
Glucose | Plasma glucose level from a glucose tolerance test
BloodPressure | Blood pressure
SkinThickness | Skin fold thickness measured behind the triceps (mm)
Insulin | Serum insulin (mu U/ml)
BMI | Body mass index $(\frac{kg}{m^2})$
DiabetesPedigreeFunction | Diabetes pedigree (family history) score
Age | Age
Outcome | Class label (0 or 1)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, roc_curve
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import Binarizer
import warnings
warnings.filterwarnings(action='ignore')
diabetes_data = pd.read_csv('diabetes.csv')
diabetes_data.head()
diabetes_data.Outcome.value_counts()
diabetes_data.info()
- Glucose, BloodPressure, SkinThickness, Insulin, and BMI cannot legitimately be 0, so zero values indicate missing measurements.
zero_features = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
plt.figure(figsize=(10, 8))
for i, feature in enumerate(zero_features):
    plt.subplot(3, 2, i + 1)
    plt.hist(diabetes_data[feature])
    plt.title(feature)
diabetes_data[diabetes_data['Glucose'] == 0]['Glucose'].count()
for feature in zero_features:
    zero_count = diabetes_data[diabetes_data[feature] == 0][feature].count()
    print('{0}: zero count = {1}, percentage = {2:.2f}%'.format(
        feature, zero_count, 100 * zero_count / diabetes_data[feature].count()))
Dropping every row where SkinThickness or Insulin is 0 would discard too much data, so the zero values are replaced with the feature means instead.
mean_zero_features = diabetes_data[zero_features].mean()
diabetes_data[zero_features] = diabetes_data[zero_features].replace(0, mean_zero_features)
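Note that `mean()` above is computed over all rows, zeros included, which drags the replacement value down. A minimal sketch of an alternative (not part of the original flow) that treats zeros as missing and fills them with each column's mean over nonzero rows only:
# Sketch (alternative to the two lines above): treat zeros as NaN, then fill with nonzero means
masked = diabetes_data[zero_features].replace(0, np.nan)
diabetes_data[zero_features] = masked.fillna(masked.mean())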
def precision_recall_curve_plot(y_test, pred_proba_c1):
    # Plot precision and recall as functions of the decision threshold
    precisions, recalls, thresholds = precision_recall_curve(y_test, pred_proba_c1)
    plt.figure(figsize=(8, 6))
    threshold_boundary = thresholds.shape[0]
    plt.plot(thresholds, precisions[0:threshold_boundary], linestyle='-', label='precision')
    plt.plot(thresholds, recalls[0:threshold_boundary], label='recall')
    start, end = plt.xlim()
    plt.xticks(np.round(np.arange(start, end, 0.1), 2))
    plt.xlabel('Threshold value'); plt.ylabel('Precision and Recall value')
    plt.legend(); plt.grid()
    plt.show()
def get_clf_eval(y_test, pred=None, pred_proba=None):  # model evaluation helper
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    roc_auc = roc_auc_score(y_test, pred_proba)
    print('Confusion matrix')
    print(confusion)
    print('Accuracy: {0:.3f}, Precision: {1:.3f}, Recall: {2:.3f}, F1: {3:.3f}, AUC: {4:.3f}'.format(
        accuracy, precision, recall, f1, roc_auc))
def get_eval_by_threshold(y_test, pred_proba_c1, thresholds):
    # Evaluate the classifier at each candidate decision threshold
    for custom_threshold in thresholds:
        binarizer = Binarizer(threshold=custom_threshold).fit(pred_proba_c1)
        custom_predict = binarizer.transform(pred_proba_c1)
        print('-------------------------------------------------------------------------')
        print('Threshold:', round(custom_threshold, 2))
        get_clf_eval(y_test, custom_predict, pred_proba_c1)
feature_name = diabetes_data.columns[:-1]
target_name = diabetes_data.columns[-1]
X = diabetes_data.loc[:, feature_name]
y = diabetes_data.loc[:, target_name]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 156, stratify=y)
lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)
pred = lr_clf.predict(X_test)
pred_proba = lr_clf.predict_proba(X_test)[:, 1]
get_clf_eval(y_test, pred, pred_proba)
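StandardScaler is imported at the top but never applied. Logistic regression typically converges more reliably on standardized features, so here is a hedged sketch of how scaling could be slotted in (the `*_scaled` and `lr_scaled` names are introduced here for illustration only; metrics will differ somewhat from the unscaled run above):
# Sketch: standardize features before fitting (optional; not used in the run above)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # fit the scaler on the training split only
X_test_scaled = scaler.transform(X_test)        # reuse the training statistics on the test split
lr_scaled = LogisticRegression()
lr_scaled.fit(X_train_scaled, y_train)
get_clf_eval(y_test, lr_scaled.predict(X_test_scaled),
             lr_scaled.predict_proba(X_test_scaled)[:, 1])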
pred_proba_c1 = lr_clf.predict_proba(X_test)[:, 1]
precision_recall_curve_plot(y_test, pred_proba_c1)
thresholds = np.arange(0.3, 0.5, 0.03)
pred_proba = lr_clf.predict_proba(X_test)
get_eval_by_threshold(y_test, pred_proba[:, 1].reshape(-1,1), thresholds)
binarizer = Binarizer(threshold=0.48)
pred_th_048 = binarizer.fit_transform(pred_proba[:, 1].reshape(-1,1))
get_clf_eval(y_test, pred_th_048, pred_proba[:, 1])
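roc_curve is imported at the top but never used; as a final illustration, a short sketch of plotting the ROC curve for the same positive-class probabilities:
# Sketch: ROC curve for the positive-class probabilities (uses the roc_curve import above)
fprs, tprs, _ = roc_curve(y_test, pred_proba[:, 1])
plt.figure(figsize=(8, 6))
plt.plot(fprs, tprs, label='ROC')
plt.plot([0, 1], [0, 1], linestyle='--', label='Random')
plt.xlabel('FPR'); plt.ylabel('TPR')
plt.legend(); plt.grid()
plt.show()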