머신러닝

[ 머신러닝 ] lightGBM

예진또이(애덤스미스 아님) 2023. 9. 11. 18:14
728x90

1. credit 데이터셋 알아보기

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# csv 파일이 구글 드라이브에 있다는 가정
credit_df = pd.read_csv('/content/drive/MyDrive/8. 머신러닝 딥러닝/credit.csv')
credit_df

# 결과

pd.set_option('display.max_columns', 50)
credit_df.head()

# 결과

credit_df.info()
-------------------------------------------
# 결과
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12500 entries, 0 to 12499
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   ID                        12500 non-null  object 
 1   Customer_ID               12500 non-null  object 
 2   Name                      11273 non-null  object 
 3   Age                       12500 non-null  object 
 4   SSN                       12500 non-null  object 
 5   Occupation                12500 non-null  object 
 6   Annual_Income             12500 non-null  object 
 7   Num_Bank_Accounts         12500 non-null  int64  
 8   Num_Credit_Card           12500 non-null  int64  
 9   Interest_Rate             12500 non-null  int64  
 10  Num_of_Loan               12500 non-null  object 
 11  Type_of_Loan              11074 non-null  object 
 12  Delay_from_due_date       12500 non-null  int64  
 13  Num_of_Delayed_Payment    11657 non-null  object 
 14  Num_Credit_Inquiries      12264 non-null  float64
 15  Outstanding_Debt          12500 non-null  object 
 16  Credit_Utilization_Ratio  12500 non-null  float64
 17  Credit_History_Age        11387 non-null  object 
 18  Payment_of_Min_Amount     12500 non-null  object 
 19  Total_EMI_per_month       12500 non-null  float64
 20  Amount_invested_monthly   11935 non-null  object 
 21  Payment_Behaviour         12500 non-null  object 
 22  Monthly_Balance           12366 non-null  float64
 23  Credit_Score              12500 non-null  object 
dtypes: float64(4), int64(4), object(16)
memory usage: 2.3+ MB

# 부가설명

credit_df.drop(['ID', 'Customer_ID', 'Name', 'SSN'], axis=1, inplace=True)
credit_df.info()
----------------------------------------------------------------------------
# 결과
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12500 entries, 0 to 12499
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       12500 non-null  object 
 1   Occupation                12500 non-null  object 
 2   Annual_Income             12500 non-null  object 
 3   Num_Bank_Accounts         12500 non-null  int64  
 4   Num_Credit_Card           12500 non-null  int64  
 5   Interest_Rate             12500 non-null  int64  
 6   Num_of_Loan               12500 non-null  object 
 7   Type_of_Loan              11074 non-null  object 
 8   Delay_from_due_date       12500 non-null  int64  
 9   Num_of_Delayed_Payment    11657 non-null  object 
 10  Num_Credit_Inquiries      12264 non-null  float64
 11  Outstanding_Debt          12500 non-null  object 
 12  Credit_Utilization_Ratio  12500 non-null  float64
 13  Credit_History_Age        11387 non-null  object 
 14  Payment_of_Min_Amount     12500 non-null  object 
 15  Total_EMI_per_month       12500 non-null  float64
 16  Amount_invested_monthly   11935 non-null  object 
 17  Payment_Behaviour         12500 non-null  object 
 18  Monthly_Balance           12366 non-null  float64
 19  Credit_Score              12500 non-null  object 
dtypes: float64(4), int64(4), object(12)
memory usage: 1.9+ MB
credit_df['Credit_Score'].value_counts()
------------------------------------------
# 결과
Standard    6943
Poor        3582
Good        1975
Name: Credit_Score, dtype: int64
credit_df['Credit_Score'] = credit_df['Credit_Score'].replace({'Poor':0, 'Standard':1, 'Good':2})
credit_df.head()

# 결과

credit_df.describe()

# 결과

sns.barplot(x='Payment_of_Min_Amount', y='Credit_Score', data=credit_df)

# 결과

plt.figure(figsize=(20, 5))
sns.barplot(x='Occupation', y='Credit_Score', data=credit_df)

# 결과

# corr(): returns the pairwise correlation coefficient between columns
# (available methods: Pearson, Kendall tau, Spearman).
# numeric_only=True restricts the matrix to numeric columns: the frame still
# holds object (string) columns at this point, and pandas >= 2.0 raises on
# them instead of silently dropping them.
plt.figure(figsize=(12, 12))
sns.heatmap(credit_df.corr(numeric_only=True), cmap='coolwarm', vmin=-1, vmax=1, annot=True)

# 결과

credit_df.info()
------------------------------------------
# 결과
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12500 entries, 0 to 12499
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       12500 non-null  object 
 1   Occupation                12500 non-null  object 
 2   Annual_Income             12500 non-null  object 
 3   Num_Bank_Accounts         12500 non-null  int64  
 4   Num_Credit_Card           12500 non-null  int64  
 5   Interest_Rate             12500 non-null  int64  
 6   Num_of_Loan               12500 non-null  object 
 7   Type_of_Loan              11074 non-null  object 
 8   Delay_from_due_date       12500 non-null  int64  
 9   Num_of_Delayed_Payment    11657 non-null  object 
 10  Num_Credit_Inquiries      12264 non-null  float64
 11  Outstanding_Debt          12500 non-null  object 
 12  Credit_Utilization_Ratio  12500 non-null  float64
 13  Credit_History_Age        11387 non-null  object 
 14  Payment_of_Min_Amount     12500 non-null  object 
 15  Total_EMI_per_month       12500 non-null  float64
 16  Amount_invested_monthly   11935 non-null  object 
 17  Payment_Behaviour         12500 non-null  object 
 18  Monthly_Balance           12366 non-null  float64
 19  Credit_Score              12500 non-null  int64  
dtypes: float64(4), int64(5), object(11)
memory usage: 1.9+ MB
# Print the name of every column that is still stored with object dtype.
for object_col in credit_df.select_dtypes(include='object').columns:
    print(object_col)
        
---------------------------------------
# 결과
Age
Occupation
Annual_Income
Num_of_Loan
Type_of_Loan
Num_of_Delayed_Payment
Outstanding_Debt
Credit_History_Age
Payment_of_Min_Amount
Amount_invested_monthly
Payment_Behaviour
-------------------------------------------
credit_df.head()

# 결과

# Columns that hold numbers as strings polluted with stray underscores.
underscore_polluted_cols = [
    'Age',
    'Annual_Income',
    'Num_of_Loan',
    'Num_of_Delayed_Payment',
    'Outstanding_Debt',
    'Amount_invested_monthly',
]
for col_name in underscore_polluted_cols:
    # Remove the underscores first, then parse what remains as a number.
    without_underscores = credit_df[col_name].str.replace('_', '')
    credit_df[col_name] = pd.to_numeric(without_underscores)
    
credit_df.info()
---------------------------------------------------------------------------------------------------------------------------
# 결과
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12500 entries, 0 to 12499
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       12500 non-null  int64  
 1   Occupation                12500 non-null  object 
 2   Annual_Income             12500 non-null  float64
 3   Num_Bank_Accounts         12500 non-null  int64  
 4   Num_Credit_Card           12500 non-null  int64  
 5   Interest_Rate             12500 non-null  int64  
 6   Num_of_Loan               12500 non-null  int64  
 7   Type_of_Loan              11074 non-null  object 
 8   Delay_from_due_date       12500 non-null  int64  
 9   Num_of_Delayed_Payment    11657 non-null  float64
 10  Num_Credit_Inquiries      12264 non-null  float64
 11  Outstanding_Debt          12500 non-null  float64
 12  Credit_Utilization_Ratio  12500 non-null  float64
 13  Credit_History_Age        11387 non-null  object 
 14  Payment_of_Min_Amount     12500 non-null  object 
 15  Total_EMI_per_month       12500 non-null  float64
 16  Amount_invested_monthly   11935 non-null  float64
 17  Payment_Behaviour         12500 non-null  object 
 18  Monthly_Balance           12366 non-null  float64
 19  Credit_Score              12500 non-null  int64  
dtypes: float64(8), int64(7), object(5)
memory usage: 1.9+ MB
# Convert Credit_History_Age to a number of months,
# e.g. "22 Years and 1 Months" -> 22 * 12 + 1.
# The " Months" suffix is stripped and the string is split on " Years and "
# once; both parts are then reused (the same split used to be computed twice).
history_parts = (
    credit_df['Credit_History_Age']
    .str.replace(' Months', '', regex=False)
    .str.split(' Years and ', expand=True)
)
# Column 0 holds the years, column 1 the remaining months.
credit_df['Credit_History_Age'] = (
    pd.to_numeric(history_parts[0]) * 12 + pd.to_numeric(history_parts[1])
)

credit_df.head()

# 결과

credit_df.describe()

# 결과

credit_df[credit_df['Age'] < 0]

# 결과

credit_df = credit_df[credit_df['Age'] >= 0]
credit_df.sort_values('Age').tail(30)

# 결과

sns.boxplot(y=credit_df['Age'])

# 결과

credit_df[credit_df['Age'] > 100].sort_values('Age')

# 결과

credit_df = credit_df[credit_df['Age'] < 120]
credit_df.describe()

# 결과

len(credit_df[credit_df['Num_Bank_Accounts'] > 30]) / len(credit_df)
--------------------------------------------------------------------
# 결과
0.013029853207982847

---------------------------------------------------------------------
credit_df = credit_df[credit_df['Num_Bank_Accounts'] <= 10]
credit_df.describe()

# 결과

len(credit_df[credit_df['Num_Credit_Card'] > 10]) / len(credit_df)
--------------------------------------------------------------------
# 결과
0.022142379679144383

--------------------------------------------------------------------
credit_df = credit_df[credit_df['Num_Credit_Card'] <= 10]
credit_df.describe()

# 결과

credit_df = credit_df[credit_df['Interest_Rate'] <= 40]
credit_df.describe()

# 결과

len(credit_df[credit_df['Num_of_Loan'] > 10]) / len(credit_df)
--------------------------------------------------------------
# 결과
0.005310350831374598

-----------------------------------------------------------------------------------------
credit_df = credit_df[(credit_df['Num_of_Loan'] <= 10) & (credit_df['Num_of_Loan'] >= 0)]
credit_df.describe()

# 결과

credit_df = credit_df[credit_df['Delay_from_due_date'] >= 0]
len(credit_df[credit_df['Num_of_Delayed_Payment'] > 40]) / len(credit_df)
---------------------------------------------------------------------------
# 결과
0.007340122947059363

---------------------------------------------------------------------------
# Keep only rows whose delayed-payment count lies in [0, 30].
# NOTE: comparisons against NaN evaluate to False, so rows where
# Num_of_Delayed_Payment is missing are removed by this filter as well.
# NOTE(review): the share printed just above was measured for values > 40,
# while the cut-off applied here is 30 — confirm which threshold is intended.
credit_df = credit_df[(credit_df['Num_of_Delayed_Payment'] <= 30) & (credit_df['Num_of_Delayed_Payment'] >= 0)]
credit_df.describe()

# 결과

# credit_df is the result of a chain of boolean-mask filters, so assigning a
# column on it directly triggers the SettingWithCopyWarning recorded below.
# Taking an explicit copy first makes the frame independent of its source.
credit_df = credit_df.copy()
# Treat a missing inquiry count as zero inquiries.
credit_df['Num_Credit_Inquiries'] = credit_df['Num_Credit_Inquiries'].fillna(0)
---------------------------------------------------------------------------------
# 결과
<ipython-input-49-17ca6241ab57>:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  credit_df['Num_Credit_Inquiries'] = credit_df['Num_Credit_Inquiries'].fillna(0)
  
---------------------------------------------------------------------------------------------------------------------------------------------
credit_df.info()
---------------------------------------------------------------------------------------------------------------------------------------------
# 결과
<class 'pandas.core.frame.DataFrame'>
Int64Index: 10002 entries, 0 to 12498
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       10002 non-null  int64  
 1   Occupation                10002 non-null  object 
 2   Annual_Income             10002 non-null  float64
 3   Num_Bank_Accounts         10002 non-null  int64  
 4   Num_Credit_Card           10002 non-null  int64  
 5   Interest_Rate             10002 non-null  int64  
 6   Num_of_Loan               10002 non-null  int64  
 7   Type_of_Loan              8893 non-null   object 
 8   Delay_from_due_date       10002 non-null  int64  
 9   Num_of_Delayed_Payment    10002 non-null  float64
 10  Num_Credit_Inquiries      10002 non-null  float64
 11  Outstanding_Debt          10002 non-null  float64
 12  Credit_Utilization_Ratio  10002 non-null  float64
 13  Credit_History_Age        9104 non-null   float64
 14  Payment_of_Min_Amount     10002 non-null  object 
 15  Total_EMI_per_month       10002 non-null  float64
 16  Amount_invested_monthly   9547 non-null   float64
 17  Payment_Behaviour         10002 non-null  object 
 18  Monthly_Balance           9893 non-null   float64
 19  Credit_Score              10002 non-null  int64  
dtypes: float64(9), int64(7), object(4)
memory usage: 1.6+ MB

------------------------------------------------------------------------------------------
credit_df.isna().sum()
------------------------------------------------------------------------------------------
# 결과
Age                            0
Occupation                     0
Annual_Income                  0
Num_Bank_Accounts              0
Num_Credit_Card                0
Interest_Rate                  0
Num_of_Loan                    0
Type_of_Loan                1109
Delay_from_due_date            0
Num_of_Delayed_Payment         0
Num_Credit_Inquiries           0
Outstanding_Debt               0
Credit_Utilization_Ratio       0
Credit_History_Age           898
Payment_of_Min_Amount          0
Total_EMI_per_month            0
Amount_invested_monthly      455
Payment_Behaviour              0
Monthly_Balance              109
Credit_Score                   0
dtype: int64
credit_df.head()

# 결과

sns.displot(credit_df['Credit_History_Age'])

# 결과

sns.displot(credit_df['Amount_invested_monthly'])

# 결과

sns.displot(credit_df['Monthly_Balance'])

# 결과

# Fill the remaining numeric gaps with each column's median.
# numeric_only=True is passed explicitly: the frame still contains object
# columns, and relying on the implicit default produces the FutureWarning
# recorded below (and fails on newer pandas). Object columns are left
# untouched, exactly as before.
credit_df = credit_df.fillna(credit_df.median(numeric_only=True))
------------------------------------------------------------------------
# 결과
<ipython-input-56-98962993b203>:1: FutureWarning: The default value of numeric_only in DataFrame.median is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning.
  credit_df = credit_df.fillna(credit_df.median())
credit_df.isna().sum()
--------------------------------------
# 결과
Age                            0
Occupation                     0
Annual_Income                  0
Num_Bank_Accounts              0
Num_Credit_Card                0
Interest_Rate                  0
Num_of_Loan                    0
Type_of_Loan                1109
Delay_from_due_date            0
Num_of_Delayed_Payment         0
Num_Credit_Inquiries           0
Outstanding_Debt               0
Credit_Utilization_Ratio       0
Credit_History_Age             0
Payment_of_Min_Amount          0
Total_EMI_per_month            0
Amount_invested_monthly        0
Payment_Behaviour              0
Monthly_Balance                0
Credit_Score                   0
dtype: int64
credit_df.head()

# 결과

credit_df['Type_of_Loan'] = credit_df['Type_of_Loan'].fillna('No Loan')
credit_df.isna().sum()
-----------------------------------------------------------------------
# 결과
Age                         0
Occupation                  0
Annual_Income               0
Num_Bank_Accounts           0
Num_Credit_Card             0
Interest_Rate               0
Num_of_Loan                 0
Type_of_Loan                0
Delay_from_due_date         0
Num_of_Delayed_Payment      0
Num_Credit_Inquiries        0
Outstanding_Debt            0
Credit_Utilization_Ratio    0
Credit_History_Age          0
Payment_of_Min_Amount       0
Total_EMI_per_month         0
Amount_invested_monthly     0
Payment_Behaviour           0
Monthly_Balance             0
Credit_Score                0
dtype: int64

-------------------------------------------------------------------------
# Collect every distinct loan type found in the comma-separated Type_of_Loan
# strings. explode() flattens the per-row lists in a single pass, whereas
# summing Python lists re-copies the accumulated list for every row.
type_list = set(credit_df['Type_of_Loan'].str.split(', ').explode())
type_list
-------------------------------------------------------------------------
# 결과
{'Auto Loan',
 'Credit-Builder Loan',
 'Debt Consolidation Loan',
 'Home Equity Loan',
 'Mortgage Loan',
 'No Loan',
 'Not Specified',
 'Payday Loan',
 'Personal Loan',
 'Student Loan'}
 -----------------------------------------------------------------------
 # Add one 0/1 indicator column per loan type: 1 when the type name occurs as
 # a substring of the row's Type_of_Loan string, otherwise 0.
 for i in type_list:
    credit_df[i] = credit_df['Type_of_Loan'].apply(lambda x: 1 if i in x else 0)
 
 credit_df.head()

# 결과

credit_df.drop('Type_of_Loan', axis=1, inplace=True)
credit_df.info()
-----------------------------------------------------
# 결과
<class 'pandas.core.frame.DataFrame'>
Int64Index: 10002 entries, 0 to 12498
Data columns (total 29 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       10002 non-null  int64  
 1   Occupation                10002 non-null  object 
 2   Annual_Income             10002 non-null  float64
 3   Num_Bank_Accounts         10002 non-null  int64  
 4   Num_Credit_Card           10002 non-null  int64  
 5   Interest_Rate             10002 non-null  int64  
 6   Num_of_Loan               10002 non-null  int64  
 7   Delay_from_due_date       10002 non-null  int64  
 8   Num_of_Delayed_Payment    10002 non-null  float64
 9   Num_Credit_Inquiries      10002 non-null  float64
 10  Outstanding_Debt          10002 non-null  float64
 11  Credit_Utilization_Ratio  10002 non-null  float64
 12  Credit_History_Age        10002 non-null  float64
 13  Payment_of_Min_Amount     10002 non-null  object 
 14  Total_EMI_per_month       10002 non-null  float64
 15  Amount_invested_monthly   10002 non-null  float64
 16  Payment_Behaviour         10002 non-null  object 
 17  Monthly_Balance           10002 non-null  float64
 18  Credit_Score              10002 non-null  int64  
 19  Debt Consolidation Loan   10002 non-null  int64  
 20  Auto Loan                 10002 non-null  int64  
 21  Home Equity Loan          10002 non-null  int64  
 22  Payday Loan               10002 non-null  int64  
 23  Credit-Builder Loan       10002 non-null  int64  
 24  Mortgage Loan             10002 non-null  int64  
 25  Not Specified             10002 non-null  int64  
 26  Student Loan              10002 non-null  int64  
 27  Personal Loan             10002 non-null  int64  
 28  No Loan                   10002 non-null  int64  
dtypes: float64(9), int64(17), object(3)
memory usage: 2.3+ MB
--------------------------------------------------------------------

credit_df['Occupation'].value_counts()
---------------------------------------------------------------------
_______          673
Lawyer           664
Mechanic         646
Scientist        640
Engineer         640
Architect        632
Teacher          624
Developer        621
Entrepreneur     620
Media_Manager    616
Accountant       611
Doctor           608
Musician         607
Journalist       606
Manager          602
Writer           592
Name: Occupation, dtype: int64

--------------------------------------------------------------------------------
credit_df['Occupation'] = credit_df['Occupation'].replace('_______', 'Unknown')
credit_df['Occupation'].value_counts()
--------------------------------------------------------------------------------
# 결과
Unknown          673
Lawyer           664
Mechanic         646
Scientist        640
Engineer         640
Architect        632
Teacher          624
Developer        621
Entrepreneur     620
Media_Manager    616
Accountant       611
Doctor           608
Musician         607
Journalist       606
Manager          602
Writer           592
Name: Occupation, dtype: int64

-----------------------------------------------------------------------------------
credit_df['Payment_of_Min_Amount'].value_counts()
-----------------------------------------------------------------------------------
# 결과
Yes    5315
No     3489
NM     1198
Name: Payment_of_Min_Amount, dtype: int64
------------------------------------------------------------------------------------

credit_df['Payment_Behaviour'].value_counts()
------------------------------------------------------------------------------------
# 결과
Low_spent_Small_value_payments      2505
High_spent_Medium_value_payments    1794
High_spent_Large_value_payments     1453
Low_spent_Medium_value_payments     1376
High_spent_Small_value_payments     1136
Low_spent_Large_value_payments       994
!@9#%8                               744
Name: Payment_Behaviour, dtype: int64
-----------------------------------------------------------------------------------

credit_df['Payment_Behaviour'] = credit_df['Payment_Behaviour'].replace('!@9#%8', 'Unknown')
credit_df['Payment_Behaviour'].value_counts()
--------------------------------------------------------------------------------------------
# 결과
Low_spent_Small_value_payments      2505
High_spent_Medium_value_payments    1794
High_spent_Large_value_payments     1453
Low_spent_Medium_value_payments     1376
High_spent_Small_value_payments     1136
Low_spent_Large_value_payments       994
Unknown                              744
Name: Payment_Behaviour, dtype: int64

------------------------------------------------------------------------------------------------------------
# One-hot encode the remaining categorical columns.
# `columns` is given as a list: passing a set triggers the FutureWarning
# recorded below (an error on newer pandas), and a set is unordered.
credit_df = pd.get_dummies(credit_df, columns=['Occupation', 'Payment_of_Min_Amount', 'Payment_Behaviour'])
------------------------------------------------------------------------------------------------------------
# 결과
<ipython-input-75-6ba6af3d5be1>:1: FutureWarning: Passing a set as an indexer is deprecated and will raise in a future version. Use a list instead.
  credit_df = pd.get_dummies(credit_df, columns={'Occupation', 'Payment_of_Min_Amount', 'Payment_Behaviour'})
  
credit_df.head()

# 결과

from sklearn.model_selection import train_test_split
len(credit_df)
----------------------------------------------------
# 결과
10002

-------------------------------------------------------------------------------------------------------------------------------------------------------
# Separate the predictors from the Credit_Score target, then hold out 20% of
# the rows as the test set with a fixed random_state.
features = credit_df.drop('Credit_Score', axis=1)
target = credit_df['Credit_Score']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=10
)

2. lightGBM(LGBM)

  • 트리기반 학습 알고리즘인 gradient boosting 방식의 프레임워크
  • 의사결정나무, 랜덤포레스트는 균형 트리 분할(level wise) 방식이라면, LGBM은 리프 중심 트리 분할(leaf wise)
  • GBM(Gradient Boosting Machine): 모델1로 y를 예측한 뒤, 그 오차(잔차)를 모델2가 학습하여 보완하고, 다시 남은 오차를 모델3이 학습하는 식으로 모델을 순차적으로 더해 가며 y를 예측하는 방식
  • 학습하는데 걸리는 시간이 적음(빠른 속도)
  • 메모리 사용량이 상대적으로 적은편
  • 적은 데이터셋을 사용할 경우 과적합 가능성이 매우 큼(일반적으로 데이터가 10000개 이상은 사용해야 함)
from lightgbm import LGBMClassifier
base_model = LGBMClassifier(random_state=10)
base_model.fit(X_train, y_train)

# 결과

pred1 = base_model.predict(X_test)

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

accuracy_score(y_test, pred1)
-------------------------------
# 결과
0.7351324337831084

-------------------------------
confusion_matrix(y_test, pred1)
-------------------------------
# 결과
array([[407, 148,  27],
       [145, 896,  91],
       [  3, 116, 168]])
       
print(classification_report(y_test, pred1))
--------------------------------------------------------
# 결과
              precision    recall  f1-score   support

           0       0.73      0.70      0.72       582
           1       0.77      0.79      0.78      1132
           2       0.59      0.59      0.59       287

    accuracy                           0.74      2001
   macro avg       0.70      0.69      0.69      2001
weighted avg       0.73      0.74      0.73      2001

----------------------------------------------------------

proba1 = base_model.predict_proba(X_test)
proba1
----------------------------------------------------------
# 결과
array([[6.98737764e-02, 8.38143193e-01, 9.19830307e-02],
       [8.50755391e-01, 1.49084385e-01, 1.60223719e-04],
       [2.74240729e-03, 9.96809944e-01, 4.47648956e-04],
       ...,
       [8.60948233e-01, 1.38960947e-01, 9.08196360e-05],
       [8.96972732e-01, 1.02822925e-01, 2.04342759e-04],
       [6.65766332e-02, 2.29493501e-01, 7.03929866e-01]])
-----------------------------------------------------------

roc_auc_score(y_test, proba1, multi_class='ovr')
------------------------------------------------------------
# 결과
0.8972566425279517

3. RandomizedSearchCV

  • 분류기를 결정하고 해당 분류기의 최적의 하이퍼 파라미터를 찾기 위한 방법
  • 튜닝하고 싶은 파라미터와 그 값의 범위를 지정하고, n_iter 값만큼 파라미터 조합을 무작위(Random)로 뽑아 반복적으로 학습·평가하는 방식
# n_estimators: number of trees built over the boosting rounds (default: 100);
#   larger values lengthen training and can lead to overfitting
# max_depth: maximum depth of a tree (default: -1)
# learning_rate: learning rate (default: 0.1)
# NOTE(review): a tree with at most 50 leaves cannot be deeper than 49 levels,
#   so with this num_leaves grid the max_depth values 50 and 100 can never
#   constrain a tree — confirm the intended depth range.
params = {
    'n_estimators':[100, 300, 500, 1000],
    'max_depth':[-1, 30, 50, 100],
    'num_leaves':[5, 10, 20, 50],
    'learning_rate':[0.01, 0.05, 0.1, 0.5]
}

------------------------------------------------------------------------------------------------------------------
lgbm = LGBMClassifier(random_state=10)

from sklearn.model_selection import RandomizedSearchCV

rand_lgbm = RandomizedSearchCV(lgbm, params, n_iter=30, random_state=10)

rand_lgbm.fit(X_train, y_train)
---------------------------------------------------------------------------
# 결과

rand_lgbm.cv_results_
----------------------------------------------------------------------
# 결과
{'mean_fit_time': array([ 4.5658648 ,  4.21779943,  0.22191768,  0.90757222,  1.59192586,
         2.26492696,  2.47405806,  0.32317824,  2.41929116,  2.22137256,
         0.77717876,  1.22200518, 11.07079234,  3.06908298,  0.99368539,
         1.11443849,  2.51906099,  4.51780829,  2.53231006,  4.29130096,
         0.2664588 ,  2.75201817,  1.99363923, 22.17760978,  6.7546977 ,
         4.74051032,  2.86759224,  1.2348238 ,  1.23866282,  3.96955628]),
 'std_fit_time': array([1.29296252, 3.6896416 , 0.00681215, 0.01657379, 0.94553769,
        0.9201947 , 0.94495989, 0.00780823, 0.88083762, 0.93799943,
        0.02406204, 0.92760223, 3.72010351, 3.42336473, 0.54657991,
        0.02758457, 0.93683229, 1.09257222, 0.95112682, 1.6993746 ,
        0.01278817, 2.54665735, 1.76883832, 6.74077018, 3.23025037,
        1.16056574, 0.87981611, 0.9346451 , 0.03268804, 1.28097227]),
 'mean_score_time': array([0.19462848, 0.06362696, 0.01226988, 0.06756458, 0.05720692,
        0.13102612, 0.10405111, 0.01214552, 0.15181932, 0.09941339,
        0.01983519, 0.04295268, 0.56661935, 0.07857223, 0.03669739,
        0.06129651, 0.09993787, 0.16018553, 0.16217937, 0.06546946,
        0.01431675, 0.08456359, 0.0371521 , 0.65228119, 0.26577258,
        0.27799759, 0.08685555, 0.03979492, 0.08403063, 0.19596457]),
 'std_score_time': array([0.01524097, 0.02729711, 0.00133985, 0.01082821, 0.0013098 ,
        0.00354922, 0.00516856, 0.00165941, 0.0431668 , 0.00593107,
        0.00348709, 0.00921069, 0.26958653, 0.03775454, 0.0143277 ,
        0.00348675, 0.00236711, 0.00683878, 0.05041467, 0.02363663,
        0.00155732, 0.04569411, 0.0135447 , 0.2214836 , 0.10086701,
        0.12464238, 0.00336158, 0.00205019, 0.00277391, 0.03762538]),
 'param_num_leaves': masked_array(data=[50, 20, 5, 5, 20, 5, 20, 10, 5, 50, 50, 10, 50, 20, 5,
                    20, 50, 50, 5, 50, 10, 5, 5, 50, 10, 50, 50, 10, 10,
                    10],
              mask=[False, False, False, False, False, False, False, False,
                    False, False, False, False, False, False, False, False,
                    False, False, False, False, False, False, False, False,
                    False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_n_estimators': masked_array(data=[500, 300, 100, 500, 300, 1000, 500, 100, 1000, 300,
                    100, 300, 1000, 300, 300, 300, 300, 500, 1000, 100,
                    100, 500, 300, 1000, 1000, 1000, 300, 300, 500, 1000],
              mask=[False, False, False, False, False, False, False, False,
                    False, False, False, False, False, False, False, False,
                    False, False, False, False, False, False, False, False,
                    False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_max_depth': masked_array(data=[30, 50, 30, 30, 50, 50, 30, 100, -1, 30, 50, 50, 100,
                    100, 30, 30, 30, 100, 100, 30, 30, 100, 100, 100, 100,
                    -1, 100, 30, 100, -1],
              mask=[False, False, False, False, False, False, False, False,
                    False, False, False, False, False, False, False, False,
                    False, False, False, False, False, False, False, False,
                    False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_learning_rate': masked_array(data=[0.05, 0.01, 0.1, 0.1, 0.05, 0.5, 0.01, 0.01, 0.05, 0.5,
                    0.01, 0.5, 0.05, 0.5, 0.1, 0.5, 0.05, 0.01, 0.1, 0.5,
                    0.5, 0.01, 0.5, 0.01, 0.01, 0.5, 0.01, 0.1, 0.1, 0.5],
              mask=[False, False, False, False, False, False, False, False,
                    False, False, False, False, False, False, False, False,
                    False, False, False, False, False, False, False, False,
                    False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'num_leaves': 50,
   'n_estimators': 500,
   'max_depth': 30,
   'learning_rate': 0.05},
  {'num_leaves': 20,
   'n_estimators': 300,
   'max_depth': 50,
   'learning_rate': 0.01},
  {'num_leaves': 5,
   'n_estimators': 100,
   'max_depth': 30,
   'learning_rate': 0.1},
  {'num_leaves': 5,
   'n_estimators': 500,
   'max_depth': 30,
   'learning_rate': 0.1},
  {'num_leaves': 20,
   'n_estimators': 300,
   'max_depth': 50,
   'learning_rate': 0.05},
  {'num_leaves': 5,
   'n_estimators': 1000,
   'max_depth': 50,
   'learning_rate': 0.5},
  {'num_leaves': 20,
   'n_estimators': 500,
   'max_depth': 30,
   'learning_rate': 0.01},
  {'num_leaves': 10,
   'n_estimators': 100,
   'max_depth': 100,
   'learning_rate': 0.01},
  {'num_leaves': 5,
   'n_estimators': 1000,
   'max_depth': -1,
   'learning_rate': 0.05},
  {'num_leaves': 50,
   'n_estimators': 300,
   'max_depth': 30,
   'learning_rate': 0.5},
  {'num_leaves': 50,
   'n_estimators': 100,
   'max_depth': 50,
   'learning_rate': 0.01},
  {'num_leaves': 10,
   'n_estimators': 300,
   'max_depth': 50,
   'learning_rate': 0.5},
  {'num_leaves': 50,
   'n_estimators': 1000,
   'max_depth': 100,
   'learning_rate': 0.05},
  {'num_leaves': 20,
   'n_estimators': 300,
   'max_depth': 100,
   'learning_rate': 0.5},
  {'num_leaves': 5,
   'n_estimators': 300,
   'max_depth': 30,
   'learning_rate': 0.1},
  {'num_leaves': 20,
   'n_estimators': 300,
   'max_depth': 30,
   'learning_rate': 0.5},
  {'num_leaves': 50,
   'n_estimators': 300,
   'max_depth': 30,
   'learning_rate': 0.05},
  {'num_leaves': 50,
   'n_estimators': 500,
   'max_depth': 100,
   'learning_rate': 0.01},
  {'num_leaves': 5,
   'n_estimators': 1000,
   'max_depth': 100,
   'learning_rate': 0.1},
  {'num_leaves': 50,
   'n_estimators': 100,
   'max_depth': 30,
   'learning_rate': 0.5},
  {'num_leaves': 10,
   'n_estimators': 100,
   'max_depth': 30,
   'learning_rate': 0.5},
  {'num_leaves': 5,
   'n_estimators': 500,
   'max_depth': 100,
   'learning_rate': 0.01},
  {'num_leaves': 5,
   'n_estimators': 300,
   'max_depth': 100,
   'learning_rate': 0.5},
  {'num_leaves': 50,
   'n_estimators': 1000,
   'max_depth': 100,
   'learning_rate': 0.01},
  {'num_leaves': 10,
   'n_estimators': 1000,
   'max_depth': 100,
   'learning_rate': 0.01},
  {'num_leaves': 50,
   'n_estimators': 1000,
   'max_depth': -1,
   'learning_rate': 0.5},
  {'num_leaves': 50,
   'n_estimators': 300,
   'max_depth': 100,
   'learning_rate': 0.01},
  {'num_leaves': 10,
   'n_estimators': 300,
   'max_depth': 30,
   'learning_rate': 0.1},
  {'num_leaves': 10,
   'n_estimators': 500,
   'max_depth': 100,
   'learning_rate': 0.1},
  {'num_leaves': 10,
   'n_estimators': 1000,
   'max_depth': -1,
   'learning_rate': 0.5}],
 'split0_test_score': array([0.73329169, 0.73016864, 0.7339163 , 0.73891318, 0.73454091,
        0.72517177, 0.72767021, 0.71205497, 0.73766396, 0.73266708,
        0.71767645, 0.71642723, 0.7339163 , 0.72267333, 0.74203623,
        0.72267333, 0.7339163 , 0.72891943, 0.7339163 , 0.72329794,
        0.72829482, 0.7339163 , 0.72392255, 0.73516552, 0.73079325,
        0.72829482, 0.72517177, 0.7270456 , 0.72579638, 0.72517177]),
 'split1_test_score': array([0.725   , 0.741875, 0.754375, 0.740625, 0.7325  , 0.718125,
        0.745625, 0.736875, 0.741875, 0.72125 , 0.735   , 0.728125,
        0.72875 , 0.723125, 0.741875, 0.723125, 0.7275  , 0.738125,
        0.733125, 0.725   , 0.725625, 0.76    , 0.7275  , 0.7325  ,
        0.744375, 0.720625, 0.74    , 0.73625 , 0.72125 , 0.71375 ]),
 'split2_test_score': array([0.738125, 0.7525  , 0.7425  , 0.725   , 0.7275  , 0.726875,
        0.743125, 0.719375, 0.725   , 0.735   , 0.7325  , 0.731875,
        0.736875, 0.73625 , 0.7325  , 0.73625 , 0.734375, 0.74625 ,
        0.730625, 0.73875 , 0.7325  , 0.741875, 0.735   , 0.7375  ,
        0.741875, 0.738125, 0.738125, 0.734375, 0.73875 , 0.72875 ]),
 'split3_test_score': array([0.73125 , 0.7475  , 0.735625, 0.72875 , 0.73    , 0.715625,
        0.745625, 0.726875, 0.73    , 0.7275  , 0.733125, 0.716875,
        0.734375, 0.71125 , 0.73125 , 0.71125 , 0.731875, 0.73625 ,
        0.723125, 0.731875, 0.71875 , 0.73375 , 0.721875, 0.729375,
        0.738125, 0.725   , 0.7475  , 0.72625 , 0.72125 , 0.71375 ]),
 'split4_test_score': array([0.725625, 0.731875, 0.73    , 0.72625 , 0.72625 , 0.715625,
        0.73125 , 0.728125, 0.725   , 0.713125, 0.7325  , 0.72625 ,
        0.72625 , 0.730625, 0.72875 , 0.730625, 0.7225  , 0.730625,
        0.719375, 0.72125 , 0.721875, 0.728125, 0.725625, 0.729375,
        0.740625, 0.718125, 0.735625, 0.726875, 0.720625, 0.71125 ]),
 'mean_test_score': array([0.73065834, 0.74078373, 0.73928326, 0.73190764, 0.73015818,
        0.72028435, 0.73865904, 0.72466099, 0.73190779, 0.72590842,
        0.73016029, 0.72391045, 0.73203326, 0.72478467, 0.73528225,
        0.72478467, 0.73003326, 0.73603389, 0.72803326, 0.72803459,
        0.72540896, 0.73953326, 0.72678451, 0.7327831 , 0.73915865,
        0.72603396, 0.73728435, 0.73015912, 0.72553428, 0.71853435]),
 'std_test_score': array([0.00490687, 0.00866736, 0.00856124, 0.00655385, 0.00306781,
        0.00480428, 0.00765039, 0.00840163, 0.0068052 , 0.00794967,
        0.00630883, 0.00619945, 0.00391658, 0.00843757, 0.00558123,
        0.00843757, 0.00448304, 0.00614264, 0.00576731, 0.00643807,
        0.004806  , 0.01113118, 0.00450876, 0.00320111, 0.004643  ,
        0.00698831, 0.00723753, 0.00425754, 0.00686346, 0.00703216]),
 'rank_test_score': array([13,  1,  3, 12, 16, 29,  5, 27, 11, 22, 14, 28, 10, 25,  8, 25, 17,
         7, 19, 18, 24,  2, 20,  9,  4, 21,  6, 15, 23, 30], dtype=int32)}

# 결과

rand_lgbm.best_params_
--------------------------------------------------------------------------------
# 결과
{'num_leaves': 20, 'n_estimators': 300, 'max_depth': 50, 'learning_rate': 0.01}

---------------------------------------------------------------------------------
# Rebuild the model with the best parameters reported by the randomized search.
# The keyword must be spelled n_estimators: a misspelled name is not matched
# to the tree-count parameter, so the model would keep its default number of
# trees instead of the tuned 300.
# NOTE(review): the score recorded further down presumably came from the run
# with the misspelled keyword and may differ after re-running.
lgbm = LGBMClassifier(random_state=10, num_leaves=20, n_estimators=300, max_depth=50, learning_rate=0.01)

lgbm.fit(X_train, y_train)

# 결과

# Predict class probabilities once (the call used to be duplicated) and score
# them with one-vs-rest multiclass ROC AUC.
proba = lgbm.predict_proba(X_test)
roc_auc_score(y_test, proba, multi_class='ovr')
-------------------------------------------------
# 결과
0.9011382504287848
728x90
반응형