머신러닝

[ 머신러닝 ] lightGBM

예진또이(애덤스미스 아님) 2023. 9. 11. 18:14

728x90

1. credit 데이터셋 알아보기

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Assumes the CSV file is available on the mounted Google Drive.
csv_path = '/content/drive/MyDrive/8. 머신러닝 딥러닝/credit.csv'
credit_df = pd.read_csv(csv_path)

credit_df

# 결과

pd.set_option('display.max_columns', 50)
credit_df.head()

# 결과

credit_df.info()
-------------------------------------------
# 결과
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12500 entries, 0 to 12499
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   ID                        12500 non-null  object 
 1   Customer_ID               12500 non-null  object 
 2   Name                      11273 non-null  object 
 3   Age                       12500 non-null  object 
 4   SSN                       12500 non-null  object 
 5   Occupation                12500 non-null  object 
 6   Annual_Income             12500 non-null  object 
 7   Num_Bank_Accounts         12500 non-null  int64  
 8   Num_Credit_Card           12500 non-null  int64  
 9   Interest_Rate             12500 non-null  int64  
 10  Num_of_Loan               12500 non-null  object 
 11  Type_of_Loan              11074 non-null  object 
 12  Delay_from_due_date       12500 non-null  int64  
 13  Num_of_Delayed_Payment    11657 non-null  object 
 14  Num_Credit_Inquiries      12264 non-null  float64
 15  Outstanding_Debt          12500 non-null  object 
 16  Credit_Utilization_Ratio  12500 non-null  float64
 17  Credit_History_Age        11387 non-null  object 
 18  Payment_of_Min_Amount     12500 non-null  object 
 19  Total_EMI_per_month       12500 non-null  float64
 20  Amount_invested_monthly   11935 non-null  object 
 21  Payment_Behaviour         12500 non-null  object 
 22  Monthly_Balance           12366 non-null  float64
 23  Credit_Score              12500 non-null  object 
dtypes: float64(4), int64(4), object(16)
memory usage: 2.3+ MB

# 부가설명

# Remove the four identifier columns, then re-check the schema.
identifier_cols = ['ID', 'Customer_ID', 'Name', 'SSN']
credit_df = credit_df.drop(columns=identifier_cols)
credit_df.info()
----------------------------------------------------------------------------
# 결과
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12500 entries, 0 to 12499
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       12500 non-null  object 
 1   Occupation                12500 non-null  object 
 2   Annual_Income             12500 non-null  object 
 3   Num_Bank_Accounts         12500 non-null  int64  
 4   Num_Credit_Card           12500 non-null  int64  
 5   Interest_Rate             12500 non-null  int64  
 6   Num_of_Loan               12500 non-null  object 
 7   Type_of_Loan              11074 non-null  object 
 8   Delay_from_due_date       12500 non-null  int64  
 9   Num_of_Delayed_Payment    11657 non-null  object 
 10  Num_Credit_Inquiries      12264 non-null  float64
 11  Outstanding_Debt          12500 non-null  object 
 12  Credit_Utilization_Ratio  12500 non-null  float64
 13  Credit_History_Age        11387 non-null  object 
 14  Payment_of_Min_Amount     12500 non-null  object 
 15  Total_EMI_per_month       12500 non-null  float64
 16  Amount_invested_monthly   11935 non-null  object 
 17  Payment_Behaviour         12500 non-null  object 
 18  Monthly_Balance           12366 non-null  float64
 19  Credit_Score              12500 non-null  object 
dtypes: float64(4), int64(4), object(12)
memory usage: 1.9+ MB

credit_df['Credit_Score'].value_counts()
------------------------------------------
# 결과
Standard    6943
Poor        3582
Good        1975
Name: Credit_Score, dtype: int64

# Encode the target labels as integers: Poor -> 0, Standard -> 1, Good -> 2.
score_codes = {'Poor': 0, 'Standard': 1, 'Good': 2}
credit_df['Credit_Score'] = credit_df['Credit_Score'].replace(score_codes)
credit_df.head()

# 결과

credit_df.describe()

# 결과

sns.barplot(x='Payment_of_Min_Amount', y='Credit_Score', data=credit_df)

# 결과

plt.figure(figsize=(20, 5))
sns.barplot(x='Occupation', y='Credit_Score', data=credit_df)

# 결과

# corr(): returns the pairwise correlation coefficients between columns
# (available methods: Pearson, Kendall tau, Spearman).
# numeric_only=True restricts the computation to numeric columns. Several
# columns are still object dtype at this point; relying on them being dropped
# implicitly is deprecated in pandas 1.5 and raises in pandas 2.x.
plt.figure(figsize=(12, 12))
sns.heatmap(credit_df.corr(numeric_only=True), cmap='coolwarm', vmin=-1, vmax=1, annot=True)

# 결과

credit_df.info()
------------------------------------------
# 결과
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12500 entries, 0 to 12499
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       12500 non-null  object 
 1   Occupation                12500 non-null  object 
 2   Annual_Income             12500 non-null  object 
 3   Num_Bank_Accounts         12500 non-null  int64  
 4   Num_Credit_Card           12500 non-null  int64  
 5   Interest_Rate             12500 non-null  int64  
 6   Num_of_Loan               12500 non-null  object 
 7   Type_of_Loan              11074 non-null  object 
 8   Delay_from_due_date       12500 non-null  int64  
 9   Num_of_Delayed_Payment    11657 non-null  object 
 10  Num_Credit_Inquiries      12264 non-null  float64
 11  Outstanding_Debt          12500 non-null  object 
 12  Credit_Utilization_Ratio  12500 non-null  float64
 13  Credit_History_Age        11387 non-null  object 
 14  Payment_of_Min_Amount     12500 non-null  object 
 15  Total_EMI_per_month       12500 non-null  float64
 16  Amount_invested_monthly   11935 non-null  object 
 17  Payment_Behaviour         12500 non-null  object 
 18  Monthly_Balance           12366 non-null  float64
 19  Credit_Score              12500 non-null  int64  
dtypes: float64(4), int64(5), object(11)
memory usage: 1.9+ MB

# Print the name of every column whose dtype is still object.
for col_name, col_dtype in credit_df.dtypes.items():
    if col_dtype == 'O':
        print(col_name)
        
---------------------------------------
# 결과
Age
Occupation
Annual_Income
Num_of_Loan
Type_of_Loan
Num_of_Delayed_Payment
Outstanding_Debt
Credit_History_Age
Payment_of_Min_Amount
Amount_invested_monthly
Payment_Behaviour
-------------------------------------------
credit_df.head()

# 결과

# Remove underscores from the string values of these columns, then parse
# the cleaned strings as numbers.
underscore_cols = ['Age', 'Annual_Income', 'Num_of_Loan', 'Num_of_Delayed_Payment', 'Outstanding_Debt', 'Amount_invested_monthly']
for col in underscore_cols:
    cleaned = credit_df[col].str.replace('_', '')
    credit_df[col] = pd.to_numeric(cleaned)
    
credit_df.info()
---------------------------------------------------------------------------------------------------------------------------
# 결과
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12500 entries, 0 to 12499
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       12500 non-null  int64  
 1   Occupation                12500 non-null  object 
 2   Annual_Income             12500 non-null  float64
 3   Num_Bank_Accounts         12500 non-null  int64  
 4   Num_Credit_Card           12500 non-null  int64  
 5   Interest_Rate             12500 non-null  int64  
 6   Num_of_Loan               12500 non-null  int64  
 7   Type_of_Loan              11074 non-null  object 
 8   Delay_from_due_date       12500 non-null  int64  
 9   Num_of_Delayed_Payment    11657 non-null  float64
 10  Num_Credit_Inquiries      12264 non-null  float64
 11  Outstanding_Debt          12500 non-null  float64
 12  Credit_Utilization_Ratio  12500 non-null  float64
 13  Credit_History_Age        11387 non-null  object 
 14  Payment_of_Min_Amount     12500 non-null  object 
 15  Total_EMI_per_month       12500 non-null  float64
 16  Amount_invested_monthly   11935 non-null  float64
 17  Payment_Behaviour         12500 non-null  object 
 18  Monthly_Balance           12366 non-null  float64
 19  Credit_Score              12500 non-null  int64  
dtypes: float64(8), int64(7), object(5)
memory usage: 1.9+ MB

# Convert Credit_History_Age to a number of months.
# "22 Years and 1 Months" -> 22 * 12 + 1
# Dropping the " Months" suffix first leaves "22 Years and 1".
credit_df['Credit_History_Age'] = credit_df['Credit_History_Age'].str.replace(' Months', '')
# Split once and reuse both halves (the original split the column twice):
# column 0 holds the years, column 1 the remaining months.
history_parts = credit_df['Credit_History_Age'].str.split(' Years and ', expand=True)
credit_df['Credit_History_Age'] = pd.to_numeric(history_parts[0]) * 12 + pd.to_numeric(history_parts[1])

credit_df.head()

# 결과

credit_df.describe()

# 결과

credit_df[credit_df['Age'] < 0]

# 결과

# Keep only rows with a non-negative age, then look at the 30 largest ages.
age_is_non_negative = credit_df['Age'] >= 0
credit_df = credit_df.loc[age_is_non_negative]
credit_df.sort_values('Age').tail(30)

# 결과

sns.boxplot(y=credit_df['Age'])

# 결과

credit_df[credit_df['Age'] > 100].sort_values('Age')

# 결과

# Keep only rows whose age is below 120.
age_below_limit = credit_df['Age'] < 120
credit_df = credit_df.loc[age_below_limit]
credit_df.describe()

# 결과

len(credit_df[credit_df['Num_Bank_Accounts'] > 30]) / len(credit_df)
--------------------------------------------------------------------
# 결과
0.013029853207982847

---------------------------------------------------------------------
# Keep only rows with at most 10 bank accounts.
bank_accounts_ok = credit_df['Num_Bank_Accounts'] <= 10
credit_df = credit_df.loc[bank_accounts_ok]
credit_df.describe()

# 결과

len(credit_df[credit_df['Num_Credit_Card'] > 10]) / len(credit_df)
--------------------------------------------------------------------
# 결과
0.022142379679144383

--------------------------------------------------------------------
# Keep only rows with at most 10 credit cards.
credit_cards_ok = credit_df['Num_Credit_Card'] <= 10
credit_df = credit_df.loc[credit_cards_ok]
credit_df.describe()

# 결과

# Keep only rows whose interest rate is at most 40.
interest_rate_ok = credit_df['Interest_Rate'] <= 40
credit_df = credit_df.loc[interest_rate_ok]
credit_df.describe()

# 결과

len(credit_df[credit_df['Num_of_Loan'] > 10]) / len(credit_df)
--------------------------------------------------------------
# 결과
0.005310350831374598

-----------------------------------------------------------------------------------------
# Keep only rows whose loan count lies in the closed range [0, 10].
loan_count_ok = credit_df['Num_of_Loan'].between(0, 10)
credit_df = credit_df.loc[loan_count_ok]
credit_df.describe()

# 결과

credit_df = credit_df[credit_df['Delay_from_due_date'] >= 0]
len(credit_df[credit_df['Num_of_Delayed_Payment'] > 40]) / len(credit_df)
---------------------------------------------------------------------------
# 결과
0.007340122947059363

---------------------------------------------------------------------------
# Keep only rows whose delayed-payment count lies in the closed range [0, 30].
# Rows where the value is missing fail the comparison and are dropped too.
delayed_payments_ok = credit_df['Num_of_Delayed_Payment'].between(0, 30)
credit_df = credit_df.loc[delayed_payments_ok]
credit_df.describe()

# 결과

# Treat a missing inquiry count as 0. credit_df is a filtered slice of an
# earlier frame at this point, so assigning a column on it directly emits
# pandas' SettingWithCopyWarning; assign() returns a new DataFrame instead.
credit_df = credit_df.assign(Num_Credit_Inquiries=credit_df['Num_Credit_Inquiries'].fillna(0))
---------------------------------------------------------------------------------
# 결과
<ipython-input-49-17ca6241ab57>:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  credit_df['Num_Credit_Inquiries'] = credit_df['Num_Credit_Inquiries'].fillna(0)
  
---------------------------------------------------------------------------------------------------------------------------------------------
credit_df.info()
---------------------------------------------------------------------------------------------------------------------------------------------
# 결과
<class 'pandas.core.frame.DataFrame'>
Int64Index: 10002 entries, 0 to 12498
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       10002 non-null  int64  
 1   Occupation                10002 non-null  object 
 2   Annual_Income             10002 non-null  float64
 3   Num_Bank_Accounts         10002 non-null  int64  
 4   Num_Credit_Card           10002 non-null  int64  
 5   Interest_Rate             10002 non-null  int64  
 6   Num_of_Loan               10002 non-null  int64  
 7   Type_of_Loan              8893 non-null   object 
 8   Delay_from_due_date       10002 non-null  int64  
 9   Num_of_Delayed_Payment    10002 non-null  float64
 10  Num_Credit_Inquiries      10002 non-null  float64
 11  Outstanding_Debt          10002 non-null  float64
 12  Credit_Utilization_Ratio  10002 non-null  float64
 13  Credit_History_Age        9104 non-null   float64
 14  Payment_of_Min_Amount     10002 non-null  object 
 15  Total_EMI_per_month       10002 non-null  float64
 16  Amount_invested_monthly   9547 non-null   float64
 17  Payment_Behaviour         10002 non-null  object 
 18  Monthly_Balance           9893 non-null   float64
 19  Credit_Score              10002 non-null  int64  
dtypes: float64(9), int64(7), object(4)
memory usage: 1.6+ MB

------------------------------------------------------------------------------------------
credit_df.isna().sum()
------------------------------------------------------------------------------------------
# 결과
Age                            0
Occupation                     0
Annual_Income                  0
Num_Bank_Accounts              0
Num_Credit_Card                0
Interest_Rate                  0
Num_of_Loan                    0
Type_of_Loan                1109
Delay_from_due_date            0
Num_of_Delayed_Payment         0
Num_Credit_Inquiries           0
Outstanding_Debt               0
Credit_Utilization_Ratio       0
Credit_History_Age           898
Payment_of_Min_Amount          0
Total_EMI_per_month            0
Amount_invested_monthly      455
Payment_Behaviour              0
Monthly_Balance              109
Credit_Score                   0
dtype: int64

credit_df.head()

# 결과

sns.displot(credit_df['Credit_History_Age'])

# 결과

sns.displot(credit_df['Amount_invested_monthly'])

# 결과

sns.displot(credit_df['Monthly_Balance'])

# 결과

# Fill the remaining numeric gaps with each column's median.
# numeric_only=True is required because the frame still contains object
# columns: computing median() over them is deprecated in pandas 1.5 and
# raises in pandas 2.x. Non-numeric columns are left untouched here.
credit_df = credit_df.fillna(credit_df.median(numeric_only=True))
------------------------------------------------------------------------
# 결과
<ipython-input-56-98962993b203>:1: FutureWarning: The default value of numeric_only in DataFrame.median is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning.
  credit_df = credit_df.fillna(credit_df.median())

credit_df.isna().sum()
--------------------------------------
# 결과
Age                            0
Occupation                     0
Annual_Income                  0
Num_Bank_Accounts              0
Num_Credit_Card                0
Interest_Rate                  0
Num_of_Loan                    0
Type_of_Loan                1109
Delay_from_due_date            0
Num_of_Delayed_Payment         0
Num_Credit_Inquiries           0
Outstanding_Debt               0
Credit_Utilization_Ratio       0
Credit_History_Age             0
Payment_of_Min_Amount          0
Total_EMI_per_month            0
Amount_invested_monthly        0
Payment_Behaviour              0
Monthly_Balance                0
Credit_Score                   0
dtype: int64

credit_df.head()

# 결과

# A missing Type_of_Loan is recorded as the explicit category 'No Loan'.
loan_types_filled = credit_df['Type_of_Loan'].fillna(value='No Loan')
credit_df['Type_of_Loan'] = loan_types_filled
credit_df.isna().sum()
-----------------------------------------------------------------------
# 결과
Age                         0
Occupation                  0
Annual_Income               0
Num_Bank_Accounts           0
Num_Credit_Card             0
Interest_Rate               0
Num_of_Loan                 0
Type_of_Loan                0
Delay_from_due_date         0
Num_of_Delayed_Payment      0
Num_Credit_Inquiries        0
Outstanding_Debt            0
Credit_Utilization_Ratio    0
Credit_History_Age          0
Payment_of_Min_Amount       0
Total_EMI_per_month         0
Amount_invested_monthly     0
Payment_Behaviour           0
Monthly_Balance             0
Credit_Score                0
dtype: int64

-------------------------------------------------------------------------
# Collect the distinct loan type names. Each Type_of_Loan value is a
# ', '-separated string, so split every row and gather the pieces in a set.
# A set comprehension avoids Series.sum() over lists, which concatenates the
# per-row lists one by one and is quadratic in the number of rows.
type_list = {
    loan_type
    for loan_types in credit_df['Type_of_Loan'].str.split(', ')
    for loan_type in loan_types
}
type_list
-------------------------------------------------------------------------
# 결과
{'Auto Loan',
 'Credit-Builder Loan',
 'Debt Consolidation Loan',
 'Home Equity Loan',
 'Mortgage Loan',
 'No Loan',
 'Not Specified',
 'Payday Loan',
 'Personal Loan',
 'Student Loan'}
 -----------------------------------------------------------------------
 for i in type_list:
    credit_df[i] = credit_df['Type_of_Loan'].apply(lambda x: 1 if i in x else 0)
 
 credit_df.head()

# 결과

# The raw Type_of_Loan string column is dropped here, then the schema is re-checked.
credit_df = credit_df.drop(columns='Type_of_Loan')
credit_df.info()
-----------------------------------------------------
# 결과
<class 'pandas.core.frame.DataFrame'>
Int64Index: 10002 entries, 0 to 12498
Data columns (total 29 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       10002 non-null  int64  
 1   Occupation                10002 non-null  object 
 2   Annual_Income             10002 non-null  float64
 3   Num_Bank_Accounts         10002 non-null  int64  
 4   Num_Credit_Card           10002 non-null  int64  
 5   Interest_Rate             10002 non-null  int64  
 6   Num_of_Loan               10002 non-null  int64  
 7   Delay_from_due_date       10002 non-null  int64  
 8   Num_of_Delayed_Payment    10002 non-null  float64
 9   Num_Credit_Inquiries      10002 non-null  float64
 10  Outstanding_Debt          10002 non-null  float64
 11  Credit_Utilization_Ratio  10002 non-null  float64
 12  Credit_History_Age        10002 non-null  float64
 13  Payment_of_Min_Amount     10002 non-null  object 
 14  Total_EMI_per_month       10002 non-null  float64
 15  Amount_invested_monthly   10002 non-null  float64
 16  Payment_Behaviour         10002 non-null  object 
 17  Monthly_Balance           10002 non-null  float64
 18  Credit_Score              10002 non-null  int64  
 19  Debt Consolidation Loan   10002 non-null  int64  
 20  Auto Loan                 10002 non-null  int64  
 21  Home Equity Loan          10002 non-null  int64  
 22  Payday Loan               10002 non-null  int64  
 23  Credit-Builder Loan       10002 non-null  int64  
 24  Mortgage Loan             10002 non-null  int64  
 25  Not Specified             10002 non-null  int64  
 26  Student Loan              10002 non-null  int64  
 27  Personal Loan             10002 non-null  int64  
 28  No Loan                   10002 non-null  int64  
dtypes: float64(9), int64(17), object(3)
memory usage: 2.3+ MB
--------------------------------------------------------------------

credit_df['Occupation'].value_counts()
---------------------------------------------------------------------
_______          673
Lawyer           664
Mechanic         646
Scientist        640
Engineer         640
Architect        632
Teacher          624
Developer        621
Entrepreneur     620
Media_Manager    616
Accountant       611
Doctor           608
Musician         607
Journalist       606
Manager          602
Writer           592
Name: Occupation, dtype: int64

--------------------------------------------------------------------------------
# Relabel the underscore-only placeholder occupation as 'Unknown'.
occupation_fixes = {'_______': 'Unknown'}
credit_df['Occupation'] = credit_df['Occupation'].replace(occupation_fixes)
credit_df['Occupation'].value_counts()
--------------------------------------------------------------------------------
# 결과
Unknown          673
Lawyer           664
Mechanic         646
Scientist        640
Engineer         640
Architect        632
Teacher          624
Developer        621
Entrepreneur     620
Media_Manager    616
Accountant       611
Doctor           608
Musician         607
Journalist       606
Manager          602
Writer           592
Name: Occupation, dtype: int64

-----------------------------------------------------------------------------------
credit_df['Payment_of_Min_Amount'].value_counts()
-----------------------------------------------------------------------------------
# 결과
Yes    5315
No     3489
NM     1198
Name: Payment_of_Min_Amount, dtype: int64
------------------------------------------------------------------------------------

credit_df['Payment_Behaviour'].value_counts()
------------------------------------------------------------------------------------
# 결과
Low_spent_Small_value_payments      2505
High_spent_Medium_value_payments    1794
High_spent_Large_value_payments     1453
Low_spent_Medium_value_payments     1376
High_spent_Small_value_payments     1136
Low_spent_Large_value_payments       994
!@9#%8                               744
Name: Payment_Behaviour, dtype: int64
-----------------------------------------------------------------------------------

# Relabel the garbage token in Payment_Behaviour as 'Unknown'.
behaviour_fixes = {'!@9#%8': 'Unknown'}
credit_df['Payment_Behaviour'] = credit_df['Payment_Behaviour'].replace(behaviour_fixes)
credit_df['Payment_Behaviour'].value_counts()
--------------------------------------------------------------------------------------------
# 결과
Low_spent_Small_value_payments      2505
High_spent_Medium_value_payments    1794
High_spent_Large_value_payments     1453
Low_spent_Medium_value_payments     1376
High_spent_Small_value_payments     1136
Low_spent_Large_value_payments       994
Unknown                              744
Name: Payment_Behaviour, dtype: int64

------------------------------------------------------------------------------------------------------------
# One-hot encode the remaining categorical columns.
# `columns` must be a list: passing a set is deprecated (pandas warns that it
# will raise in a future version), and a set has no defined order, so the
# order of the generated dummy columns could differ between runs.
credit_df = pd.get_dummies(credit_df, columns=['Occupation', 'Payment_of_Min_Amount', 'Payment_Behaviour'])
------------------------------------------------------------------------------------------------------------
# 결과
<ipython-input-75-6ba6af3d5be1>:1: FutureWarning: Passing a set as an indexer is deprecated and will raise in a future version. Use a list instead.
  credit_df = pd.get_dummies(credit_df, columns={'Occupation', 'Payment_of_Min_Amount', 'Payment_Behaviour'})
  
credit_df.head()

# 결과

from sklearn.model_selection import train_test_split
len(credit_df)
----------------------------------------------------
# 결과
10002

-------------------------------------------------------------------------------------------------------------------------------------------------------
# Hold out 20% of the rows for testing; Credit_Score is the target and every
# other column is a feature. random_state fixes the shuffle.
features = credit_df.drop('Credit_Score', axis=1)
target = credit_df['Credit_Score']
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=10)

2. lightGBM(LGBM)

트리기반 학습 알고리즘인 gradient boosting 방식의 프레임워크
의사결정나무, 랜덤포레스트는 균형 트리 분할(level wise) 방식이라면, LGBM은 리프 중심 트리 분할(leaf wise)
GBM(Gradient Boosting): 모델1로 y를 예측한 뒤, 모델1이 남긴 오차(잔차)를 모델2가 학습하고, 다시 남은 오차를 모델3이 학습하는 식으로 모델을 순차적으로 더해 가며 예측을 보정하는 방식
학습하는데 걸리는 시간이 적음(빠른 속도)
메모리 사용량이 상대적으로 적은편
적은 데이터셋을 사용할 경우 과적합 가능성이 매우 큼(일반적으로 데이터가 10000개 이상은 사용해야 함)

from lightgbm import LGBMClassifier
base_model = LGBMClassifier(random_state=10)
base_model.fit(X_train, y_train)

# 결과

pred1 = base_model.predict(X_test)

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

accuracy_score(y_test, pred1)
-------------------------------
# 결과
0.7351324337831084

-------------------------------
confusion_matrix(y_test, pred1)
-------------------------------
# 결과
array([[407, 148,  27],
       [145, 896,  91],
       [  3, 116, 168]])
       
print(classification_report(y_test, pred1))
--------------------------------------------------------
# 결과
              precision    recall  f1-score   support

           0       0.73      0.70      0.72       582
           1       0.77      0.79      0.78      1132
           2       0.59      0.59      0.59       287

    accuracy                           0.74      2001
   macro avg       0.70      0.69      0.69      2001
weighted avg       0.73      0.74      0.73      2001

----------------------------------------------------------

proba1 = base_model.predict_proba(X_test)
proba1
----------------------------------------------------------
# 결과
array([[6.98737764e-02, 8.38143193e-01, 9.19830307e-02],
       [8.50755391e-01, 1.49084385e-01, 1.60223719e-04],
       [2.74240729e-03, 9.96809944e-01, 4.47648956e-04],
       ...,
       [8.60948233e-01, 1.38960947e-01, 9.08196360e-05],
       [8.96972732e-01, 1.02822925e-01, 2.04342759e-04],
       [6.65766332e-02, 2.29493501e-01, 7.03929866e-01]])
-----------------------------------------------------------

roc_auc_score(y_test, proba1, multi_class='ovr')
------------------------------------------------------------
# 결과
0.8972566425279517

3. RandomizedSearchCV

분류기를 결정하고 해당 분류기의 최적의 하이퍼 파라미터를 찾기 위한 방법
튜닝하고 싶은 파라미터를 지정하여 파라미터 값의 범위를 정하고, n_iter 값을 설정하여 그 횟수만큼 파라미터를 무작위(Random)로 조합해 반복 탐색하는 방식

# n_estimators: number of trees built in sequence (default: 100); a larger
#   value lengthens training and can lead to overfitting
# max_depth: maximum depth of a tree (default: -1)
# learning_rate: learning rate (default: 0.1)
params = dict(
    n_estimators=[100, 300, 500, 1000],
    max_depth=[-1, 30, 50, 100],
    num_leaves=[5, 10, 20, 50],
    learning_rate=[0.01, 0.05, 0.1, 0.5],
)

------------------------------------------------------------------------------------------------------------------
from sklearn.model_selection import RandomizedSearchCV

# Randomized search: try 30 randomly drawn combinations from `params` on a
# LightGBM classifier; both seeds are fixed for reproducibility.
lgbm = LGBMClassifier(random_state=10)
rand_lgbm = RandomizedSearchCV(
    estimator=lgbm,
    param_distributions=params,
    n_iter=30,
    random_state=10,
)

rand_lgbm.fit(X_train, y_train)
---------------------------------------------------------------------------
# 결과

rand_lgbm.cv_results_
----------------------------------------------------------------------
# 결과
{'mean_fit_time': array([ 4.5658648 ,  4.21779943,  0.22191768,  0.90757222,  1.59192586,
         2.26492696,  2.47405806,  0.32317824,  2.41929116,  2.22137256,
         0.77717876,  1.22200518, 11.07079234,  3.06908298,  0.99368539,
         1.11443849,  2.51906099,  4.51780829,  2.53231006,  4.29130096,
         0.2664588 ,  2.75201817,  1.99363923, 22.17760978,  6.7546977 ,
         4.74051032,  2.86759224,  1.2348238 ,  1.23866282,  3.96955628]),
 'std_fit_time': array([1.29296252, 3.6896416 , 0.00681215, 0.01657379, 0.94553769,
        0.9201947 , 0.94495989, 0.00780823, 0.88083762, 0.93799943,
        0.02406204, 0.92760223, 3.72010351, 3.42336473, 0.54657991,
        0.02758457, 0.93683229, 1.09257222, 0.95112682, 1.6993746 ,
        0.01278817, 2.54665735, 1.76883832, 6.74077018, 3.23025037,
        1.16056574, 0.87981611, 0.9346451 , 0.03268804, 1.28097227]),
 'mean_score_time': array([0.19462848, 0.06362696, 0.01226988, 0.06756458, 0.05720692,
        0.13102612, 0.10405111, 0.01214552, 0.15181932, 0.09941339,
        0.01983519, 0.04295268, 0.56661935, 0.07857223, 0.03669739,
        0.06129651, 0.09993787, 0.16018553, 0.16217937, 0.06546946,
        0.01431675, 0.08456359, 0.0371521 , 0.65228119, 0.26577258,
        0.27799759, 0.08685555, 0.03979492, 0.08403063, 0.19596457]),
 'std_score_time': array([0.01524097, 0.02729711, 0.00133985, 0.01082821, 0.0013098 ,
        0.00354922, 0.00516856, 0.00165941, 0.0431668 , 0.00593107,
        0.00348709, 0.00921069, 0.26958653, 0.03775454, 0.0143277 ,
        0.00348675, 0.00236711, 0.00683878, 0.05041467, 0.02363663,
        0.00155732, 0.04569411, 0.0135447 , 0.2214836 , 0.10086701,
        0.12464238, 0.00336158, 0.00205019, 0.00277391, 0.03762538]),
 'param_num_leaves': masked_array(data=[50, 20, 5, 5, 20, 5, 20, 10, 5, 50, 50, 10, 50, 20, 5,
                    20, 50, 50, 5, 50, 10, 5, 5, 50, 10, 50, 50, 10, 10,
                    10],
              mask=[False, False, False, False, False, False, False, False,
                    False, False, False, False, False, False, False, False,
                    False, False, False, False, False, False, False, False,
                    False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_n_estimators': masked_array(data=[500, 300, 100, 500, 300, 1000, 500, 100, 1000, 300,
                    100, 300, 1000, 300, 300, 300, 300, 500, 1000, 100,
                    100, 500, 300, 1000, 1000, 1000, 300, 300, 500, 1000],
              mask=[False, False, False, False, False, False, False, False,
                    False, False, False, False, False, False, False, False,
                    False, False, False, False, False, False, False, False,
                    False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_max_depth': masked_array(data=[30, 50, 30, 30, 50, 50, 30, 100, -1, 30, 50, 50, 100,
                    100, 30, 30, 30, 100, 100, 30, 30, 100, 100, 100, 100,
                    -1, 100, 30, 100, -1],
              mask=[False, False, False, False, False, False, False, False,
                    False, False, False, False, False, False, False, False,
                    False, False, False, False, False, False, False, False,
                    False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_learning_rate': masked_array(data=[0.05, 0.01, 0.1, 0.1, 0.05, 0.5, 0.01, 0.01, 0.05, 0.5,
                    0.01, 0.5, 0.05, 0.5, 0.1, 0.5, 0.05, 0.01, 0.1, 0.5,
                    0.5, 0.01, 0.5, 0.01, 0.01, 0.5, 0.01, 0.1, 0.1, 0.5],
              mask=[False, False, False, False, False, False, False, False,
                    False, False, False, False, False, False, False, False,
                    False, False, False, False, False, False, False, False,
                    False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'num_leaves': 50,
   'n_estimators': 500,
   'max_depth': 30,
   'learning_rate': 0.05},
  {'num_leaves': 20,
   'n_estimators': 300,
   'max_depth': 50,
   'learning_rate': 0.01},
  {'num_leaves': 5,
   'n_estimators': 100,
   'max_depth': 30,
   'learning_rate': 0.1},
  {'num_leaves': 5,
   'n_estimators': 500,
   'max_depth': 30,
   'learning_rate': 0.1},
  {'num_leaves': 20,
   'n_estimators': 300,
   'max_depth': 50,
   'learning_rate': 0.05},
  {'num_leaves': 5,
   'n_estimators': 1000,
   'max_depth': 50,
   'learning_rate': 0.5},
  {'num_leaves': 20,
   'n_estimators': 500,
   'max_depth': 30,
   'learning_rate': 0.01},
  {'num_leaves': 10,
   'n_estimators': 100,
   'max_depth': 100,
   'learning_rate': 0.01},
  {'num_leaves': 5,
   'n_estimators': 1000,
   'max_depth': -1,
   'learning_rate': 0.05},
  {'num_leaves': 50,
   'n_estimators': 300,
   'max_depth': 30,
   'learning_rate': 0.5},
  {'num_leaves': 50,
   'n_estimators': 100,
   'max_depth': 50,
   'learning_rate': 0.01},
  {'num_leaves': 10,
   'n_estimators': 300,
   'max_depth': 50,
   'learning_rate': 0.5},
  {'num_leaves': 50,
   'n_estimators': 1000,
   'max_depth': 100,
   'learning_rate': 0.05},
  {'num_leaves': 20,
   'n_estimators': 300,
   'max_depth': 100,
   'learning_rate': 0.5},
  {'num_leaves': 5,
   'n_estimators': 300,
   'max_depth': 30,
   'learning_rate': 0.1},
  {'num_leaves': 20,
   'n_estimators': 300,
   'max_depth': 30,
   'learning_rate': 0.5},
  {'num_leaves': 50,
   'n_estimators': 300,
   'max_depth': 30,
   'learning_rate': 0.05},
  {'num_leaves': 50,
   'n_estimators': 500,
   'max_depth': 100,
   'learning_rate': 0.01},
  {'num_leaves': 5,
   'n_estimators': 1000,
   'max_depth': 100,
   'learning_rate': 0.1},
  {'num_leaves': 50,
   'n_estimators': 100,
   'max_depth': 30,
   'learning_rate': 0.5},
  {'num_leaves': 10,
   'n_estimators': 100,
   'max_depth': 30,
   'learning_rate': 0.5},
  {'num_leaves': 5,
   'n_estimators': 500,
   'max_depth': 100,
   'learning_rate': 0.01},
  {'num_leaves': 5,
   'n_estimators': 300,
   'max_depth': 100,
   'learning_rate': 0.5},
  {'num_leaves': 50,
   'n_estimators': 1000,
   'max_depth': 100,
   'learning_rate': 0.01},
  {'num_leaves': 10,
   'n_estimators': 1000,
   'max_depth': 100,
   'learning_rate': 0.01},
  {'num_leaves': 50,
   'n_estimators': 1000,
   'max_depth': -1,
   'learning_rate': 0.5},
  {'num_leaves': 50,
   'n_estimators': 300,
   'max_depth': 100,
   'learning_rate': 0.01},
  {'num_leaves': 10,
   'n_estimators': 300,
   'max_depth': 30,
   'learning_rate': 0.1},
  {'num_leaves': 10,
   'n_estimators': 500,
   'max_depth': 100,
   'learning_rate': 0.1},
  {'num_leaves': 10,
   'n_estimators': 1000,
   'max_depth': -1,
   'learning_rate': 0.5}],
 'split0_test_score': array([0.73329169, 0.73016864, 0.7339163 , 0.73891318, 0.73454091,
        0.72517177, 0.72767021, 0.71205497, 0.73766396, 0.73266708,
        0.71767645, 0.71642723, 0.7339163 , 0.72267333, 0.74203623,
        0.72267333, 0.7339163 , 0.72891943, 0.7339163 , 0.72329794,
        0.72829482, 0.7339163 , 0.72392255, 0.73516552, 0.73079325,
        0.72829482, 0.72517177, 0.7270456 , 0.72579638, 0.72517177]),
 'split1_test_score': array([0.725   , 0.741875, 0.754375, 0.740625, 0.7325  , 0.718125,
        0.745625, 0.736875, 0.741875, 0.72125 , 0.735   , 0.728125,
        0.72875 , 0.723125, 0.741875, 0.723125, 0.7275  , 0.738125,
        0.733125, 0.725   , 0.725625, 0.76    , 0.7275  , 0.7325  ,
        0.744375, 0.720625, 0.74    , 0.73625 , 0.72125 , 0.71375 ]),
 'split2_test_score': array([0.738125, 0.7525  , 0.7425  , 0.725   , 0.7275  , 0.726875,
        0.743125, 0.719375, 0.725   , 0.735   , 0.7325  , 0.731875,
        0.736875, 0.73625 , 0.7325  , 0.73625 , 0.734375, 0.74625 ,
        0.730625, 0.73875 , 0.7325  , 0.741875, 0.735   , 0.7375  ,
        0.741875, 0.738125, 0.738125, 0.734375, 0.73875 , 0.72875 ]),
 'split3_test_score': array([0.73125 , 0.7475  , 0.735625, 0.72875 , 0.73    , 0.715625,
        0.745625, 0.726875, 0.73    , 0.7275  , 0.733125, 0.716875,
        0.734375, 0.71125 , 0.73125 , 0.71125 , 0.731875, 0.73625 ,
        0.723125, 0.731875, 0.71875 , 0.73375 , 0.721875, 0.729375,
        0.738125, 0.725   , 0.7475  , 0.72625 , 0.72125 , 0.71375 ]),
 'split4_test_score': array([0.725625, 0.731875, 0.73    , 0.72625 , 0.72625 , 0.715625,
        0.73125 , 0.728125, 0.725   , 0.713125, 0.7325  , 0.72625 ,
        0.72625 , 0.730625, 0.72875 , 0.730625, 0.7225  , 0.730625,
        0.719375, 0.72125 , 0.721875, 0.728125, 0.725625, 0.729375,
        0.740625, 0.718125, 0.735625, 0.726875, 0.720625, 0.71125 ]),
 'mean_test_score': array([0.73065834, 0.74078373, 0.73928326, 0.73190764, 0.73015818,
        0.72028435, 0.73865904, 0.72466099, 0.73190779, 0.72590842,
        0.73016029, 0.72391045, 0.73203326, 0.72478467, 0.73528225,
        0.72478467, 0.73003326, 0.73603389, 0.72803326, 0.72803459,
        0.72540896, 0.73953326, 0.72678451, 0.7327831 , 0.73915865,
        0.72603396, 0.73728435, 0.73015912, 0.72553428, 0.71853435]),
 'std_test_score': array([0.00490687, 0.00866736, 0.00856124, 0.00655385, 0.00306781,
        0.00480428, 0.00765039, 0.00840163, 0.0068052 , 0.00794967,
        0.00630883, 0.00619945, 0.00391658, 0.00843757, 0.00558123,
        0.00843757, 0.00448304, 0.00614264, 0.00576731, 0.00643807,
        0.004806  , 0.01113118, 0.00450876, 0.00320111, 0.004643  ,
        0.00698831, 0.00723753, 0.00425754, 0.00686346, 0.00703216]),
 'rank_test_score': array([13,  1,  3, 12, 16, 29,  5, 27, 11, 22, 14, 28, 10, 25,  8, 25, 17,
         7, 19, 18, 24,  2, 20,  9,  4, 21,  6, 15, 23, 30], dtype=int32)}

# 결과

rand_lgbm.best_params_
--------------------------------------------------------------------------------
# 결과
{'num_leaves': 20, 'n_estimators': 300, 'max_depth': 50, 'learning_rate': 0.01}

---------------------------------------------------------------------------------
# Rebuild the classifier with the best parameters reported by the search.
# The keyword must be spelled n_estimators: the original `n_estomators` is not
# a recognised parameter name, so the tuned tree count (300) was not applied
# (presumably leaving the default of 100 in effect).
# NOTE(review): the score printed after this cell came from the misspelled
# version, so it may change when the cell is re-run.
lgbm = LGBMClassifier(
    random_state=10,
    num_leaves=20,
    n_estimators=300,
    max_depth=50,
    learning_rate=0.01,
)

lgbm.fit(X_train, y_train)

# 결과

# Score the tuned model with one-vs-rest ROC AUC on the predicted class
# probabilities. predict_proba is called once (the original repeated the
# identical call).
proba = lgbm.predict_proba(X_test)
roc_auc_score(y_test, proba, multi_class='ovr')
-------------------------------------------------
# 결과
0.9011382504287848

728x90