가장 쉬운 LightGBM 모델 [분류, 회귀] by 바죠


가장 쉬운 LightGBM 모델 [분류, 회귀]


2012년 이후를 흔히 딥러닝의 시대라고 한다.

많은 사람들이 오해하는 것 중 하나는 딥러닝이 분류와 회귀에서 항상 가장 뛰어날 것이다.
이것은 완전한 오해이다.
딥러닝의 성능은 이미지, 글, 오디오 데이터에서만 최고의 성능을 발휘한다.
연속적으로 변화하는 자연스러운 데이터에서만 그렇다.

그 이외의 일반적인 불연속적 표 형식의 데이터에 대해서는 결정나무 (decision tree) 방식을 활용하는 LightGBM, XGBoost, Catboost 등의 기계학습 방법이 소위 딥러닝 방법보다 더 뛰어난 성능을 발휘한다.

2012년 이후 혜성처럼 등장한 딥러닝을 신봉하는 사람들 입장에서는 상당히 자존심 상하는 상황이다.
하지만, 어쩔수 없이 받아들여만 하는 사실이다.
이러한 방법들(LightGBM, XGBoost, Catboost)은  각각 hyperparameter 들을 최적화 할 경우, 보다 더 좋은 성능을 각각 발휘한다.
한 발 더 나아가서,
hyperparameter들을 기계학습 방법중 하나인 베이지안 옵티마이제이션을 활용하여 쉽게 최적화 할 수 있다.

기계학습 방법의 최고 성능을 또 다른 기계학습 방법으로 쉽게 얻어내는 것이다.
인공지능, 기계학습, 딥러닝, 그렇게 단순하지 않다.

베이지안 옵티마이제이션을 활용한 hyperparameter 최적화: http://incredible.egloos.com/7479039

LightGBM과 베이지안 옵티마이제이션을 아래와 같이 동시에 활용할 경우, scikit-learn 라이버러리 모두를 집으로 보낼 수 있다.
가능하다. 모델의 성능이 뛰어나다. 계산시간도 획기적으로 줄여준다.
---------------------------------------------------------------------------------------------------------------------

import lightgbm as lgb
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston
from sklearn.metrics import accuracy_score,confusion_matrix
import numpy as np
def lgb_evaluate(numLeaves, maxDepth, scaleWeight, minChildWeight, subsample, colSam):
    reg=lgb.LGBMRegressor(num_leaves=31, max_depth= 2,scale_pos_weight= scaleWeight, min_child_weight= minChildWeight, subsample= 0.4, colsample_bytree= 0.4, learning_rate=0.05,   n_estimators=20)
#   scores = cross_val_score(reg, train_x, train_y, cv=5, scoring='roc_auc')
    scores = cross_val_score(reg, train_x, train_y, cv=5, scoring='neg_mean_squared_error')
    return np.mean(scores)
def bayesOpt(train_x, train_y):
    lgbBO = BayesianOptimization(lgb_evaluate, {  'numLeaves':  (5, 90),  'maxDepth': (2, 90),   'scaleWeight': (1, 10000),  'minChildWeight': (0.01, 70), 'subsample': (0.4, 1), 'colSam': (0.4, 1) })
    lgbBO.maximize(init_points=5, n_iter=50)
    print(lgbBO.res)
boston = load_boston()
X, y = boston.data, boston.target
train_x, X_test, train_y, y_test = train_test_split(X, y, test_size=0.2)
bayesOpt(train_x, train_y)

---------------------------------------------------------------------------------------------------------------------

import lightgbm as lgb
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from numpy import loadtxt
from sklearn.metrics import accuracy_score,confusion_matrix
import numpy as np
def lgb_evaluate(numLeaves, maxDepth, scaleWeight, minChildWeight, subsample, colSam):
    clf = lgb.LGBMClassifier(
        objective = 'binary',
        metric= 'auc',
        reg_alpha= 0,
        reg_lambda= 2,
#       bagging_fraction= 0.999,
        min_split_gain= 0,
        min_child_samples= 10,
        subsample_freq= 3,
#       subsample_for_bin= 50000,
#       n_estimators= 9999999,
        n_estimators= 99,
        num_leaves= int(numLeaves),
        max_depth= int(maxDepth),
        scale_pos_weight= scaleWeight,
        min_child_weight= minChildWeight,
        subsample= subsample,
        colsample_bytree= colSam,
        verbose =-1)
    scores = cross_val_score(clf, train_x, train_y, cv=5, scoring='roc_auc')
    return np.mean(scores)
def bayesOpt(train_x, train_y):
    lgbBO = BayesianOptimization(lgb_evaluate, {   'numLeaves':  (5, 90),  'maxDepth': (2, 90), 'scaleWeight': (1, 10000),  'minChildWeight': (0.01, 70),  'subsample': (0.4, 1),  'colSam': (0.4, 1)   })
    lgbBO.maximize(init_points=5, n_iter=50)
    print(lgbBO.res)
dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",")
X = dataset[:,0:8]
y = dataset[:,8]
train_x, X_test, train_y, y_test = train_test_split(X, y, test_size=0.2)
bayesOpt(train_x, train_y)

---------------------------------------------------------------------------------------------------------------------

# coding: utf-8
import numpy as np
import pandas as pd
import lightgbm as lgb

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

print('Loading data...')
# load or create your dataset
df_train = pd.read_csv('../regression/regression.train', header=None, sep='t')
df_test = pd.read_csv('../regression/regression.test', header=None, sep='t')

y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)

print('Starting training...')
# train
gbm = lgb.LGBMRegressor(num_leaves=31,
                        learning_rate=0.05,
                        n_estimators=20)
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric='l1',
        early_stopping_rounds=5)

print('Starting predicting...')
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
# eval
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)

# feature importances
print('Feature importances:', list(gbm.feature_importances_))


# self-defined eval metric
# f(y_true: array, y_pred: array) -> name: string, eval_result: float, is_higher_better: bool
# Root Mean Squared Logarithmic Error (RMSLE)
def rmsle(y_true, y_pred):
    return 'RMSLE', np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y_true), 2))), False


print('Starting training with custom eval function...')
# train
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric=rmsle,
        early_stopping_rounds=5)


# another self-defined eval metric
# f(y_true: array, y_pred: array) -> name: string, eval_result: float, is_higher_better: bool
# Relative Absolute Error (RAE)
def rae(y_true, y_pred):
    return 'RAE', np.sum(np.abs(y_pred - y_true)) / np.sum(np.abs(np.mean(y_true) - y_true)), False


print('Starting training with multiple custom eval functions...')
# train
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric=[rmsle, rae],
        early_stopping_rounds=5)

print('Starting predicting...')
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
# eval
print('The rmsle of prediction is:', rmsle(y_test, y_pred)[1])
print('The rae of prediction is:', rae(y_test, y_pred)[1])

# other scikit-learn modules
estimator = lgb.LGBMRegressor(num_leaves=31)

param_grid = {
    'learning_rate': [0.01, 0.1, 1],
    'n_estimators': [20, 40]
}

gbm = GridSearchCV(estimator, param_grid, cv=3)
gbm.fit(X_train, y_train)

print('Best parameters found by grid search are:', gbm.best_params_)


# coding: utf-8
import numpy as np
import pandas as pd
import lightgbm as lgb

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

print('Loading data...')
# load or create your dataset
df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t')
df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t')

y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)

print('Starting training...')
# train
gbm = lgb.LGBMRegressor(num_leaves=31,
                        learning_rate=0.05,
                        n_estimators=20)
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric='l1',
        early_stopping_rounds=5)

print('Starting predicting...')
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
# eval
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)

# feature importances
print('Feature importances:', list(gbm.feature_importances_))


# self-defined eval metric
# f(y_true: array, y_pred: array) -> name: string, eval_result: float, is_higher_better: bool
# Root Mean Squared Logarithmic Error (RMSLE)
def rmsle(y_true, y_pred):
    return 'RMSLE', np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y_true), 2))), False


print('Starting training with custom eval function...')
# train
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric=rmsle,
        early_stopping_rounds=5)


# another self-defined eval metric
# f(y_true: array, y_pred: array) -> name: string, eval_result: float, is_higher_better: bool
# Relative Absolute Error (RAE)
def rae(y_true, y_pred):
    return 'RAE', np.sum(np.abs(y_pred - y_true)) / np.sum(np.abs(np.mean(y_true) - y_true)), False


print('Starting training with multiple custom eval functions...')
# train
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric=[rmsle, rae],
        early_stopping_rounds=5)

print('Starting predicting...')
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
# eval
print('The rmsle of prediction is:', rmsle(y_test, y_pred)[1])
print('The rae of prediction is:', rae(y_test, y_pred)[1])

# other scikit-learn modules
estimator = lgb.LGBMRegressor(num_leaves=31)

param_grid = {
    'learning_rate': [0.01, 0.1, 1],
    'n_estimators': [20, 40]
}

gbm = GridSearchCV(estimator, param_grid, cv=3)
gbm.fit(X_train, y_train)

print('Best parameters found by grid search are:', gbm.best_params_)

# coding: utf-8
import lightgbm as lgb
import pandas as pd
from sklearn.metrics import mean_squared_error

print('Loading data...')
# load or create your dataset
df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t')
df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t')

y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)

# create dataset for lightgbm
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

# specify your configurations as a dict
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2', 'l1'},
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

print('Starting training...')
# train
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=20,
                valid_sets=lgb_eval,
                early_stopping_rounds=5)

print('Saving model...')
# save model to file
gbm.save_model('model.txt')

print('Starting predicting...')
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# eval
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)


# coding: utf-8
import lightgbm as lgb
import pandas as pd

if lgb.compat.MATPLOTLIB_INSTALLED:
    import matplotlib.pyplot as plt
else:
    raise ImportError('You need to install matplotlib for plot_example.py.')

print('Loading data...')
# load or create your dataset
df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t')
df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t')

y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)

# create dataset for lightgbm
lgb_train = lgb.Dataset(X_train, y_train)
lgb_test = lgb.Dataset(X_test, y_test, reference=lgb_train)

# specify your configurations as a dict
params = {
    'num_leaves': 5,
    'metric': ('l1', 'l2'),
    'verbose': 0
}

evals_result = {}  # to record eval results for plotting

print('Starting training...')
# train
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=100,
                valid_sets=[lgb_train, lgb_test],
                feature_name=['f' + str(i + 1) for i in range(X_train.shape[-1])],
                categorical_feature=[21],
                evals_result=evals_result,
                verbose_eval=10)

print('Plotting metrics recorded during training...')
ax = lgb.plot_metric(evals_result, metric='l1')
plt.show()

print('Plotting feature importances...')
ax = lgb.plot_importance(gbm, max_num_features=10)
plt.show()

print('Plotting split value histogram...')
ax = lgb.plot_split_value_histogram(gbm, feature='f26', bins='auto')
plt.show()

print('Plotting 54th tree...')  # one tree use categorical feature to split
ax = lgb.plot_tree(gbm, tree_index=53, figsize=(15, 15), show_info=['split_gain'])
plt.show()

print('Plotting 54th tree with graphviz...')
graph = lgb.create_tree_digraph(gbm, tree_index=53, name='Tree54')
graph.render(view=True)

>>>aa=np.zeros((3,3))

>>>aa[0,0]=1

>>>aa[0,1]=2

>>>aa[0,2]=3

>>>aa[1,0]=4

>>>aa[1,1]=5

>>>aa[1,2]=6

>>>aa[2,0]=7

>>>aa[2,1]=8

>>>aa[2,2]=9

>>>aa

array([[1.,2., 3.],

       [4., 5., 6.],

       [7., 8., 9.]])

>>>a=pd.DataFrame([[1,2,3],[4,5,6],[7,8,9]])

>>>a

   0 1  2

0  1 2  3

1  4 5  6

2  7 8  9

>>>aa

array([[1.,2., 3.],

       [4., 5., 6.],

       [7., 8., 9.]])

>>>a[0]

0    1

1    4

2    7

Name:0, dtype:int64

>>>a.drop(0,axis=1)

   1  2

0  2  3

1  5  6

2 8  9


# Load Library
from sklearn.datasets import make_moons
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
# Step1: Create data set
X, y = make_moons(n_samples=10000, noise=.5, random_state=0)
# Step2: Split the training test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Step 3: Fit a Decision Tree model as comparison
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)
OUTPUT: 0.756
# Step 4: Fit a Random Forest model, " compared to "Decision Tree model, accuracy go up by 5%
clf = RandomForestClassifier(n_estimators=100, max_features="auto",random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)
OUTPUT: 0.797
# Step 5: Fit a AdaBoost model, " compared to "Decision Tree model, accuracy go up by 10%
clf = AdaBoostClassifier(n_estimators=100)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)
OUTPUT:0.833
# Step 6: Fit a Gradient Boosting model, " compared to "Decision Tree model, accuracy go up by 10%
clf = GradientBoostingClassifier(n_estimators=100)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)
OUTPUT:0.834
Note: Parameter - n_estimators stands for how many tree we want to grow


https://scikit-optimize.github.io/stable/auto_examples/hyperparameter-optimization.html#sphx-glr-auto-examples-hyperparameter-optimization-py
--------------------------------------------------------------------------------------------------------------------
"""
============================================
Tuning a scikit-learn estimator with `skopt`
============================================

Gilles Louppe, July 2016
Katie Malone, August 2016
Reformatted by Holger Nahrstaedt 2020

.. currentmodule:: skopt

If you are looking for a :obj:`sklearn.model_selection.GridSearchCV` replacement checkout
:ref:`sphx_glr_auto_examples_sklearn-gridsearchcv-replacement.py` instead.

Problem statement
=================

Tuning the hyper-parameters of a machine learning model is often carried out
using an exhaustive exploration of (a subset of) the space all hyper-parameter
configurations (e.g., using :obj:`sklearn.model_selection.GridSearchCV`), which
often results in a very time consuming operation.

In this notebook, we illustrate how to couple :class:`gp_minimize` with sklearn's
estimators to tune hyper-parameters using sequential model-based optimisation,
hopefully resulting in equivalent or better solutions, but within less
evaluations.

Note: scikit-optimize provides a dedicated interface for estimator tuning via
:class:`BayesSearchCV` class which has a similar interface to those of
:obj:`sklearn.model_selection.GridSearchCV`. This class uses functions of skopt to perform hyperparameter
search efficiently. For example usage of this class, see
:ref:`sphx_glr_auto_examples_sklearn-gridsearchcv-replacement.py`
example notebook.
"""
print(__doc__)
import numpy as np

#############################################################################
# Objective
# =========
# To tune the hyper-parameters of our model we need to define a model,
# decide which parameters to optimize, and define the objective function
# we want to minimize.

from sklearn.datasets import load_boston
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score

boston = load_boston()
X, y = boston.data, boston.target
n_features = X.shape[1]

# gradient boosted trees tend to do well on problems like this
reg = GradientBoostingRegressor(n_estimators=50, random_state=0)

#############################################################################
# Next, we need to define the bounds of the dimensions of the search space
# we want to explore and pick the objective. In this case the cross-validation
# mean absolute error of a gradient boosting regressor over the Boston
# dataset, as a function of its hyper-parameters.

from skopt.space import Real, Integer
from skopt.utils import use_named_args


# The list of hyper-parameters we want to optimize. For each one we define the
# bounds, the corresponding scikit-learn parameter name, as well as how to
# sample values from that dimension (`'log-uniform'` for the learning rate)
space  = [Integer(1, 5, name='max_depth'),
          Real(10**-5, 10**0, "log-uniform", name='learning_rate'),
          Integer(1, n_features, name='max_features'),
          Integer(2, 100, name='min_samples_split'),
          Integer(1, 100, name='min_samples_leaf')]

# this decorator allows your objective function to receive a the parameters as
# keyword arguments. This is particularly convenient when you want to set
# scikit-learn estimator parameters
@use_named_args(space)
def objective(**params):
    reg.set_params(**params)

    return -np.mean(cross_val_score(reg, X, y, cv=5, n_jobs=-1,
                                    scoring="neg_mean_absolute_error"))

#############################################################################
# Optimize all the things!
# ========================
# With these two pieces, we are now ready for sequential model-based
# optimisation. Here we use gaussian process-based optimisation.

from skopt import gp_minimize
res_gp = gp_minimize(objective, space, n_calls=50, random_state=0)

"Best score=%.4f" % res_gp.fun

#############################################################################

print("""Best parameters:
- max_depth=%d
- learning_rate=%.6f
- max_features=%d
- min_samples_split=%d
- min_samples_leaf=%d""" % (res_gp.x[0], res_gp.x[1],
                            res_gp.x[2], res_gp.x[3],
                            res_gp.x[4]))

#############################################################################
# Convergence plot
# ================

from skopt.plots import plot_convergence

plot_convergence(res_gp)
--------------------------------------------------------------------------------------------------------------------




핑백

덧글

  • 바죠 2021/01/13 15:08 # 답글

    https://scikit-optimize.github.io/stable/auto_examples/hyperparameter-optimization.html#sphx-glr-auto-examples-hyperparameter-optimization-py
댓글 입력 영역

최근 포토로그



MathJax