scikit-learn, machine learning in python by 바죠


scikit-learn은 파이썬 라이브러리 모듈이다. 특히, 기계학습에 관한 포괄적인 기능을 제공한다.
파이썬을 활용한 기계학습은 사실상 scikit-learn 라이브러리를 활용하는 것을 의미한다.
현실적으로 그렇다. 현재 이보다 더 범용적인 것이 없다.
기계학습을 넘어 딥러닝으로 확장이 가능하다. 다른 라이브러리들도 scikit-learn 라이브러리를 이용하고 참고하고 있다.


Machine Learning in Python

  • Simple and efficient tools for data mining and data analysis
  • Accessible to everybody, and reusable in various contexts
  • Built on NumPy, SciPy, and matplotlib
  • Open source, commercially usable - BSD license

붓꽃의 꽃잎과 꽃받침의 폭과 길이를 입력으로 하여 꽃의 종류를 예측하는 기계학습.

1. sepal length in cm
2. sepal width in cm
3. petal length in cm
4. petal width in cm

5. class:
-- Iris Setosa
-- Iris Versicolour
-- Iris Virginica
꽃 종류는 3가지이다.
세 가지 종류(Versicolor, Setosa, Virginica)의 꽃을
Sepal(꽃받침), Petal(꽃잎) 값들로부터 분류하는 것을 기계학습으로 진행할 수 있다.
정확히는 Sepal 길이, 폭, Petal 길이, 폭 등 4가지 숫자를 이용한다. 길이는 cm 단위로 측정된 것들이다.
Iris flower dataset
Sir Ronald Aylmer Fisher (1936)

# Sample Decision Tree Classifier
from sklearn import datasets
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
# load the iris dataset (150 samples, 4 features, 3 classes)
dataset = datasets.load_iris()
# fit a CART model to the data
model = DecisionTreeClassifier()
model.fit(dataset.data, dataset.target)
print(model)
# make predictions on the same data the model was trained on,
# so the scores measure training fit, not generalization
expected = dataset.target
predicted = model.predict(dataset.data)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        50
          1       1.00      1.00      1.00        50
          2       1.00      1.00      1.00        50

avg / total       1.00      1.00      1.00       150

[[50  0  0]
 [ 0 50  0]
 [ 0  0 50]]

from  sklearn import  datasets
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import accuracy_score

gini impurity에 많은 영향을 미칠수록(즉, 분기할 때 impurity를 많이 감소시킬수록) 중요한 feature이다.
이를 바탕으로 feature importance가 정의될 수 있다.

import numpy as np
import xgboost as xgb
from sklearn import datasets
#from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.datasets import dump_svmlight_file
#from sklearn.externals import joblib
import joblib
from sklearn.metrics import precision_score

iris = datasets.load_iris()
X = iris.data    # feature matrix
y = iris.target  # integer class labels

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# use DMatrix for xgboost
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# use svmlight file for xgboost
dump_svmlight_file(X_train, y_train, 'dtrain.svm', zero_based=True)
dump_svmlight_file(X_test, y_test, 'dtest.svm', zero_based=True)
dtrain_svm = xgb.DMatrix('dtrain.svm')
dtest_svm = xgb.DMatrix('dtest.svm')

# set xgboost params
# NOTE(review): newer xgboost releases replace 'silent' with 'verbosity';
# confirm against the installed version.
param = {
    'max_depth': 3,  # the maximum depth of each tree
    'eta': 0.3,  # the training step for each iteration
    'silent': 1,  # logging mode - quiet
    'objective': 'multi:softprob',  # error evaluation for multiclass training
    'num_class': 3}  # the number of classes that exist in this dataset
num_round = 20  # the number of training iterations

#------------- numpy array ------------------
# training and testing - numpy matrices
bst = xgb.train(param, dtrain, num_round)
preds = bst.predict(dtest)

# extracting most confident predictions
# ('multi:softprob' yields one probability per class for every sample)
best_preds = np.asarray([np.argmax(line) for line in preds])
print("Numpy array precision:", precision_score(y_test, best_preds, average='macro'))

# ------------- svm file ---------------------
# training and testing - svm file
bst_svm = xgb.train(param, dtrain_svm, num_round)
# evaluate the model trained from the svm file (not the numpy-trained `bst`)
preds_svm = bst_svm.predict(dtest_svm)

# extracting most confident predictions
best_preds_svm = [np.argmax(line) for line in preds_svm]
print("Svm file precision:", precision_score(y_test, best_preds_svm, average='macro'))
# --------------------------------------------

# dump the models

# save the models for later
joblib.dump(bst, 'bst_model.pkl', compress=True)
joblib.dump(bst_svm, 'bst_svm_model.pkl', compress=True)

from sklearn import datasets
iris = datasets.load_iris()
X = iris.data    # feature matrix
y = iris.target  # integer class labels
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
print("Train data length:", len(X_train))
print("Test data length:", len(X_test))
import xgboost as xgb
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
# NOTE(review): newer xgboost releases replace 'silent' with 'verbosity';
# confirm against the installed version.
parameters = {
    'eta': 0.3,
    'silent': True,  # option for logging
    'objective': 'multi:softprob',  # error evaluation for multiclass tasks
    'num_class': 3,  # number of classes to predict
    'max_depth': 3,  # depth of the trees in the boosting process
}
num_round = 20  # the number of training iterations
bst = xgb.train(parameters, dtrain, num_round)
preds = bst.predict(dtest)
print(preds[:5])
# Select the column that represents the highest probability
# (note that, for each line, there are 3 columns, indicating the
# probability for each class).
import numpy as np
best_preds = np.asarray([np.argmax(line) for line in preds])
from sklearn.metrics import precision_score
print(precision_score(y_test, best_preds, average='macro'))

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn import datasets
from sklearn.decomposition import PCA
import numpy as np
import lightgbm as lgb

# import some data to play with
iris = datasets.load_iris()
X = iris.data[:, :2]  # we only take the first two features.
y = iris.target

# pad the axis limits by half a unit so no point sits on the border
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
plt.figure(2, figsize=(8, 6))

# Plot the training points, colored by class label
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Set1, edgecolor='k')
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)

# To get a better understanding of the interaction of the dimensions,
# plot the first three PCA dimensions
fig = plt.figure(1, figsize=(8, 6))
# add_subplot attaches the 3D axes to the figure; constructing Axes3D(fig)
# directly no longer does so on recent matplotlib releases
ax = fig.add_subplot(111, projection='3d', elev=-150, azim=110)
# PCA is fitted on all four features, not only the two plotted above
X_reduced = PCA(n_components=3).fit_transform(iris.data)
ax.scatter(X_reduced[:, 0], X_reduced[:, 1], X_reduced[:, 2], c=y,
           cmap=plt.cm.Set1, edgecolor='k', s=40)
ax.set_title("First three PCA directions")
ax.set_xlabel("1st eigenvector")
ax.set_ylabel("2nd eigenvector")
ax.set_zlabel("3rd eigenvector")

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
X = iris.data[:, :2]  # we only take the first two features.
y = iris.target
# encode the class labels as consecutive integers starting at 0
le = preprocessing.LabelEncoder()
y_label = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_label, test_size=0.30, random_state=42)

params = {
          "objective" : "multiclass",
          "num_class" : 3,           # iris has three species
          "num_leaves" : 60,
          "max_depth": -1,
          "learning_rate" : 0.01,
          "bagging_fraction" : 0.9,  # subsample
          "feature_fraction" : 0.9,  # colsample_bytree
          "bagging_freq" : 5,        # subsample_freq
          "bagging_seed" : 2018,
          "verbosity" : -1 }

# NOTE(review): the held-out split doubles as the early-stopping validation
# set, so scores computed on it afterwards are optimistic.
# NOTE(review): early_stopping_rounds / verbose_eval are keyword arguments of
# older lightgbm releases; newer ones expect callbacks - confirm the version.
lgtrain, lgval = lgb.Dataset(X_train, y_train), lgb.Dataset(X_test, y_test)
lgbmodel = lgb.train(params, lgtrain, 2000, valid_sets=[lgtrain, lgval], early_stopping_rounds=100, verbose_eval=200)

from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from sklearn.utils.multiclass import unique_labels
# per-sample class scores from the trained model; the predicted label is
# the index of the highest-scoring column
class_scores = lgbmodel.predict(X_test)
y_pred = np.argmax(class_scores, axis=1)
y_true = y_test

def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """Print and plot the confusion matrix of ``y_pred`` against ``y_true``.

    Normalization can be applied by setting `normalize=True`.

    Args:
        y_true: True class labels.
        y_pred: Predicted class labels.
        classes: Class names, indexed with the labels present in the data
            (presumably a NumPy array such as ``iris.target_names`` - a plain
            list does not support this kind of indexing).
        normalize: If true, divide every row by its sum so each row shows
            fractions of the true class instead of raw counts.
        title: Plot title; a default depending on ``normalize`` is used when
            it is empty or ``None``.
        cmap: Matplotlib colormap used for the matrix image.

    Returns:
        The matplotlib axes the matrix was drawn on.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    # Only use the labels that appear in the data
    classes = classes[unique_labels(y_true, y_pred)]
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    # switch the text color at half the maximum so it stays readable
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    return ax

# class names as an array, so they can be indexed by the encoded labels
classes = iris.target_names
plot_confusion_matrix(y_true, y_pred, classes=classes,
                      title='Confusion matrix, without normalization')
from sklearn.metrics import accuracy_score
print(accuracy_score(y_true, y_pred))


댓글 입력 영역

최근 포토로그