The easiest Mahalanobis distance

incredible.egloos.com/7539231

A distance that reflects the correlations present in the data; a statistically perceived distance.
It is unitless.
It is defined independently of the scale of the data.

It expresses how far a point lies from the mean, as a multiple of the standard deviation:
d_M(x) = sqrt( (x - mu)^T Sigma^{-1} (x - mu) ),
where mu is the mean vector and Sigma is the covariance matrix of the data.

To use it, compute the mean vector and the covariance matrix.
For deciding outliers, the distance threshold is chosen from a percentile (a minimal sketch follows below).
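
A minimal sketch of this definition, assuming only NumPy (the sample values are made up for illustration):

import numpy as np

# 1D case: the Mahalanobis distance from the mean is just the z-score |x - mu| / sigma
x1d = np.array([2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0])    # made-up sample
mu, sigma = x1d.mean(), x1d.std(ddof=1)
print(abs(10.0 - mu) / sigma)        # how many standard deviations the value 10 lies from the mean

# General case: d_M(x) = sqrt((x - mu)^T Sigma^{-1} (x - mu))
X = np.random.normal(size=(200, 3))                  # made-up sample, one row per observation
mu_vec = X.mean(axis=0)                              # mean vector
Sigma_inv = np.linalg.inv(np.cov(X, rowvar=False))   # inverse covariance matrix
x = np.array([1.0, 2.0, 3.0])
print(np.sqrt((x - mu_vec) @ Sigma_inv @ (x - mu_vec)))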

For example, when the data have been transformed, say with PCA, so that a new coordinate system has been chosen, a distance can be defined in that space.
PCA is one method of dimensionality reduction.

import numpy as np
from scipy.spatial.distance import euclidean, mahalanobis
from sklearn.decomposition import PCA

def mahalanobis_distance(p, distr):
    # p: a point
    # distr: a distribution, given as a 2D array with one row per sample
    # covariance matrix of the distribution
    cov = np.cov(distr, rowvar=False)
    # mean vector of the points in distr
    avg_distri = np.average(distr, axis=0)
    # scipy's mahalanobis expects the *inverse* covariance matrix as its third argument
    dis = mahalanobis(p, avg_distri, np.linalg.inv(cov))
    return dis

X = np.array([[1, 2], [2, 2], [3, 3]])
cov = np.cov(X, rowvar=False)
covI = np.linalg.inv(cov)   # inverse covariance matrix, as required by scipy's mahalanobis
maha = mahalanobis(X[0], X[1], covI)

# After PCA with whitening the transformed data has (approximately) identity covariance,
# so the plain Euclidean distance in that space equals the Mahalanobis distance.
pca = PCA(whiten=True)
X_transformed = pca.fit_transform(X)
print('Mahalanobis distance: ' + str(maha))
print('Euclidean distance: ' + str(euclidean(X_transformed[0], X_transformed[1])))

p = [1, 2]
dist = mahalanobis_distance(p, X_transformed)
print(dist)

Mahalanobis distance: 2.0
Euclidean distance: 2.0000000000000004
2.23606797749979

---------------------------------------------------------------------------------------------------------------------

from scipy.spatial import distance
# iv is the inverse of the covariance matrix (the VI argument of scipy's mahalanobis)
iv = [[1, 0.5, 0.5], [0.5, 1, 0.5], [0.5, 0.5, 1]]
distance.mahalanobis([1, 0, 0], [0, 1, 0], iv)
1.0
distance.mahalanobis([0, 2, 0], [0, 1, 0], iv)
1.0
distance.mahalanobis([2, 0, 0], [0, 1, 0], iv)
1.7320508075688772

---------------------------------------------------------------------------------------------------------------------

# importing numpy library
import numpy as np

# Function to calculate the chi-squared distance
def chi2_distance(A, B):

    # chi-squared distance: 0.5 * sum_i (a_i - b_i)^2 / (a_i + b_i)
    chi = 0.5 * np.sum([((a - b) ** 2) / (a + b)
                        for (a, b) in zip(A, B)])

    return chi

# main function
if __name__ == "__main__":
    a = [1, 2, 13, 5, 45, 23]
    b = [67, 90, 18, 79, 24, 98]

    result = chi2_distance(a, b)
    print("The chi-squared distance is:", result)

---------------------------------------------------------------------------------------------------------------------

from sklearn.neighbors import KNeighborsClassifier

X = [[0], [1], [2], [3]]
y = [0, 0, 1, 1]

knn = KNeighborsClassifier(n_neighbors=1, weights="distance", metric="euclidean")
knn.fit(X, y)
print(knn.predict([[1.1]]))          # predicted class
print(knn.predict_proba([[1.1]]))    # class probabilities
dist, index = knn.kneighbors([[1.1]])
print(dist, index)                   # distance to, and index of, the nearest neighbor

[0]
[[1. 0.]]
[[0.1]] [[1]]


from sklearn.neighbors import NearestNeighbors
import numpy as np
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
nbrs = NearestNeighbors(n_neighbors=2, algorithm='ball_tree').fit(X)
distances, indices = nbrs.kneighbors(X)
print( indices)
print( distances)

[[0 1]
 [1 0]
 [2 1]
 [3 4]
 [4 3]
 [5 4]]
[[0.         1.        ]
 [0.         1.        ]
 [0.         1.41421356]
 [0.         1.        ]
 [0.         1.        ]
 [0.         1.41421356]]


---------------------------------------------------------------------------------------------------------------------


import numpy as np
import pandas as pd
import scipy.stats as stats

#create dataframe with three columns 'A', 'B', 'C'
np.random.seed(10)
data = pd.DataFrame( np.random.normal(0, 10, size=(100, 3)), columns=['A', 'B', 'C'])
print(data[:10])

#find absolute value of z-score for each observation
z = np.abs(stats.zscore(data))
#only keep rows in dataframe with all z-scores less than absolute value of 3
data_clean = data[(z<3).all(axis=1)]
#find how many rows are left in the dataframe
print( data_clean.shape)

#find Q1, Q3, and interquartile range for each column
Q1 = data.quantile(q=.25)
Q3 = data.quantile(q=.75)
IQR = data.apply(stats.iqr)
#only keep rows in dataframe that have values within 1.5*IQR of Q1 and Q3
data_clean = data[~((data < (Q1-1.5*IQR)) | (data > (Q3+1.5*IQR))).any(axis=1)]
#find how many rows are left in the dataframe
print( data_clean.shape)

           A          B          C
0  13.315865   7.152790 -15.454003
1  -0.083838   6.213360  -7.200856
2   2.655116   1.085485   0.042914
3  -1.746002   4.330262  12.030374
4  -9.650657  10.282741   2.286301
5   4.451376 -11.366022   1.351369
6  14.845370 -10.798049 -19.777283
7 -17.433723   2.660702  23.849673
8  11.236913  16.726222   0.991492
9  13.979964  -2.712480   6.132042
(99, 3)
(89, 3)

---------------------------------------------------------------------------------------------------------------------

An example of using the Mahalanobis distance for outlier detection:
compute the mean vector and the covariance matrix, then
use a percentile of the resulting distances as the threshold for deciding which points are outliers (a minimal sketch follows).
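
A minimal sketch of this recipe, assuming only NumPy; the Gaussian data and the 97.5th-percentile cutoff are made-up illustrative choices:

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(500, 3))                        # made-up data, one row per observation

mu = X.mean(axis=0)                                  # mean vector
Sigma_inv = np.linalg.inv(np.cov(X, rowvar=False))   # inverse covariance matrix

diff = X - mu
d = np.sqrt(np.sum((diff @ Sigma_inv) * diff, axis=1))   # Mahalanobis distance of every row

threshold = np.percentile(d, 97.5)                   # percentile-based threshold
outliers = np.where(d > threshold)[0]                # indices of the flagged rows
print(threshold, outliers)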

---------------------------------------------------------------------------------------------------------------------

import numpy as np
import pandas as pd 
import scipy.stats as stats

data = {'score': [91, 93, 72, 87, 86, 73, 68, 87, 78, 99, 95, 76, 84, 96, 76, 80, 83, 84, 73, 74],
        'hours': [16, 6, 3, 1, 2, 3, 2, 5, 2, 5, 2, 3, 4, 3, 3, 3, 4, 3, 4, 4],
        'prep': [3, 4, 0, 3, 4, 0, 1, 2, 1, 2, 3, 3, 3, 2, 2, 2, 3, 3, 2, 2],
        'grade': [70, 88, 80, 83, 88, 84, 78, 94, 90, 93, 89, 82, 95, 94, 81, 93, 93, 90, 89, 89]
        }

df = pd.DataFrame(data,columns=['score', 'hours', 'prep','grade'])

# create a function that returns the squared Mahalanobis distance of each row
def mahalanobis(x=None, data=None, cov=None):
    x_mu = x - np.mean(data)                 # subtract the column means
    if cov is None:
        cov = np.cov(data.values.T)          # covariance matrix of the data
    inv_covmat = np.linalg.inv(cov)
    left = np.dot(x_mu, inv_covmat)
    mahal = np.dot(left, x_mu.T)
    return mahal.diagonal()                  # the squared distances are on the diagonal

#create new column in dataframe that contains Mahalanobis distance for each row
df['mahalanobis'] = mahalanobis(x=df, data=df[['score', 'hours', 'prep', 'grade']])

from scipy.stats import chi2
# calculate the p-value for each squared Mahalanobis distance
# (degrees of freedom = number of variables - 1 = 3)
df['p'] = 1 - chi2.cdf(df['mahalanobis'], 3)

# display the first five rows, including the Mahalanobis distance and p-value
print(df.head())


---------------------------------------------------------------------------------------------------------------------

How to make a dictionary: building a dictionary from two lists.

fruits = ["Apple", "Pear", "Peach", "Banana"]
prices = [0.35, 0.40, 0.40, 0.28]
fruit_dictionary = dict(zip(fruits, prices))
print(fruit_dictionary)

{'Apple': 0.35, 'Pear': 0.4, 'Peach': 0.4, 'Banana': 0.28}

How to make a list: using the range function.

alist=[ j for j in range(10)  ]

---------------------------------------------------------------------------------------------------------------------
import numpy as np
def create_data(examples=50, features=5, upper_bound=10, outliers_fraction=0.1, extreme=False):
    '''
    Helper for testing: generates a 2D array in which the last outliers_fraction
    of the rows are drawn with a much larger Poisson mean (i.e. outliers).
    '''
    data = []
    magnitude = 4 if extreme else 3
    for i in range(examples):
        if (examples - i) <= round((float(examples) * outliers_fraction)):
            data.append(np.random.poisson(upper_bound ** magnitude, features).tolist())
        else:
            data.append(np.random.poisson(upper_bound, features).tolist())
    return np.array(data)
def MahalanobisDist(data, verbose=False):
    covariance_matrix = np.cov(data, rowvar=False)
    if is_pos_def(covariance_matrix):
        inv_covariance_matrix = np.linalg.inv(covariance_matrix)
        if is_pos_def(inv_covariance_matrix):
            vars_mean = []
            for i in range(data.shape[0]):
                vars_mean.append(list(data.mean(axis=0)))
            diff = data - vars_mean
            md = []
            for i in range(len(diff)):
                md.append(np.sqrt(diff[i].dot(inv_covariance_matrix).dot(diff[i])))

            if verbose:
                print("Covariance Matrix:\n {}\n".format(covariance_matrix))
                print("Inverse of Covariance Matrix:\n {}\n".format(inv_covariance_matrix))
                print("Variables Mean Vector:\n {}\n".format(vars_mean))
                print("Variables - Variables Mean Vector:\n {}\n".format(diff))
                print("Mahalanobis Distance:\n {}\n".format(md))
            return md
        else:
            print("Error: Inverse of Covariance Matrix is not positive definite!")
    else:
        print("Error: Covariance Matrix is not positive definite!")
def is_pos_def(A):
    if np.allclose(A, A.T):
        try:
            np.linalg.cholesky(A)
            return True
        except np.linalg.LinAlgError:
            return False
    else:
        return False
data = create_data(15, 3, 10, 0.1)
print("data:\n {}\n".format(data))
MahalanobisDist(data, verbose=True)


---------------------------------------------------------------------------------------------------------------------




# Step 1: Import Libraries
# Synthetic dataset
from sklearn.datasets import make_classification
# Data processing
import pandas as pd
import numpy as np
from collections import Counter
# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
# Model and performance
import tensorflow as tf
from tensorflow.keras import layers, losses
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Step 2: Create Dataset With Anomalies
# Create an imbalanced dataset
X, y = make_classification(n_samples=100000, n_features=32, n_informative=32,
                           n_redundant=0, n_repeated=0, n_classes=2,
                           n_clusters_per_class=1,
                           weights=[0.995, 0.005],
                           class_sep=0.5, random_state=0)

# Step 3: Train Test Split
# Train test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)
# Check the number of records
print('The number of records in the training dataset is', X_train.shape[0])
print('The number of records in the test dataset is', X_test.shape[0])
print(
    f"The training dataset has {sorted(Counter(y_train).items())[0][1]} records for the majority class and {sorted(Counter(y_train).items())[1][1]} records for the minority class.")

# Step 4: Autoencoder Algorithm For Anomaly Detection
# No code for this step
# Step 5: Autoencoder Model Training
# Keep only the normal data for the training dataset
X_train_normal = X_train[np.where(y_train == 0)]
# Input layer
input = tf.keras.layers.Input(shape=(32,))
# Encoder layers
encoder = tf.keras.Sequential([
    layers.Dense(16, activation='relu'),
    layers.Dense(8, activation='relu'),
    layers.Dense(4, activation='relu')])(input)
# Decoder layers
decoder = tf.keras.Sequential([
    layers.Dense(8, activation="relu"),
    layers.Dense(16, activation="relu"),
    layers.Dense(32, activation="sigmoid")])(encoder)
# Create the autoencoder
autoencoder = tf.keras.Model(inputs=input, outputs=decoder)

# Compile the autoencoder
autoencoder.compile(optimizer='adam', loss='mae')
# Fit the autoencoder
history = autoencoder.fit(X_train_normal, X_train_normal,
                          epochs=50,
                          batch_size=64,
                          validation_data=(X_test, X_test),
                          shuffle=True)
# Summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

# Step 6: Autoencoder Anomaly Detection Threshold
# Predict anomalies/outliers in the training dataset
prediction = autoencoder.predict(X_test)
# Get the mean absolute error between actual and reconstruction/prediction
prediction_loss = tf.keras.losses.mae(prediction, X_test)
# Check the prediction loss threshold for 2% of outliers
loss_threshold = np.percentile(prediction_loss, 98)
print(
    f'The prediction loss threshold for 2% of outliers is {loss_threshold:.2f}')
# Visualize the threshold
sns.histplot(prediction_loss, bins=30, alpha=0.8)
plt.axvline(x=loss_threshold, color='orange')

# Step 7: Autoencoder Anomaly Detection Performance
# Check the model performance at the 2% threshold
threshold_prediction = [
    0 if i < loss_threshold else 1 for i in prediction_loss]
# Check the prediction performance
print(classification_report(y_test, threshold_prediction))


---------------------------------------------------------------------------------------------------------------------

from sklearn.cluster import OPTICS
from sklearn.datasets import make_blobs
from numpy import quantile, where, random
import matplotlib.pyplot as plt

random.seed(123)
x, _ = make_blobs(n_samples=350, centers=1, cluster_std=.4, center_box=(20, 5))

plt.scatter(x[:, 0], x[:, 1])
plt.grid(True)
plt.show()

model = OPTICS().fit(x)
print(model)

scores = model.core_distances_

thresh = quantile(scores, .98)
print(thresh)

index = where(scores >= thresh)
values = x[index]
print(values)

plt.scatter(x[:, 0], x[:, 1])
plt.scatter(values[:, 0], values[:, 1], color='r')
plt.legend(("normal", "anomaly"), loc="best", fancybox=True, shadow=True)
plt.grid(True)
plt.show()

---------------------------------------------------------------------------------------------------------------------


