-
Notifications
You must be signed in to change notification settings - Fork 13
Expand file tree
/
Copy pathutilities.py
More file actions
144 lines (116 loc) · 4.36 KB
/
utilities.py
File metadata and controls
144 lines (116 loc) · 4.36 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
@author: Guansong Pang
Source code for the REPEN algorithm in KDD'18. See the following paper for detail.
Guansong Pang, Longbing Cao, Ling Chen, and Huan Liu. 2018. Learning Representations
of Ultrahigh-dimensional Data for Random Distance-based Outlier Detection.
In KDD 2018: 24th ACM SIGKDD International Conference on Knowledge Discovery &
Data Mining, August 19–23, 2018, London, United Kingdom.
"""
import pandas as pd
import numpy as np
from sklearn.metrics import auc,roc_curve, precision_recall_curve, average_precision_score
from sklearn import preprocessing
import matplotlib.pyplot as plt
# joblib was unbundled from scikit-learn and sklearn.externals.joblib was
# removed in scikit-learn 0.23; import the standalone package, falling back
# to the old bundled location for very old scikit-learn installs.
try:
    from joblib import Memory
except ImportError:
    from sklearn.externals.joblib import Memory
from sklearn.datasets import load_svmlight_file
# Disk cache used by @mem.cache below; path is machine-specific (original author's home).
mem = Memory("/home/gupang/Data/mycache")
@mem.cache
def get_data_from_svmlight_file(path):
    """Load a svmlight/libsvm-format file, memoized on disk via joblib.

    Returns (X, y): the feature matrix and target vector produced by
    sklearn's load_svmlight_file.
    """
    X, y = load_svmlight_file(path)
    return X, y
def dataLoading(path):
    """Load a CSV dataset and split off the 'class' column as labels.

    Prints the feature-matrix shape as a side effect.
    Returns (x_train, labels): x_train is a numpy array of all non-'class'
    columns; labels is the 'class' column (pandas Series).
    """
    df = pd.read_csv(path)
    labels = df['class']
    features = df.drop(['class'], axis=1).values
    print(features.shape)
    return features, labels
def rescaling(x):
    """Min-max scale every feature of x into the [0, 1] range."""
    scaler = preprocessing.MinMaxScaler()
    return scaler.fit_transform(x)
def cutoff(values, th = 1.7321):
    """Split an (n, 1) score array into inlier/outlier index arrays.

    Scores are ranked ascending; the threshold is mean + th * std of the
    scores (default th = sqrt(3) ~ 1.7321). Returns (inlier_ind,
    outlier_ind): original-row indices ordered by ascending score, where
    inliers have score <= threshold and outliers have score > threshold.
    """
    order = np.argsort(values, axis=0)      # (n, 1) ascending ranking of rows
    ranked = values[order, 0]               # scores in sorted order
    threshold = np.mean(ranked) + th * np.std(ranked)
    above = np.where(ranked > threshold)[0]
    below = np.where(ranked <= threshold)[0]
    # positions in the sorted order are mapped back to original row indices
    return order[below], order[above]
def cutoff_unsorted(values, th = 1.7321):
    """Split a 1-D score array into inlier/outlier index arrays.

    The threshold is mean + th * std (default th = sqrt(3) ~ 1.7321).
    When that threshold reaches or exceeds the maximum score (so nothing
    would be flagged), fall back to flagging the top-10 scores.

    Returns (inlier_ind, outlier_ind): index arrays into `values` for
    scores <= threshold and > threshold respectively.
    """
    v_mean = np.mean(values)
    v_std = np.std(values)
    threshold = v_mean + th * v_std
    if threshold >= np.max(values):
        # Degenerate case: use the 11th-largest score as threshold so the
        # top-10 scores become outliers. The original indexed temp[-11]
        # unconditionally, which raised IndexError for fewer than 11
        # values; clamp the index to the smallest score instead (for tiny
        # arrays this flags everything above the minimum).
        temp = np.sort(values)
        threshold = temp[max(len(temp) - 11, 0)]
    outlier_ind = np.where(values > threshold)[0]
    inlier_ind = np.where(values <= threshold)[0]
    return inlier_ind, outlier_ind
def aucPerformance(mse, labels):
    """Compute, print, and return the ROC AUC of scores against labels.

    mse: per-sample outlier scores; labels: ground truth with 1 marking
    the positive (outlier) class.
    """
    fpr, tpr, _ = roc_curve(labels, mse, pos_label=1)
    score = auc(fpr, tpr)
    print(score)
    return score
def prcPerformance(scores, labels):
    """Compute the precision-recall curve and print the precision values.

    The original computed the curve and returned nothing; the curve is
    now returned as (precision, recall) so callers can actually use it.
    Existing callers that ignore the return value are unaffected.
    """
    precision, recall, thresholds = precision_recall_curve(labels, scores)
    print(precision)
    return precision, recall
def normalization(scores):
    """Invert scores relative to their sum: s_i -> (sum - s_i) / sum.

    Larger raw scores map to smaller normalized values. NOTE(review):
    undefined when the scores sum to zero — confirm callers guarantee
    positive scores.
    """
    total = np.sum(scores)
    return (total - scores) / total
def writeOutlierScores(scores, labels, name):
    """Write per-sample (label, score) rows to ./outlierscores/<name>.csv.

    scores: indexable of 1-element rows (the score is read from
    scores[idx][0]); labels: per-sample labels of the same length.
    """
    # 'with' guarantees the handle is closed and flushed — the original
    # opened the file and never closed it (resource leak).
    with open('./outlierscores/' + name + '.csv', 'w') as csv_file:
        csv_file.write('class,score\n')
        for idx in range(0, len(scores)):
            csv_file.write(str(labels[idx]) + "," + str(scores[idx][0]) + "\n")
def writeRepresentation(data, labels, dim, name):
    """Save a learned representation and its labels to ../data/representation/<name>.csv.

    data: (n, dim) array of representation features; labels: length-n
    array, reshaped to a column and appended as the last column. The CSV
    header is attr0..attr{dim-1} followed by 'class'.
    """
    out_path = '../data/representation/' + name + '.csv'
    header = ['attr' + str(i) for i in range(dim)] + ['class']
    label_col = labels.reshape(len(labels), 1)
    combined = np.concatenate((data, label_col), axis=1)
    pd.DataFrame(combined).to_csv(out_path, header=header)
def writeResults(name, dim, auc, path = "./results/auc_performance.csv", std_auc = 0.0):
    """Append one result row 'name,dim,auc,std_auc' to the results CSV.

    The original left the append-mode handle open; 'with' closes it so
    the row is flushed to disk immediately.
    """
    with open(path, 'a') as csv_file:
        csv_file.write(name + "," + str(dim) + "," + str(auc) + "," + str(std_auc) + "\n")
def visualizeData(data, labels, name):
    """Scatter-plot a 2-D dataset: label==1 points in red, the rest in blue.

    data: (n, 2) array of coordinates; labels: per-sample labels where 1
    marks outliers. Shows the figure interactively.
    """
    plt.figure(figsize=(5, 5))
    plt.plot(data[labels == 1, 0], data[labels == 1, 1], 'ro')   # outliers
    plt.plot(data[labels != 1, 0], data[labels != 1, 1], 'bo')   # inliers
    plt.title('2-D ' + name)
    plt.xlabel('x1')
    plt.ylabel('x2')
    plt.legend(['outliers', 'inliers'], loc='upper right')
    plt.show()