-
Notifications
You must be signed in to change notification settings - Fork 13
Expand file tree
/
Copy pathutilities.py
More file actions
144 lines (116 loc) · 4.36 KB
/
utilities.py
File metadata and controls
144 lines (116 loc) · 4.36 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
@author: Guansong Pang
Source code for the REPEN algorithm in KDD'18. See the following paper for detail.
Guansong Pang, Longbing Cao, Ling Chen, and Huan Liu. 2018. Learning Representations
of Ultrahigh-dimensional Data for Random Distance-based Outlier Detection.
In KDD 2018: 24th ACM SIGKDD International Conference on Knowledge Discovery &
Data Mining, August 19–23, 2018, London, United Kingdom.
"""
import pandas as pd
import numpy as np
from sklearn.metrics import auc,roc_curve, precision_recall_curve, average_precision_score
from sklearn import preprocessing
import matplotlib.pyplot as plt
# joblib was unbundled from scikit-learn and sklearn.externals.joblib was
# removed in scikit-learn 0.23; import the standalone package, falling back
# to the old bundled location for very old scikit-learn installs.
try:
    from joblib import Memory
except ImportError:
    from sklearn.externals.joblib import Memory
from sklearn.datasets import load_svmlight_file
# Disk cache used by @mem.cache below; path is machine-specific (original author's home).
mem = Memory("/home/gupang/Data/mycache")
@mem.cache
def get_data_from_svmlight_file(path):
    """Load a svmlight/libsvm-format file, memoized on disk via joblib.

    Returns (X, y): the feature matrix and target vector produced by
    sklearn's load_svmlight_file.
    """
    X, y = load_svmlight_file(path)
    return X, y
def dataLoading(path):
    """Load a CSV dataset and split off the 'class' column as labels.

    Prints the feature-matrix shape as a side effect.
    Returns (x_train, labels): x_train is a numpy array of all non-'class'
    columns; labels is the 'class' column (pandas Series).
    """
    df = pd.read_csv(path)
    labels = df['class']
    features = df.drop(['class'], axis=1).values
    print(features.shape)
    return features, labels
def rescaling(x):
    """Min-max scale every feature of x into the [0, 1] range."""
    scaler = preprocessing.MinMaxScaler()
    return scaler.fit_transform(x)
def cutoff(values, th = 1.7321):
    """Split an (n, 1) score array into inlier/outlier index arrays.

    Scores are ranked ascending; the threshold is mean + th * std of the
    scores (default th = sqrt(3) ~ 1.7321). Returns (inlier_ind,
    outlier_ind): original-row indices ordered by ascending score, where
    inliers have score <= threshold and outliers have score > threshold.
    """
    order = np.argsort(values, axis=0)      # (n, 1) ascending ranking of rows
    ranked = values[order, 0]               # scores in sorted order
    threshold = np.mean(ranked) + th * np.std(ranked)
    above = np.where(ranked > threshold)[0]
    below = np.where(ranked <= threshold)[0]
    # positions in the sorted order are mapped back to original row indices
    return order[below], order[above]
def cutoff_unsorted(values, th = 1.7321):
    """Split a 1-D score array into inlier/outlier index arrays.

    The threshold is mean + th * std (default th = sqrt(3) ~ 1.7321).
    When that threshold reaches or exceeds the maximum score (so nothing
    would be flagged), fall back to flagging the top-10 scores.

    Returns (inlier_ind, outlier_ind): index arrays into `values` for
    scores <= threshold and > threshold respectively.
    """
    v_mean = np.mean(values)
    v_std = np.std(values)
    threshold = v_mean + th * v_std
    if threshold >= np.max(values):
        # Degenerate case: use the 11th-largest score as threshold so the
        # top-10 scores become outliers. The original indexed temp[-11]
        # unconditionally, which raised IndexError for fewer than 11
        # values; clamp the index to the smallest score instead (for tiny
        # arrays this flags everything above the minimum).
        temp = np.sort(values)
        threshold = temp[max(len(temp) - 11, 0)]
    outlier_ind = np.where(values > threshold)[0]
    inlier_ind = np.where(values <= threshold)[0]
    return inlier_ind, outlier_ind
def aucPerformance(mse, labels):
    """Compute, print, and return the ROC AUC of scores against labels.

    mse: per-sample outlier scores; labels: ground truth with 1 marking
    the positive (outlier) class.
    """
    fpr, tpr, _ = roc_curve(labels, mse, pos_label=1)
    score = auc(fpr, tpr)
    print(score)
    return score
def prcPerformance(scores, labels):
    """Compute the precision-recall curve and print the precision values.

    The original computed the curve and returned nothing; the curve is
    now returned as (precision, recall) so callers can actually use it.
    Existing callers that ignore the return value are unaffected.
    """
    precision, recall, thresholds = precision_recall_curve(labels, scores)
    print(precision)
    return precision, recall
def normalization(scores):
    """Invert scores relative to their sum: s_i -> (sum - s_i) / sum.

    Larger raw scores map to smaller normalized values. NOTE(review):
    undefined when the scores sum to zero — confirm callers guarantee
    positive scores.
    """
    total = np.sum(scores)
    return (total - scores) / total
def writeOutlierScores(scores, labels, name):
    """Write per-sample (label, score) rows to ./outlierscores/<name>.csv.

    scores: indexable of 1-element rows (the score is read from
    scores[idx][0]); labels: per-sample labels of the same length.
    """
    # 'with' guarantees the handle is closed and flushed — the original
    # opened the file and never closed it (resource leak).
    with open('./outlierscores/' + name + '.csv', 'w') as csv_file:
        csv_file.write('class,score\n')
        for idx in range(0, len(scores)):
            csv_file.write(str(labels[idx]) + "," + str(scores[idx][0]) + "\n")
def writeRepresentation(data, labels, dim, name):
    """Save a learned representation and its labels to ../data/representation/<name>.csv.

    data: (n, dim) array of representation features; labels: length-n
    array, reshaped to a column and appended as the last column. The CSV
    header is attr0..attr{dim-1} followed by 'class'.
    """
    out_path = '../data/representation/' + name + '.csv'
    header = ['attr' + str(i) for i in range(dim)] + ['class']
    label_col = labels.reshape(len(labels), 1)
    combined = np.concatenate((data, label_col), axis=1)
    pd.DataFrame(combined).to_csv(out_path, header=header)
def writeResults(name, dim, auc, path = "./results/auc_performance.csv", std_auc = 0.0):
    """Append one result row 'name,dim,auc,std_auc' to the results CSV.

    The original left the append-mode handle open; 'with' closes it so
    the row is flushed to disk immediately.
    """
    with open(path, 'a') as csv_file:
        csv_file.write(name + "," + str(dim) + "," + str(auc) + "," + str(std_auc) + "\n")
def visualizeData(data, labels, name):
    """Scatter-plot a 2-D dataset: label==1 points in red, the rest in blue.

    data: (n, 2) array of coordinates; labels: per-sample labels where 1
    marks outliers. Shows the figure interactively.
    """
    plt.figure(figsize=(5, 5))
    plt.plot(data[labels == 1, 0], data[labels == 1, 1], 'ro')   # outliers
    plt.plot(data[labels != 1, 0], data[labels != 1, 1], 'bo')   # inliers
    plt.title('2-D ' + name)
    plt.xlabel('x1')
    plt.ylabel('x2')
    plt.legend(['outliers', 'inliers'], loc='upper right')
    plt.show()