from __future__ import division, print_function, absolute_import

import numpy
from sklearn.utils.validation import column_or_1d

from .commonutils import check_sample_weight, sigmoid_function

__author__ = 'Alex Rogozhnikov'


def prepare_distribution(data, weights):
    """Prepares the distribution to be used later in KS and CvM,
    merges equal data, computes (summed) weights and cumulative distribution.
    All output arrays are of the same length and correspond to each other.

    :param data: array of shape [n_samples]
    :param weights: array of shape [n_samples]
    :return: tuple with (prepared_data, prepared_weights, prepared_cdf),
        components are three parallel arrays of shape [n_unique_values]
    """
    weights = weights / numpy.sum(weights)
    prepared_data, indices = numpy.unique(data, return_inverse=True)
    prepared_weights = numpy.bincount(indices, weights=weights)
    prepared_cdf = compute_cdf(prepared_weights)
    return prepared_data, prepared_weights, prepared_cdf
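

# A minimal usage sketch for prepare_distribution (toy values, not part of the
# original module): duplicates in `data` are merged, their weights are summed
# and normalized, and the symmetrized CDF is evaluated at the unique points.
def _example_prepare_distribution():
    data = numpy.array([1., 1., 2., 3.])
    weights = numpy.array([1., 1., 1., 1.])
    # unique data -> [1., 2., 3.], merged weights -> [0.5, 0.25, 0.25],
    # symmetrized cdf -> [0.25, 0.625, 0.875]
    return prepare_distribution(data, weights)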


# region Helpful functions to work with bins and groups

"""
There are two basic approaches used here: bins and knn.
Here they are represented as bins and (!) groups.
The difference between bins and groups: each event belongs to one and only one bin,
while in the case of groups each event may belong to several groups.
Knn is one particular case of groups; bins can be reduced to groups as well
(see the sketch after group_indices_to_groups_matrix below).

Bin_indices is an array where for each event its bin index is written:
bin_indices = [0, 0, 1, 2, 2, 4]

Group_indices is a list, each item is the indices of events in some group:
group_indices = [[0,1], [2], [3,4], [5]]

Group matrix is another way to write group_indices,
this is a sparse matrix of shape [n_groups, n_samples],
group_matrix[group_id, sample_id] = 1 if the event belongs to the group, 0 otherwise.

While bin indices are computed for all the events together, group indices
are typically computed only for events of some particular class.
"""


def compute_bin_indices(X_part, bin_limits=None, n_bins=20):
    """For an arbitrary number of variables computes the bin indices of data,
    the indices are unique numbers of bin from zero to \prod_j (len(bin_limits[j]) + 1)

    :param X_part: columns along which binning is done
    :param bin_limits: array of edges between bins.
        If bin_limits is not provided, they are computed using the data.
    :type X_part: numpy.ndarray
    """
    if bin_limits is None:
        bin_limits = []
        for variable_index in range(X_part.shape[1]):
            variable_data = X_part[:, variable_index]
            bin_limits.append(numpy.linspace(numpy.min(variable_data), numpy.max(variable_data), n_bins + 1)[1:-1])

    bin_indices = numpy.zeros(len(X_part), dtype=int)
    for axis, bin_limits_axis in enumerate(bin_limits):
        bin_indices *= (len(bin_limits_axis) + 1)
        bin_indices += numpy.searchsorted(bin_limits_axis, X_part[:, axis])
    return bin_indices
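

# A minimal usage sketch for compute_bin_indices (toy values, not part of the
# original module): two variables binned into 2 bins each give flat bin
# indices in [0, 4).
def _example_compute_bin_indices():
    X_part = numpy.array([[0.0, 0.0],
                          [0.0, 1.0],
                          [1.0, 0.0],
                          [1.0, 1.0]])
    # explicit edges: one edge at 0.5 per axis -> 2 bins per axis,
    # flat index = first_axis_bin * 2 + second_axis_bin
    bin_limits = [numpy.array([0.5]), numpy.array([0.5])]
    # expected result: [0, 1, 2, 3]
    return compute_bin_indices(X_part, bin_limits=bin_limits)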


def bin_to_group_indices(bin_indices, mask):
    """Transforms bin_indices into group indices, skips empty bins

    :type bin_indices: numpy.array, for each event the index of the bin it belongs to, shape = [n_samples]
    :type mask: numpy.array, boolean mask of events to split into groups, shape = [n_samples]
    :rtype: list(numpy.array), each element is the indices of events in some bin
    """
    assert len(bin_indices) == len(mask), "Different length"
    bins_id = numpy.unique(bin_indices)
    result = list()
    for bin_id in bins_id:
        result.append(numpy.where(mask & (bin_indices == bin_id))[0])
    return result


def group_indices_to_groups_matrix(group_indices, n_events):
    """
    :param group_indices: list, each element corresponds to a group
        (element = list with indices of events belonging to the group)
    :return: sparse matrix of shape [n_groups, n_samples],
        1 if the particular event belongs to the particular group, 0 otherwise
    """
    from scipy import sparse
    groups_matrix = sparse.lil_matrix((len(group_indices), n_events))
    for group_id, events_in_group in enumerate(group_indices):
        groups_matrix[group_id, events_in_group] = 1
    return sparse.csr_matrix(groups_matrix)
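

# A minimal sketch (toy values, not part of the original module) tying the
# helpers above together: bin indices are turned into group indices for the
# events selected by a mask, and then into a sparse group matrix.
def _example_bins_to_groups():
    bin_indices = numpy.array([0, 0, 1, 2, 2, 4])
    # only masked events (here: all but the second event) form the groups
    mask = numpy.array([True, False, True, True, True, True])
    group_indices = bin_to_group_indices(bin_indices, mask=mask)
    # group_indices -> [array([0]), array([2]), array([3, 4]), array([5])]
    groups_matrix = group_indices_to_groups_matrix(group_indices, n_events=len(bin_indices))
    # groups_matrix has shape [n_groups=4, n_samples=6]
    return group_indices, groups_matrix
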
# endregion
# region Supplementary uniformity-related functions (to measure flatness of predictions)


def compute_cdf(ordered_weights):
    """Computes the cumulative distribution function (CDF) by ordered weights,
    be sure that sum(ordered_weights) == 1.
    Minor difference: using the symmetrized version
    F(x) = 1/2 (F(x - 0) + F(x + 0))
    """
    return numpy.cumsum(ordered_weights) - 0.5 * ordered_weights
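

# A short worked check of the symmetrized CDF (toy values, not part of the
# original module): with two equal weights the CDF is evaluated "in the
# middle" of each step.
def _example_compute_cdf():
    ordered_weights = numpy.array([0.5, 0.5])
    # cumsum = [0.5, 1.0]; symmetrized cdf = [0.5 - 0.25, 1.0 - 0.25] = [0.25, 0.75]
    return compute_cdf(ordered_weights)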


def compute_bin_weights(bin_indices, sample_weight):
    assert len(bin_indices) == len(sample_weight), 'Different lengths of array'
    result = numpy.bincount(bin_indices, weights=sample_weight)
    return result / numpy.sum(result)


def compute_divided_weight(group_matrix, sample_weight):
    """Divided weight takes into account that different events
    may be encountered a different number of times (in different groups)."""
    occurences = numpy.array(group_matrix.sum(axis=0)).flatten()
    return sample_weight / numpy.maximum(occurences, 1)


def compute_group_weights(group_matrix, sample_weight):
    """
    Group weight = sum of divided weights of indices inside that group.
    """
    divided_weight = compute_divided_weight(group_matrix=group_matrix, sample_weight=sample_weight)
    result = group_matrix.dot(divided_weight)
    return result / numpy.sum(result)
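

# A minimal sketch for divided and group weights (toy values, not part of the
# original module): an event shared by two groups contributes half of its
# weight to each of them.
def _example_group_weights():
    from scipy import sparse
    # two groups over three events; event 1 belongs to both groups
    groups_matrix = sparse.csr_matrix(numpy.array([[1, 1, 0],
                                                   [0, 1, 1]]))
    sample_weight = numpy.array([1., 1., 1.])
    divided = compute_divided_weight(groups_matrix, sample_weight)
    # divided -> [1.0, 0.5, 1.0]
    group_weights = compute_group_weights(groups_matrix, sample_weight)
    # group totals 1.5 and 1.5 are normalized to [0.5, 0.5]
    return divided, group_weights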


def compute_bin_efficiencies(y_score, bin_indices, cut, sample_weight, minlength=None):
    """Efficiency of a bin = total weight of (signal) events that passed the cut
    in the bin / total weight of signal events in the bin.
    Returns zero for empty bins."""
    y_score = column_or_1d(y_score)
    assert len(y_score) == len(sample_weight) == len(bin_indices), "different size"
    if minlength is None:
        minlength = numpy.max(bin_indices) + 1

    bin_total = numpy.bincount(bin_indices, weights=sample_weight, minlength=minlength)
    passed_cut = y_score > cut
    bin_passed_cut = numpy.bincount(bin_indices[passed_cut],
                                    weights=sample_weight[passed_cut], minlength=minlength)
    return bin_passed_cut / numpy.maximum(bin_total, 1)
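

# A minimal sketch for compute_bin_efficiencies (toy values, not part of the
# original module): in each bin, the efficiency is the weighted fraction of
# events passing the cut.
def _example_compute_bin_efficiencies():
    y_score = numpy.array([0.1, 0.9, 0.4, 0.8])
    bin_indices = numpy.array([0, 0, 1, 1])
    sample_weight = numpy.ones(4)
    # with cut=0.5, one of two events passes in each bin -> [0.5, 0.5]
    return compute_bin_efficiencies(y_score, bin_indices, cut=0.5, sample_weight=sample_weight)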


def compute_group_efficiencies_by_indices(y_score, groups_indices, cut, divided_weight=None, smoothing=0.0):
    """Provided the cut, computes efficiencies inside each group.

    :param divided_weight: weight for each event, divided by the number of its occurrences
    """
    y_score = column_or_1d(y_score)
    divided_weight = check_sample_weight(y_score, sample_weight=divided_weight)
    # with smoothing=0, this is 0 or 1, the latter for passed events.
    passed_cut = sigmoid_function(y_score - cut, width=smoothing)

    if isinstance(groups_indices, numpy.ndarray) and numpy.ndim(groups_indices) == 2:
        # this speedup is specially for knn
        result = numpy.average(numpy.take(passed_cut, groups_indices),
                               weights=numpy.take(divided_weight, groups_indices),
                               axis=1)
    else:
        result = numpy.zeros(len(groups_indices))
        for i, group in enumerate(groups_indices):
            result[i] = numpy.average(passed_cut[group], weights=divided_weight[group])
    return result
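

# A minimal sketch for compute_group_efficiencies_by_indices (toy values, not
# part of the original module): each group gets the weighted fraction of its
# events passing the cut; a 2D numpy array of indices (as produced for knn)
# takes the vectorized path.
def _example_group_efficiencies_by_indices():
    y_score = numpy.array([0.1, 0.9, 0.4, 0.8])
    # knn-style groups: each row lists the indices of one group's events
    groups_indices = numpy.array([[0, 1], [2, 3]])
    # with cut=0.5 and smoothing=0.0 -> efficiencies [0.5, 0.5]
    return compute_group_efficiencies_by_indices(y_score, groups_indices, cut=0.5)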


def compute_group_efficiencies(y_score, groups_matrix, cut, divided_weight=None, smoothing=0.0):
    """Provided the cut, computes efficiencies inside each group.

    :param divided_weight: weight for each event, divided by the number of its occurrences
    """
    y_score = column_or_1d(y_score)
    divided_weight = check_sample_weight(y_score, sample_weight=divided_weight)
    # with smoothing=0, this is 0 or 1, the latter for passed events.
    passed_cut = sigmoid_function(y_score - cut, width=smoothing)
    passed_weight = groups_matrix.dot(divided_weight * passed_cut)
    total_weight = groups_matrix.dot(divided_weight)
    return passed_weight / numpy.maximum(total_weight, 1e-10)


def weighted_deviation(a, weights, power=2.):
    """Weighted mean of |x - x_mean|^power, measures deviation from the mean"""
    mean = numpy.average(a, weights=weights)
    return numpy.average(numpy.abs(mean - a) ** power, weights=weights)
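

# A short worked check of weighted_deviation (toy values, not part of the
# original module): with equal weights and power=2 this is simply the variance.
def _example_weighted_deviation():
    a = numpy.array([0., 2.])
    weights = numpy.array([1., 1.])
    # mean = 1, so the weighted mean of |x - mean|^2 is 1.0
    return weighted_deviation(a, weights, power=2.)
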
# endregion
# region Special methods for uniformity metrics


def theil(x, weights):
    """Theil index of an array, with regularization"""
    assert numpy.all(x >= 0), "negative numbers can't be used in Theil"
    x_mean = numpy.average(x, weights=weights)
    normed = x / x_mean
    # to avoid problems with the logarithm of zero.
    normed[normed < 1e-20] = 1e-20
    return numpy.average(normed * numpy.log(normed), weights=weights)
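

# A short worked check of the Theil index (toy values, not part of the
# original module): it is zero for a perfectly uniform distribution and
# positive otherwise.
def _example_theil():
    weights = numpy.array([0.5, 0.5])
    uniform = theil(numpy.array([1., 1.]), weights)       # -> 0.0
    concentrated = theil(numpy.array([0., 2.]), weights)  # -> about 0.69, positive
    return uniform, concentrated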


def _ks_2samp_fast(prepared_data1, data2, prepared_weights1, weights2, cdf1):
    """Pay attention - prepared data should not only be sorted,
    but equal items should be merged (by summing weights),
    data2 should not have elements larger than max(prepared_data1)"""
    indices = numpy.searchsorted(prepared_data1, data2)
    weights2 /= numpy.sum(weights2)
    prepared_weights2 = numpy.bincount(indices, weights=weights2, minlength=len(prepared_data1))
    cdf2 = compute_cdf(prepared_weights2)
    return numpy.max(numpy.abs(cdf1 - cdf2))


def ks_2samp_weighted(data1, data2, weights1, weights2):
    """Kolmogorov-Smirnov distance, almost the same as ks_2samp from scipy.stats, but this version supports weights.

    :param data1: array-like of shape [n_samples1]
    :param data2: array-like of shape [n_samples2]
    :param weights1: array-like of shape [n_samples1]
    :param weights2: array-like of shape [n_samples2]
    :return: float, Kolmogorov-Smirnov distance.
    """
    x = numpy.unique(numpy.concatenate([data1, data2]))
    weights1 = weights1 / numpy.sum(weights1)
    weights2 = weights2 / numpy.sum(weights2)
    inds1 = numpy.searchsorted(x, data1)
    inds2 = numpy.searchsorted(x, data2)
    w1 = numpy.bincount(inds1, weights=weights1, minlength=len(x))
    w2 = numpy.bincount(inds2, weights=weights2, minlength=len(x))
    F1 = compute_cdf(w1)
    F2 = compute_cdf(w2)
    return numpy.max(numpy.abs(F1 - F2))
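

# A minimal sketch for ks_2samp_weighted (toy values, not part of the original
# module): identical samples with identical weights give distance 0, while
# well-separated samples give a distance close to 1.
def _example_ks_2samp_weighted():
    data = numpy.array([1., 2., 3.])
    weights = numpy.array([1., 1., 1.])
    same = ks_2samp_weighted(data, data, weights, weights)       # -> 0.0
    far = ks_2samp_weighted(data, data + 10., weights, weights)  # -> 5/6 (not 1, due to the symmetrized CDF)
    return same, far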


def _cvm_2samp_fast(prepared_data1, data2, prepared_weights1, weights2, cdf1, power=2.):
    """Pay attention - prepared data should not only be sorted,
    but equal items should be merged (by summing weights)"""
    indices = numpy.searchsorted(prepared_data1, data2)
    weights2 /= numpy.sum(weights2)
    prepared_weights2 = numpy.bincount(indices, weights=weights2, minlength=len(prepared_data1))
    cdf2 = compute_cdf(prepared_weights2)
    return numpy.average(numpy.abs(cdf1 - cdf2) ** power, weights=prepared_weights1)
# endregion