Skip to content

Commit

Permalink
ENH: Patches Nearest Centroid for metric=manhattan for sparse and den…
Browse files Browse the repository at this point in the history
…se data
  • Loading branch information
MechCoder committed Oct 21, 2014
1 parent 535d1f6 commit 6f41bd1
Show file tree
Hide file tree
Showing 4 changed files with 129 additions and 6 deletions.
21 changes: 16 additions & 5 deletions sklearn/neighbors/nearest_centroid.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from ..metrics.pairwise import pairwise_distances
from ..preprocessing import LabelEncoder
from ..utils.validation import check_array, check_X_y
from ..utils.sparsefuncs import csc_row_median


class NearestCentroid(BaseEstimator, ClassifierMixin):
Expand Down Expand Up @@ -86,8 +87,9 @@ def fit(self, X, y):
y : array, shape = [n_samples]
Target values (integers)
"""
X, y = check_X_y(X, y, ['csr', 'csc'])
if sp.issparse(X) and self.shrink_threshold:
X, y = check_X_y(X, y, ['csc'])
X_sparse = sp.issparse(X)
if X_sparse and self.shrink_threshold:
raise ValueError("threshold shrinking not supported"
" for sparse input")

Expand All @@ -100,16 +102,25 @@ def fit(self, X, y):
raise ValueError('y has less than 2 classes')

# Mask mapping each class to it's members.
self.centroids_ = np.empty((n_classes, n_features), dtype=np.float64)
self.centroids_ = np.zeros((n_classes, n_features), dtype=np.float64)
# Number of clusters in each class.
nk = np.zeros(n_classes)

for cur_class in y_ind:
center_mask = y_ind == cur_class
nk[cur_class] = np.sum(center_mask)
if sp.issparse(X):
if X_sparse:
center_mask = np.where(center_mask)[0]
self.centroids_[cur_class] = X[center_mask].mean(axis=0)

# XXX: Update other averaging methods according to the metrics.
if self.metric == "manhattan":
# NumPy does not calculate median of sparse matrices.
if not X_sparse:
self.centroids_[cur_class] = np.median(X[center_mask], axis=0)
else:
self.centroids_[cur_class] = csc_row_median(X[center_mask])
else:
self.centroids_[cur_class] = X[center_mask].mean(axis=0)

if self.shrink_threshold:
dataset_centroid_ = np.mean(X, axis=0)
Expand Down
11 changes: 11 additions & 0 deletions sklearn/neighbors/tests/test_nearest_centroid.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,17 @@ def test_predict_translated_data():
assert_array_equal(y_init, y_translate)


def test_manhattan_metric():
"""Test the manhattan metric."""

clf = NearestCentroid(metric='manhattan')
clf.fit(X, y)
dense_centroid = clf.centroids_
clf.fit(X_csr, y)
assert_array_equal(clf.centroids_, dense_centroid)
assert_array_equal(dense_centroid, [[-1, -1], [1, 1]])


if __name__ == "__main__":
import nose
nose.runmodule()
71 changes: 71 additions & 0 deletions sklearn/utils/sparsefuncs.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
# License: BSD 3 clause
import scipy.sparse as sp
import numpy as np
import warnings

from .fixes import sparse_min_max
from .sparsefuncs_fast import csr_mean_variance_axis0 as _csr_mean_var_axis0
Expand Down Expand Up @@ -342,3 +343,73 @@ def count_nonzero(X, axis=None, sample_weight=None):
weights=weights)
else:
raise ValueError('Unsupported axis: {0}'.format(axis))


def csc_row_median(csc):
"""
Find the median across axis 0 of a CSC matrix.
Equivalent to doing np.median(X, axis=0)
Parameters
----------
csc : CSC sparse matrix, shape (n_samples, n_features)
Input data.
Returns
-------
median : ndarray, shape = (n_features,)
Median.
"""
if not isinstance(csc, sp.csc_matrix):
warnings.warn("Non CSC matix passed. Will convert to CSC format.")
csc = sp.csc_matrix(csc)

indptr = csc.indptr
n_samples, n_features = csc.shape

# Highly likely that the median of a sparse matrix is zero.
# Remains zero if the if/else conditions are not checked below.
median = np.zeros(n_features)

for f_ind, ptr in enumerate(indptr[:-1]):
sorted_nonzero = np.sort(csc.data[ptr: indptr[f_ind + 1]])
nz = n_samples - sorted_nonzero.size
zero_ind = np.searchsorted(sorted_nonzero, 0)
neg_idx = sorted_nonzero[: zero_ind]
pos_idx = sorted_nonzero[zero_ind: ]
odd = n_samples % 2
mid_ind = n_samples // 2

if odd:
# Number of negative terms is greater then (n_features + 1) / 2
# which implies the median is negative.
if zero_ind > mid_ind:
median[f_ind] = sorted_nonzero[mid_ind]

# The sum of the negative terms and the number of zeros is less
# than the (n_features + 1) / 2 which implies the median is positive.
elif zero_ind + nz <= mid_ind:
median[f_ind] = pos_idx[mid_ind - nz - neg_idx.size]

else:
# The first two conditions are highly unlikely.
# When the n_features / 2 is the last negative term and
# (n_features / 2) + 1 is zero.
if zero_ind == mid_ind:
median[f_ind] = neg_idx[-1] / 2.

# When the n_features / 2 is zero and (n_features / 2) + 1
# is the first positive term.
elif neg_idx.size + nz == mid_ind:
median[f_ind] = pos_idx[0] / 2.

# Same comments as for the odd case.
elif zero_ind > mid_ind:
median[f_ind] = (sorted_nonzero[mid_ind - 1] +
sorted_nonzero[mid_ind]) / 2.
elif zero_ind + nz < mid_ind:
npz = mid_ind - nz - neg_idx.size
median[f_ind] = (pos_idx[npz - 1] + pos_idx[npz]) / 2.

return median
32 changes: 31 additions & 1 deletion sklearn/utils/tests/test_sparsefuncs.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
inplace_row_scale,
inplace_swap_row, inplace_swap_column,
min_max_axis,
count_nonzero)
count_nonzero, csc_row_median)
from sklearn.utils.sparsefuncs_fast import assign_rows_csr
from sklearn.utils.testing import assert_raises

Expand Down Expand Up @@ -359,3 +359,33 @@ def test_count_nonzero():

assert_raises(TypeError, count_nonzero, X_csc)
assert_raises(ValueError, count_nonzero, X_csr, axis=2)


def test_csc_row_median():
"""Test csc_row_median actually calculates the median."""

# Test that it gives the same output when X is dense.
rng = np.random.RandomState(0)
X = rng.rand(100, 50)
dense_median = np.median(X, axis=0)
csc = sp.csc_matrix(X)
sparse_median = csc_row_median(csc)
assert_array_equal(sparse_median, dense_median)

# Test that it gives the same output when X is sparse
X = rng.rand(51, 100)
X[X < 0.7] = 0.0
ind = rng.randint(0, 50, 10)
X[ind] = -X[ind]
csc = sp.csc_matrix(X)
dense_median = np.median(X, axis=0)
sparse_median = csc_row_median(csc)
assert_array_equal(sparse_median, dense_median)

# Test for toy data.
X = [[0, -2], [-1, -1], [1, 0], [2, 1]]
csc = sp.csc_matrix(X)
assert_array_equal(csc_row_median(csc), np.array([0.5, -0.5]))
X = [[0, -2], [-1, -5], [1, -3]]
csc = sp.csc_matrix(X)
assert_array_equal(csc_row_median(csc), np.array([0., -3]))

0 comments on commit 6f41bd1

Please sign in to comment.