ENH: Patches Nearest Centroid for metric=manhattan for sparse and den…

…se data
TamChen · Oct 21, 2014 · 6f41bd1 · 6f41bd1
1 parent 535d1f6
commit 6f41bd1
Show file tree

Hide file tree

Showing 4 changed files with 129 additions and 6 deletions.
diff --git a/sklearn/neighbors/nearest_centroid.py b/sklearn/neighbors/nearest_centroid.py
@@ -16,6 +16,7 @@
 from ..metrics.pairwise import pairwise_distances
 from ..preprocessing import LabelEncoder
 from ..utils.validation import check_array, check_X_y
+from ..utils.sparsefuncs import csc_row_median
 
 
 class NearestCentroid(BaseEstimator, ClassifierMixin):
@@ -86,8 +87,9 @@ def fit(self, X, y):
         y : array, shape = [n_samples]
             Target values (integers)
         """
-        X, y = check_X_y(X, y, ['csr', 'csc'])
-        if sp.issparse(X) and self.shrink_threshold:
+        X, y = check_X_y(X, y, ['csc'])
+        X_sparse = sp.issparse(X)
+        if X_sparse and self.shrink_threshold:
             raise ValueError("threshold shrinking not supported"
                              " for sparse input")
 
@@ -100,16 +102,25 @@ def fit(self, X, y):
             raise ValueError('y has less than 2 classes')
 
         # Mask mapping each class to it's members.
-        self.centroids_ = np.empty((n_classes, n_features), dtype=np.float64)
+        self.centroids_ = np.zeros((n_classes, n_features), dtype=np.float64)
         # Number of clusters in each class.
         nk = np.zeros(n_classes)
 
         for cur_class in y_ind:
             center_mask = y_ind == cur_class
             nk[cur_class] = np.sum(center_mask)
-            if sp.issparse(X):
+            if X_sparse:
                 center_mask = np.where(center_mask)[0]
-            self.centroids_[cur_class] = X[center_mask].mean(axis=0)
+
+            # XXX: Update other averaging methods according to the metrics.
+            if self.metric == "manhattan":
+                # NumPy does not calculate median of sparse matrices.
+                if not X_sparse:
+                    self.centroids_[cur_class] = np.median(X[center_mask], axis=0)
+                else:
+                    self.centroids_[cur_class] = csc_row_median(X[center_mask])
+            else:
+                self.centroids_[cur_class] = X[center_mask].mean(axis=0)
 
         if self.shrink_threshold:
             dataset_centroid_ = np.mean(X, axis=0)

diff --git a/sklearn/neighbors/tests/test_nearest_centroid.py b/sklearn/neighbors/tests/test_nearest_centroid.py
@@ -125,6 +125,17 @@ def test_predict_translated_data():
     assert_array_equal(y_init, y_translate)
 
 
+def test_manhattan_metric():
+    """Test the manhattan metric."""
+
+    clf = NearestCentroid(metric='manhattan')
+    clf.fit(X, y)
+    dense_centroid = clf.centroids_
+    clf.fit(X_csr, y)
+    assert_array_equal(clf.centroids_, dense_centroid)
+    assert_array_equal(dense_centroid, [[-1, -1], [1, 1]])
+
+
 if __name__ == "__main__":
     import nose
     nose.runmodule()
diff --git a/sklearn/utils/sparsefuncs.py b/sklearn/utils/sparsefuncs.py
@@ -4,6 +4,7 @@
 # License: BSD 3 clause
 import scipy.sparse as sp
 import numpy as np
+import warnings
 
 from .fixes import sparse_min_max
 from .sparsefuncs_fast import csr_mean_variance_axis0 as _csr_mean_var_axis0
@@ -342,3 +343,73 @@ def count_nonzero(X, axis=None, sample_weight=None):
                                weights=weights)
     else:
         raise ValueError('Unsupported axis: {0}'.format(axis))
+
+
+def csc_row_median(csc):
+    """
+    Find the median across axis 0 of a CSC matrix.
+    Equivalent to doing np.median(X, axis=0)
+
+    Parameters
+    ----------
+    csc : CSC sparse matrix, shape (n_samples, n_features)
+        Input data.
+
+    Returns
+    -------
+    median : ndarray, shape = (n_features,)
+        Median. 
+
+    """
+    if not isinstance(csc, sp.csc_matrix):
+        warnings.warn("Non CSC matix passed. Will convert to CSC format.")
+        csc = sp.csc_matrix(csc)
+
+    indptr = csc.indptr
+    n_samples, n_features = csc.shape
+
+    # Highly likely that the median of a sparse matrix is zero.
+    # Remains zero if the if/else conditions are not checked below.
+    median = np.zeros(n_features)
+
+    for f_ind, ptr in enumerate(indptr[:-1]):
+        sorted_nonzero = np.sort(csc.data[ptr: indptr[f_ind + 1]])
+        nz = n_samples - sorted_nonzero.size
+        zero_ind = np.searchsorted(sorted_nonzero, 0)
+        neg_idx = sorted_nonzero[: zero_ind]
+        pos_idx = sorted_nonzero[zero_ind: ]
+        odd = n_samples % 2
+        mid_ind = n_samples // 2
+
+        if odd:
+            # Number of negative terms is greater then (n_features + 1) / 2
+            # which implies the median is negative.
+            if zero_ind > mid_ind:
+                median[f_ind] = sorted_nonzero[mid_ind]
+
+            # The sum of the negative terms and the number of zeros is less
+            # than the (n_features + 1) / 2 which implies the median is positive.
+            elif zero_ind + nz <= mid_ind:
+                median[f_ind] = pos_idx[mid_ind - nz - neg_idx.size]
+
+        else:
+            # The first two conditions are highly unlikely.
+            # When the n_features / 2 is the last negative term and
+            # (n_features / 2) + 1 is zero.
+            if zero_ind == mid_ind:
+                median[f_ind] = neg_idx[-1] / 2.
+
+            # When the n_features / 2 is zero and (n_features / 2) + 1
+            # is the first positive term.
+            elif neg_idx.size + nz == mid_ind:
+                median[f_ind] = pos_idx[0] / 2.
+
+            # Same comments as for the odd case.
+            elif zero_ind > mid_ind:
+                median[f_ind] = (sorted_nonzero[mid_ind - 1] +
+                                 sorted_nonzero[mid_ind]) / 2.
+            elif zero_ind + nz < mid_ind:
+                npz = mid_ind - nz - neg_idx.size
+                median[f_ind] = (pos_idx[npz - 1] + pos_idx[npz]) / 2.
+
+    return median
diff --git a/sklearn/utils/tests/test_sparsefuncs.py b/sklearn/utils/tests/test_sparsefuncs.py
@@ -10,7 +10,7 @@
                                        inplace_row_scale,
                                        inplace_swap_row, inplace_swap_column,
                                        min_max_axis,
-                                       count_nonzero)
+                                       count_nonzero, csc_row_median)
 from sklearn.utils.sparsefuncs_fast import assign_rows_csr
 from sklearn.utils.testing import assert_raises
 
@@ -359,3 +359,33 @@ def test_count_nonzero():
 
     assert_raises(TypeError, count_nonzero, X_csc)
     assert_raises(ValueError, count_nonzero, X_csr, axis=2)
+
+
+def test_csc_row_median():
+    """Test csc_row_median actually calculates the median."""
+
+    # Test that it gives the same output when X is dense.
+    rng = np.random.RandomState(0)
+    X = rng.rand(100, 50)
+    dense_median = np.median(X, axis=0)
+    csc = sp.csc_matrix(X)
+    sparse_median = csc_row_median(csc)
+    assert_array_equal(sparse_median, dense_median)
+
+    # Test that it gives the same output when X is sparse
+    X = rng.rand(51, 100)
+    X[X < 0.7] = 0.0
+    ind = rng.randint(0, 50, 10)
+    X[ind] = -X[ind]
+    csc = sp.csc_matrix(X)
+    dense_median = np.median(X, axis=0)
+    sparse_median = csc_row_median(csc)
+    assert_array_equal(sparse_median, dense_median)
+
+    # Test for toy data.
+    X = [[0, -2], [-1, -1], [1, 0], [2, 1]]
+    csc = sp.csc_matrix(X)
+    assert_array_equal(csc_row_median(csc), np.array([0.5, -0.5]))
+    X = [[0, -2], [-1, -5], [1, -3]]
+    csc = sp.csc_matrix(X)
+    assert_array_equal(csc_row_median(csc), np.array([0., -3]))