"""k-means-constrained"""
# Authors: Josh Levy-Kramer <josh@levykramer.co.uk>
# Gael Varoquaux <gael.varoquaux@normalesup.org>
# Thomas Rueckstiess <ruecksti@in.tum.de>
# James Bergstra <james.bergstra@umontreal.ca>
# Jan Schlueter <scikit-learn@jan-schlueter.de>
# Nelle Varoquaux
# Peter Prettenhofer <peter.prettenhofer@gmail.com>
# Olivier Grisel <olivier.grisel@ensta.org>
# Mathieu Blondel <mathieu@mblondel.org>
# Robert Layton <robertlayton@gmail.com>
# License: BSD 3 clause
import warnings
import numpy as np
import scipy.sparse as sp
from .sklearn_import.metrics.pairwise import euclidean_distances
from .sklearn_import.utils.extmath import row_norms, squared_norm, cartesian
from .sklearn_import.utils.validation import check_array, check_random_state, as_float_array, check_is_fitted
from joblib import Parallel
from joblib import delayed
# Internal scikit-learn methods imported into this project
from k_means_constrained.sklearn_import.cluster._k_means import _centers_dense, _centers_sparse
from k_means_constrained.sklearn_import.cluster.k_means_ import _validate_center_shape, _tolerance, KMeans, \
_init_centroids
from k_means_constrained.mincostflow_vectorized import SimpleMinCostFlowVectorized
def k_means_constrained(X, n_clusters, size_min=None, size_max=None, init='k-means++',
n_init=10, max_iter=300, verbose=False,
tol=1e-4, random_state=None, copy_x=True, n_jobs=1,
return_n_iter=False):
"""K-Means clustering with minimum and maximum cluster size constraints.
Read more in the :ref:`User Guide <k_means>`.
Parameters
----------
X : array-like, shape (n_samples, n_features)
The observations to cluster.
    n_clusters : int
        The number of clusters to form as well as the number of
        centroids to generate.
    size_min : int, optional, default: None
        Constrain the label assignment so that each cluster has a minimum
        size of size_min. If None, no constraints will be applied.
    size_max : int, optional, default: None
        Constrain the label assignment so that each cluster has a maximum
        size of size_max. If None, no constraints will be applied.
init : {'k-means++', 'random', or ndarray, or a callable}, optional
        Method for initialization, defaults to 'k-means++':
        'k-means++' : selects initial cluster centers for k-means
        clustering in a smart way to speed up convergence. See section
        Notes in k_init for more details.
        'random': choose k observations (rows) at random from data for
        the initial centroids.
        If an ndarray is passed, it should be of shape (n_clusters, n_features)
        and gives the initial centers.
        If a callable is passed, it should take arguments X, k and
        a random state and return an initialization.
n_init : int, optional, default: 10
        Number of times the k-means algorithm will be run with different
centroid seeds. The final results will be the best output of
n_init consecutive runs in terms of inertia.
max_iter : int, optional, default 300
Maximum number of iterations of the k-means algorithm to run.
verbose : boolean, optional
Verbosity mode.
tol : float, optional
        Relative tolerance with regard to the Frobenius norm of the difference
in the cluster centers of two consecutive iterations to declare
convergence.
random_state : int, RandomState instance or None, optional, default: None
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used
by `np.random`.
copy_x : boolean, optional
When pre-computing distances it is more numerically accurate to center
the data first. If copy_x is True, then the original data is not
modified. If False, the original data is modified, and put back before
the function returns, but small numerical differences may be introduced
by subtracting and then adding the data mean.
n_jobs : int
The number of jobs to use for the computation. This works by computing
each of the n_init runs in parallel.
If -1 all CPUs are used. If 1 is given, no parallel computing code is
used at all, which is useful for debugging. For n_jobs below -1,
(n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one
are used.
return_n_iter : bool, optional
Whether or not to return the number of iterations.
Returns
-------
centroid : float ndarray with shape (k, n_features)
Centroids found at the last iteration of k-means.
label : integer ndarray with shape (n_samples,)
label[i] is the code or index of the centroid the
i'th observation is closest to.
inertia : float
The final value of the inertia criterion (sum of squared distances to
the closest centroid for all observations in the training set).
best_n_iter : int
Number of iterations corresponding to the best results.
Returned only if `return_n_iter` is set to True.
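    Examples
    --------
    A minimal illustrative run (a sketch: the exact centroids and labelling
    depend on the data and the random seed):

    >>> import numpy as np
    >>> X = np.array([[1, 2], [1, 4], [1, 0],
    ...               [4, 2], [4, 4], [4, 0]])
    >>> centers, labels, inertia = k_means_constrained(
    ...     X, n_clusters=2, size_min=2, size_max=4, random_state=0)
    >>> labels.shape
    (6,)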
"""
if sp.issparse(X):
raise NotImplementedError("Not implemented for sparse X")
if n_init <= 0:
raise ValueError("Invalid number of initializations."
" n_init=%d must be bigger than zero." % n_init)
random_state = check_random_state(random_state)
if max_iter <= 0:
raise ValueError('Number of iterations should be a positive number,'
' got %d instead' % max_iter)
X = as_float_array(X, copy=copy_x)
tol = _tolerance(X, tol)
# Validate init array
if hasattr(init, '__array__'):
init = check_array(init, dtype=X.dtype.type, copy=True)
_validate_center_shape(X, n_clusters, init)
if n_init != 1:
warnings.warn(
'Explicit initial center position passed: '
'performing only one init in k-means instead of n_init=%d'
% n_init, RuntimeWarning, stacklevel=2)
n_init = 1
    # Subtract mean of X for more accurate distance computations
if not sp.issparse(X):
X_mean = X.mean(axis=0)
# The copy was already done above
X -= X_mean
if hasattr(init, '__array__'):
init -= X_mean
# precompute squared norms of data points
x_squared_norms = row_norms(X, squared=True)
best_labels, best_inertia, best_centers = None, None, None
if n_jobs == 1:
# For a single thread, less memory is needed if we just store one set
# of the best results (as opposed to one set per run per thread).
for it in range(n_init):
# run a k-means once
labels, inertia, centers, n_iter_ = kmeans_constrained_single(
X, n_clusters,
size_min=size_min, size_max=size_max,
max_iter=max_iter, init=init, verbose=verbose, tol=tol,
x_squared_norms=x_squared_norms, random_state=random_state)
# determine if these results are the best so far
if best_inertia is None or inertia < best_inertia:
best_labels = labels.copy()
best_centers = centers.copy()
best_inertia = inertia
best_n_iter = n_iter_
else:
# parallelisation of k-means runs
seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
results = Parallel(n_jobs=n_jobs, verbose=0)(
delayed(kmeans_constrained_single)(X, n_clusters,
size_min=size_min, size_max=size_max,
max_iter=max_iter, init=init,
verbose=verbose, tol=tol,
x_squared_norms=x_squared_norms,
# Change seed to ensure variety
random_state=seed)
for seed in seeds)
# Get results with the lowest inertia
labels, inertia, centers, n_iters = zip(*results)
best = np.argmin(inertia)
best_labels = labels[best]
best_inertia = inertia[best]
best_centers = centers[best]
best_n_iter = n_iters[best]
if not sp.issparse(X):
if not copy_x:
X += X_mean
best_centers += X_mean
if return_n_iter:
return best_centers, best_labels, best_inertia, best_n_iter
else:
return best_centers, best_labels, best_inertia
def kmeans_constrained_single(X, n_clusters, size_min=None, size_max=None,
max_iter=300, init='k-means++',
verbose=False, x_squared_norms=None,
random_state=None, tol=1e-4):
"""A single run of k-means constrained, assumes preparation completed prior.
Parameters
----------
X : array-like of floats, shape (n_samples, n_features)
The observations to cluster.
    n_clusters : int
        The number of clusters to form as well as the number of
        centroids to generate.
    size_min : int, optional, default: None
        Constrain the label assignment so that each cluster has a minimum
        size of size_min. If None, no constraints will be applied.
    size_max : int, optional, default: None
        Constrain the label assignment so that each cluster has a maximum
        size of size_max. If None, no constraints will be applied.
max_iter : int, optional, default 300
Maximum number of iterations of the k-means algorithm to run.
init : {'k-means++', 'random', or ndarray, or a callable}, optional
        Method for initialization, defaults to 'k-means++':
        'k-means++' : selects initial cluster centers for k-means
        clustering in a smart way to speed up convergence. See section
        Notes in k_init for more details.
        'random': choose k observations (rows) at random from data for
        the initial centroids.
        If an ndarray is passed, it should be of shape (n_clusters, n_features)
        and gives the initial centers.
        If a callable is passed, it should take arguments X, k and
        a random state and return an initialization.
tol : float, optional
        Relative tolerance with regard to the Frobenius norm of the difference
in the cluster centers of two consecutive iterations to declare
convergence.
verbose : boolean, optional
Verbosity mode
x_squared_norms : array
Precomputed x_squared_norms.
random_state : int, RandomState instance or None, optional, default: None
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used
by `np.random`.
Returns
-------
centroid : float ndarray with shape (k, n_features)
Centroids found at the last iteration of k-means.
label : integer ndarray with shape (n_samples,)
label[i] is the code or index of the centroid the
i'th observation is closest to.
inertia : float
The final value of the inertia criterion (sum of squared distances to
the closest centroid for all observations in the training set).
n_iter : int
Number of iterations run.
"""
if sp.issparse(X):
raise NotImplementedError("Not implemented for sparse X")
random_state = check_random_state(random_state)
n_samples = X.shape[0]
best_labels, best_inertia, best_centers = None, None, None
# init
centers = _init_centroids(X, n_clusters, init, random_state=random_state, x_squared_norms=x_squared_norms)
if verbose:
print("Initialization complete")
    # Allocate memory to store the distances for each sample to its
    # closest center, for reallocation in case of ties
distances = np.zeros(shape=(n_samples,), dtype=X.dtype)
    # Determine min and max sizes if none given
if size_min is None:
size_min = 0
if size_max is None:
size_max = n_samples # Number of data points
# Check size min and max
if not ((size_min >= 0) and (size_min <= n_samples)
and (size_max >= 0) and (size_max <= n_samples)):
raise ValueError("size_min and size_max must be a positive number smaller "
"than the number of data points or `None`")
if size_max < size_min:
raise ValueError("size_max must be larger than size_min")
if size_min * n_clusters > n_samples:
raise ValueError("The product of size_min and n_clusters cannot exceed the number of samples (X)")
if size_max * n_clusters < n_samples:
raise ValueError("The product of size_max and n_clusters must be larger than or equal the number of samples (X)")
# iterations
for i in range(max_iter):
centers_old = centers.copy()
# labels assignment is also called the E-step of EM
labels, inertia = \
_labels_constrained(X, centers, size_min, size_max, distances=distances)
# computation of the means is also called the M-step of EM
if sp.issparse(X):
centers = _centers_sparse(X, labels, n_clusters, distances)
else:
centers = _centers_dense(X, labels, n_clusters, distances)
if verbose:
print("Iteration %2d, inertia %.3f" % (i, inertia))
if best_inertia is None or inertia < best_inertia:
best_labels = labels.copy()
best_centers = centers.copy()
best_inertia = inertia
center_shift_total = squared_norm(centers_old - centers)
if center_shift_total <= tol:
if verbose:
print("Converged at iteration %d: "
"center shift %e within tolerance %e"
% (i, center_shift_total, tol))
break
if center_shift_total > 0:
# rerun E-step in case of non-convergence so that predicted labels
# match cluster centers
best_labels, best_inertia = \
_labels_constrained(X, centers, size_min, size_max, distances=distances)
return best_labels, best_inertia, best_centers, i + 1
def _labels_constrained(X, centers, size_min, size_max, distances):
"""Compute labels using the min and max cluster size constraint
This will overwrite the 'distances' array in-place.
Parameters
----------
    X : numpy array, shape (n_samples, n_features)
        Input data.
    centers : numpy array, shape (n_clusters, n_features)
        Cluster centers to which the data is assigned.
    size_min : int
        Minimum size for each cluster.
    size_max : int
        Maximum size for each cluster.
distances : numpy array, shape (n_samples,)
Pre-allocated array in which distances are stored.
Returns
-------
    labels : numpy array, dtype=np.int32, shape (n_samples,)
Indices of clusters that samples are assigned to.
inertia : float
Sum of squared distances of samples to their closest cluster center.
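    Examples
    --------
    A minimal sketch of direct use (this helper is normally called internally
    by `kmeans_constrained_single`; it assumes `distances` is pre-allocated):

    >>> import numpy as np
    >>> X = np.array([[0.0, 0.0], [0.0, 1.0], [10.0, 0.0], [10.0, 1.0]])
    >>> centers = np.array([[0.0, 0.5], [10.0, 0.5]])
    >>> distances = np.zeros(X.shape[0], dtype=X.dtype)
    >>> labels, inertia = _labels_constrained(X, centers, size_min=2,
    ...                                       size_max=2, distances=distances)
    >>> np.bincount(labels).tolist()
    [2, 2]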
"""
C = centers
    # Distances to each centre in C (the `distances` argument holds the distance to the closest centre)
    # The original k-means uses squared distances, but this is equivalent for constrained k-means
D = euclidean_distances(X, C, squared=False)
edges, costs, capacities, supplies, n_C, n_X = minimum_cost_flow_problem_graph(X, C, D, size_min, size_max)
labels = solve_min_cost_flow_graph(edges, costs, capacities, supplies, n_C, n_X)
# cython k-means M step code assumes int32 inputs
labels = labels.astype(np.int32)
# Change distances in-place
distances[:] = D[np.arange(D.shape[0]), labels] ** 2 # Square for M step of EM
inertia = distances.sum()
return labels, inertia
def minimum_cost_flow_problem_graph(X, C, D, size_min, size_max):
# Setup minimum cost flow formulation graph
    # Vertex indices:
    # X-nodes: [0, n(X)-1], C-dummy nodes: [n(X), n(X)+n(C)-1],
    # C-nodes: [n(X)+n(C), n(X)+2*n(C)-1], artificial node: n(X)+2*n(C)
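    # For example, with n(X) = 4 samples and n(C) = 2 clusters the node ids are:
    #   X: 0-3, C-dummy: 4-5, C: 6-7, artificial: 8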
# Create indices of nodes
n_X = X.shape[0]
n_C = C.shape[0]
X_ix = np.arange(n_X)
C_dummy_ix = np.arange(X_ix[-1] + 1, X_ix[-1] + 1 + n_C)
C_ix = np.arange(C_dummy_ix[-1] + 1, C_dummy_ix[-1] + 1 + n_C)
art_ix = C_ix[-1] + 1
# Edges
edges_X_C_dummy = cartesian([X_ix, C_dummy_ix]) # All X's connect to all C dummy nodes (C')
edges_C_dummy_C = np.stack([C_dummy_ix, C_ix], axis=1) # Each C' connects to a corresponding C (centroid)
edges_C_art = np.stack([C_ix, art_ix * np.ones(n_C)], axis=1) # All C connect to artificial node
edges = np.concatenate([edges_X_C_dummy, edges_C_dummy_C, edges_C_art])
# Costs
costs_X_C_dummy = D.reshape(D.size)
costs = np.concatenate([costs_X_C_dummy, np.zeros(edges.shape[0] - len(costs_X_C_dummy))])
    # Capacities - the maximum cluster size is enforced on the C-dummy -> C edges
capacities_C_dummy_C = size_max * np.ones(n_C)
    cap_non = n_X  # Equal to the total supply, and therefore won't restrict flow
capacities = np.concatenate([
np.ones(edges_X_C_dummy.shape[0]),
capacities_C_dummy_C,
cap_non * np.ones(n_C)
])
# Sources and sinks
supplies_X = np.ones(n_X)
supplies_C = -1 * size_min * np.ones(n_C) # Demand node
supplies_art = -1 * (n_X - n_C * size_min) # Demand node
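    # e.g. with n_X = 4, n_C = 2 and size_min = 2: supplies_art = -(4 - 2*2) = 0;
    # the artificial node absorbs any supply not claimed by the size_min demands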
supplies = np.concatenate([
supplies_X,
np.zeros(n_C), # C_dummies
supplies_C,
[supplies_art]
])
# All arrays must be of int dtype for `SimpleMinCostFlow`
edges = edges.astype('int32')
    costs = np.around(costs * 1000, 0).astype('int32')  # Multiply by 1000 to retain precision when rounding to int
capacities = capacities.astype('int32')
supplies = supplies.astype('int32')
return edges, costs, capacities, supplies, n_C, n_X
def solve_min_cost_flow_graph(edges, costs, capacities, supplies, n_C, n_X):
# Instantiate a SimpleMinCostFlow solver.
min_cost_flow = SimpleMinCostFlowVectorized()
if (edges.dtype != 'int32') or (costs.dtype != 'int32') \
or (capacities.dtype != 'int32') or (supplies.dtype != 'int32'):
raise ValueError("`edges`, `costs`, `capacities`, `supplies` must all be int dtype")
N_edges = edges.shape[0]
N_nodes = len(supplies)
# Add each edge with associated capacities and cost
min_cost_flow.AddArcWithCapacityAndUnitCostVectorized(edges[:, 0], edges[:, 1], capacities, costs)
# Add node supplies
min_cost_flow.SetNodeSupplyVectorized(np.arange(N_nodes, dtype='int32'), supplies)
    # Solve the minimum cost flow problem
if min_cost_flow.Solve() != min_cost_flow.OPTIMAL:
raise Exception('There was an issue with the min cost flow input.')
# Assignment
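    # Each X-node sends exactly one unit of flow to a single C-dummy node, so
    # the flow matrix over the X -> C-dummy arcs is one-hot per row and argmax
    # recovers the assigned cluster index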
labels_M = min_cost_flow.FlowVectorized(np.arange(n_X * n_C, dtype='int32')).reshape(n_X, n_C)
labels = labels_M.argmax(axis=1)
return labels
class KMeansConstrained(KMeans):
"""K-Means clustering with minimum and maximum cluster size constraints
Parameters
----------
n_clusters : int, optional, default: 8
The number of clusters to form as well as the number of
centroids to generate.
    size_min : int, optional, default: None
        Constrain the label assignment so that each cluster has a minimum
        size of size_min. If None, no constraints will be applied.
    size_max : int, optional, default: None
        Constrain the label assignment so that each cluster has a maximum
        size of size_max. If None, no constraints will be applied.
init : {'k-means++', 'random' or an ndarray}
Method for initialization, defaults to 'k-means++':
        'k-means++' : selects initial cluster centers for k-means
clustering in a smart way to speed up convergence. See section
Notes in k_init for more details.
'random': choose k observations (rows) at random from data for
the initial centroids.
If an ndarray is passed, it should be of shape (n_clusters, n_features)
and gives the initial centers.
n_init : int, default: 10
Number of times the k-means algorithm will be run with different
centroid seeds. The final results will be the best output of
n_init consecutive runs in terms of inertia.
max_iter : int, default: 300
Maximum number of iterations of the k-means algorithm for a
single run.
tol : float, default: 1e-4
        Relative tolerance with regard to the Frobenius norm of the difference
in the cluster centers of two consecutive iterations to declare
convergence.
verbose : int, default 0
Verbosity mode.
random_state : int, RandomState instance or None, optional, default: None
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used
by `np.random`.
copy_x : boolean, default True
When pre-computing distances it is more numerically accurate to center
the data first. If copy_x is True, then the original data is not
modified. If False, the original data is modified, and put back before
the function returns, but small numerical differences may be introduced
by subtracting and then adding the data mean.
n_jobs : int
The number of jobs to use for the computation. This works by computing
each of the n_init runs in parallel.
If -1 all CPUs are used. If 1 is given, no parallel computing code is
used at all, which is useful for debugging. For n_jobs below -1,
(n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one
are used.
Attributes
----------
cluster_centers_ : array, [n_clusters, n_features]
Coordinates of cluster centers
    labels_ : array, shape (n_samples,)
        Labels of each point.
inertia_ : float
Sum of squared distances of samples to their closest cluster center.
Examples
--------
>>> from k_means_constrained import KMeansConstrained
>>> import numpy as np
>>> X = np.array([[1, 2], [1, 4], [1, 0],
... [4, 2], [4, 4], [4, 0]])
>>> clf = KMeansConstrained(
... n_clusters=2,
... size_min=2,
... size_max=5,
... random_state=0
... )
>>> clf.fit_predict(X)
array([0, 0, 0, 1, 1, 1], dtype=int32)
>>> clf.cluster_centers_
array([[ 1., 2.],
[ 4., 2.]])
>>> clf.labels_
array([0, 0, 0, 1, 1, 1], dtype=int32)
Notes
------
K-means problem constrained with a minimum and/or maximum size for each cluster.
The constrained assignment is formulated as a Minimum Cost Flow (MCF) linear network optimisation
problem. This is then solved using a cost-scaling push-relabel algorithm. The implementation used is
    Google's Operations Research tools (OR-tools) `SimpleMinCostFlow`.
    References:
    1. Bradley, P. S., K. P. Bennett, and A. Demiriz. "Constrained k-means clustering."
       Microsoft Research, Redmond (2000): 1-8.
    2. Google's SimpleMinCostFlow implementation:
       https://github.com/google/or-tools/blob/master/ortools/graph/min_cost_flow.h
"""
def __init__(self, n_clusters=8, size_min=None, size_max=None, init='k-means++', n_init=10, max_iter=300, tol=1e-4,
verbose=False, random_state=None, copy_x=True, n_jobs=1):
self.size_min = size_min
self.size_max = size_max
super().__init__(n_clusters=n_clusters, init=init, n_init=n_init, max_iter=max_iter, tol=tol,
verbose=verbose, random_state=random_state, copy_x=copy_x, n_jobs=n_jobs)
    def fit(self, X, y=None):
"""Compute k-means clustering with given constants.
Parameters
----------
X : array-like, shape=(n_samples, n_features)
Training instances to cluster.
y : Ignored
"""
if sp.issparse(X):
raise NotImplementedError("Not implemented for sparse X")
random_state = check_random_state(self.random_state)
X = self._check_fit_data(X)
self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = \
k_means_constrained(
X, n_clusters=self.n_clusters,
size_min=self.size_min, size_max=self.size_max,
init=self.init,
n_init=self.n_init, max_iter=self.max_iter, verbose=self.verbose,
tol=self.tol, random_state=random_state, copy_x=self.copy_x,
n_jobs=self.n_jobs,
return_n_iter=True)
return self
    def predict(self, X, size_min='init', size_max='init'):
"""
Predict the closest cluster each sample in X belongs to given the provided constraints.
        The constraints can be temporarily overridden when determining which cluster each datapoint is assigned to.
Only computes the assignment step. It does not re-fit the cluster positions.
Parameters
----------
X : array-like, shape = [n_samples, n_features]
New data to predict.
        size_min : int, optional, default: size_min provided with initialisation
            Constrain the label assignment so that each cluster has a minimum
            size of size_min. If None, no constraints will be applied.
            If 'init', the value provided during initialisation of the
            class will be used.
        size_max : int, optional, default: size_max provided with initialisation
            Constrain the label assignment so that each cluster has a maximum
            size of size_max. If None, no constraints will be applied.
            If 'init', the value provided during initialisation of the
            class will be used.
Returns
-------
labels : array, shape [n_samples,]
Index of the cluster each sample belongs to.
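        Examples
        --------
        A hedged sketch of overriding the constraints at prediction time,
        assuming a fitted estimator `clf` and data `X` as in the class
        docstring (the `size_max=3` value here is purely illustrative):

        >>> clf.predict(X, size_min=None, size_max=3)  # doctest: +SKIP
        array([0, 0, 0, 1, 1, 1], dtype=int32)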
"""
if sp.issparse(X):
raise NotImplementedError("Not implemented for sparse X")
if size_min == 'init':
size_min = self.size_min
if size_max == 'init':
size_max = self.size_max
n_clusters = self.n_clusters
n_samples = X.shape[0]
check_is_fitted(self, 'cluster_centers_')
X = self._check_test_data(X)
        # Allocate memory to store the distances for each sample to its
        # closest center, for reallocation in case of ties
distances = np.zeros(shape=(n_samples,), dtype=X.dtype)
        # Determine min and max sizes if none given
if size_min is None:
size_min = 0
if size_max is None:
size_max = n_samples # Number of data points
# Check size min and max
if not ((size_min >= 0) and (size_min <= n_samples)
and (size_max >= 0) and (size_max <= n_samples)):
raise ValueError("size_min and size_max must be a positive number smaller "
"than the number of data points or `None`")
if size_max < size_min:
raise ValueError("size_max must be larger than size_min")
if size_min * n_clusters > n_samples:
raise ValueError("The product of size_min and n_clusters cannot exceed the number of samples (X)")
labels, inertia = \
_labels_constrained(X, self.cluster_centers_, size_min, size_max, distances=distances)
return labels
    def fit_predict(self, X, y=None):
"""Compute cluster centers and predict cluster index for each sample.
        Equivalent to calling fit(X) followed by predict(X), but more efficient.
Parameters
----------
        X : array-like, shape = [n_samples, n_features]
            Training instances to cluster.
Returns
-------
labels : array, shape [n_samples,]
Index of the cluster each sample belongs to.
"""
return self.fit(X).labels_
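

if __name__ == "__main__":
    # Minimal smoke test (illustrative only, not part of the library API):
    # cluster six 2-D points into two clusters of between 2 and 5 points each.
    demo_X = np.array([[1, 2], [1, 4], [1, 0],
                       [4, 2], [4, 4], [4, 0]], dtype=float)
    demo_clf = KMeansConstrained(n_clusters=2, size_min=2, size_max=5,
                                 random_state=0)
    print(demo_clf.fit_predict(demo_X))  # expected (per class docstring): [0 0 0 1 1 1]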