Originally published at https://alexandruburlacu.github.io/posts/2021-06-18-kmeans-trick
from sklearn.cluster import KMeans
from sklearn.svm import LinearSVC
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import numpy as np
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=17)
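# baseline: a plain linear SVM on the original 30 features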
svm = LinearSVC(random_state=17)
svm.fit(X_train, y_train)
svm.score(X_test, y_test) # should be ~0.93
K-Means can be used as a source of new features.
# imports from the example above
svm = LinearSVC(random_state=17)
kmeans = KMeans(n_clusters=3, random_state=17)
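# fit_predict assigns each training sample to one of the 3 clusters;
# reshape(-1, 1) turns the labels into a column that can be stacked onto the original features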
X_clusters = kmeans.fit_predict(X_train).reshape(-1, 1)
svm.fit(np.hstack([X_train, X_clusters]), y_train)
svm.score(np.hstack([X_test, kmeans.predict(X_test).reshape(-1, 1)]), y_test) # should be ~0.937
# imports from the example above
svm = LinearSVC(random_state=17)
kmeans = KMeans(n_clusters=3, random_state=17)
X_clusters = kmeans.fit_transform(X_train)
#                       ^^^^^^^^^
# Notice the `transform` instead of `predict`
# Scikit-learn supports this method as early as version 0.15
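# `transform` maps samples into cluster-distance space: one distance per centroid,
# so here we append 3 continuous features instead of a single cluster label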
svm.fit(np.hstack([X_train, X_clusters]), y_train)
svm.score(np.hstack([X_test, kmeans.transform(X_test)]), y_test) # should be ~0.727
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
columns = ['mean radius', 'mean texture', 'mean perimeter', 'mean area',
'mean smoothness', 'mean compactness', 'mean concavity',
'mean concave points', 'mean symmetry', 'mean fractal dimension',
'radius error', 'texture error', 'perimeter error', 'area error',
'smoothness error', 'compactness error', 'concavity error',
'concave points error', 'symmetry error',
'fractal dimension error', 'worst radius', 'worst texture',
'worst perimeter', 'worst area', 'worst smoothness',
'worst compactness', 'worst concavity', 'worst concave points',
'worst symmetry', 'worst fractal dimension',
'distance to cluster 1', 'distance to cluster 2', 'distance to cluster 3']
data = pd.DataFrame.from_records(np.hstack([X_train, X_clusters]), columns=columns)
sns.heatmap(data.corr())
plt.xticks(rotation=-45)
plt.show()
# imports from the example above
svm = LinearSVC(random_state=17)
kmeans = KMeans(n_clusters=3, random_state=17)
X_clusters = kmeans.fit_transform(X_train)
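# this time the classifier sees only the 3 distances to the centroids, not the original features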
svm.fit(X_clusters, y_train)
svm.score(kmeans.transform(X_test), y_test) # should be ~0.951
K-Means can be used as a substitute for the kernel trick.
# imports from the example above
svm = LinearSVC(random_state=17)
kmeans = KMeans(n_clusters=250, random_state=17)
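# with 250 centroids, transform() turns every sample into a 250-dimensional vector of distances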
X_clusters = kmeans.fit_transform(X_train)
svm.fit(X_clusters, y_train)
svm.score(kmeans.transform(X_test), y_test) # should be ~0.944
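If you'd rather not hand-pick the number of clusters, one option is to wrap K-Means and the linear SVM into a single Pipeline and grid-search over n_clusters. Below is only a rough sketch of the idea, with an arbitrary parameter grid:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# K-Means acts as the transformer step, the linear SVM as the final estimator
kmeans_svm = Pipeline([("kmeans", KMeans(random_state=17)),
                       ("svm", LinearSVC(random_state=17))])

# arbitrary grid; n_clusters must stay <= the number of samples in each CV training fold
param_grid = {"kmeans__n_clusters": [3, 25, 100, 250]}
search = GridSearchCV(kmeans_svm, param_grid, cv=3)
search.fit(X_train, y_train)
print(search.best_params_, search.score(X_test, y_test))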
I also tried n_clusters=1000 and it worked better than with only a few clusters.
import matplotlib.pyplot as plt
import numpy as np
from time import time
from sklearn.datasets import load_breast_cancer
from sklearn.svm import LinearSVC, SVC
from sklearn import pipeline
from sklearn.kernel_approximation import RBFSampler, Nystroem, PolynomialCountSketch
from sklearn.preprocessing import MinMaxScaler, Normalizer
from sklearn.model_selection import train_test_split
from sklearn.cluster import MiniBatchKMeans
mm = pipeline.make_pipeline(MinMaxScaler(), Normalizer())
X, y = load_breast_cancer(return_X_y=True)
X = mm.fit_transform(X)
data_train, data_test, targets_train, targets_test = train_test_split(X, y, random_state=17)
# Create a classifier: a support vector classifier
kernel_svm = SVC(gamma=.2, random_state=17)
linear_svm = LinearSVC(random_state=17)
# create pipeline from kernel approximation and linear svm
feature_map_fourier = RBFSampler(gamma=.2, random_state=17)
feature_map_nystroem = Nystroem(gamma=.2, random_state=17)
feature_map_poly_cm = PolynomialCountSketch(degree=4, random_state=17)
feature_map_kmeans = MiniBatchKMeans(random_state=17)
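# MiniBatchKMeans can serve as a feature map because its transform() returns
# the distances to the n_clusters centroids, just like in the snippets above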
fourier_approx_svm = pipeline.Pipeline([("feature_map", feature_map_fourier),
                                        ("svm", LinearSVC(random_state=17))])
nystroem_approx_svm = pipeline.Pipeline([("feature_map", feature_map_nystroem),
                                         ("svm", LinearSVC(random_state=17))])
poly_cm_approx_svm = pipeline.Pipeline([("feature_map", feature_map_poly_cm),
                                        ("svm", LinearSVC(random_state=17))])
kmeans_approx_svm = pipeline.Pipeline([("feature_map", feature_map_kmeans),
                                       ("svm", LinearSVC(random_state=17))])
# fit and predict using linear and kernel svm:
kernel_svm_time = time()
kernel_svm.fit(data_train, targets_train)
kernel_svm_score = kernel_svm.score(data_test, targets_test)
kernel_svm_time = time() - kernel_svm_time
linear_svm_time = time()
linear_svm.fit(data_train, targets_train)
linear_svm_score = linear_svm.score(data_test, targets_test)
linear_svm_time = time() - linear_svm_time
sample_sizes = 30 * np.arange(1, 10)
fourier_scores = []
nystroem_scores = []
poly_cm_scores = []
kmeans_scores = []
fourier_times = []
nystroem_times = []
poly_cm_times = []
kmeans_times = []
for D in sample_sizes:
    fourier_approx_svm.set_params(feature_map__n_components=D)
    nystroem_approx_svm.set_params(feature_map__n_components=D)
    poly_cm_approx_svm.set_params(feature_map__n_components=D)
    kmeans_approx_svm.set_params(feature_map__n_clusters=D)
    start = time()
    nystroem_approx_svm.fit(data_train, targets_train)
    nystroem_times.append(time() - start)
    start = time()
    fourier_approx_svm.fit(data_train, targets_train)
    fourier_times.append(time() - start)
    start = time()
    poly_cm_approx_svm.fit(data_train, targets_train)
    poly_cm_times.append(time() - start)
    start = time()
    kmeans_approx_svm.fit(data_train, targets_train)
    kmeans_times.append(time() - start)
    fourier_score = fourier_approx_svm.score(data_test, targets_test)
    fourier_scores.append(fourier_score)
    nystroem_score = nystroem_approx_svm.score(data_test, targets_test)
    nystroem_scores.append(nystroem_score)
    poly_cm_score = poly_cm_approx_svm.score(data_test, targets_test)
    poly_cm_scores.append(poly_cm_score)
    kmeans_score = kmeans_approx_svm.score(data_test, targets_test)
    kmeans_scores.append(kmeans_score)
plt.figure(figsize=(16, 4))
accuracy = plt.subplot(211)
timescale = plt.subplot(212)
accuracy.plot(sample_sizes, nystroem_scores, label="Nystroem approx. kernel")
timescale.plot(sample_sizes, nystroem_times, '--',
               label='Nystroem approx. kernel')
accuracy.plot(sample_sizes, fourier_scores, label="Fourier approx. kernel")
timescale.plot(sample_sizes, fourier_times, '--',
               label='Fourier approx. kernel')
accuracy.plot(sample_sizes, poly_cm_scores, label="Polynomial Count Sketch approx. kernel")
timescale.plot(sample_sizes, poly_cm_times, '--',
               label='Polynomial Count Sketch approx. kernel')
accuracy.plot(sample_sizes, kmeans_scores, label="K-Means approx. kernel")
timescale.plot(sample_sizes, kmeans_times, '--',
               label='K-Means approx. kernel')
# horizontal lines for exact rbf and linear kernels:
accuracy.plot([sample_sizes[0], sample_sizes[-1]],
              [linear_svm_score, linear_svm_score], label="linear svm")
timescale.plot([sample_sizes[0], sample_sizes[-1]],
               [linear_svm_time, linear_svm_time], '--', label='linear svm')
accuracy.plot([sample_sizes[0], sample_sizes[-1]],
              [kernel_svm_score, kernel_svm_score], label="rbf svm")
timescale.plot([sample_sizes[0], sample_sizes[-1]],
               [kernel_svm_time, kernel_svm_time], '--', label='rbf svm')
# legends and labels
accuracy.set_title("Classification accuracy")
timescale.set_title("Training times")
accuracy.set_xlim(sample_sizes[0], sample_sizes[-1])
accuracy.set_xticks(())
accuracy.set_ylim(np.min(fourier_scores), 1)
timescale.set_xlabel("Sampling steps = transformed feature dimension")
accuracy.set_ylabel("Classification accuracy")
timescale.set_ylabel("Training time in seconds")
accuracy.legend(loc='best')
timescale.legend(loc='best')
plt.tight_layout()
plt.show()
Scikit-learn also has MiniBatchKMeans, which is one of the few algorithms that support the .partial_fit method. Combining it with a machine learning model that also has .partial_fit, like a PassiveAggressiveClassifier, one can create a pretty interesting solution.

The benefit of .partial_fit is twofold. First, it makes it possible to train algorithms in an out-of-core fashion, which is to say, with more data than fits in RAM. Second, depending on your type of problem, if you could in principle (very, very much in principle) never need to switch the model, it could be additionally trained right where it is deployed. That's called online learning, and it's super interesting. Something like this is what some Chinese companies are doing, and in general it can be pretty useful for AdTech, because you find out within seconds whether your ad recommendation was right or wrong.
from sklearn.cluster import MiniBatchKMeans
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
import numpy as np
def batch(iterable, n=1):
    # source: https://stackoverflow.com/a/8290508/5428334
    l = len(iterable)
    for ndx in range(0, l, n):
        yield iterable[ndx:min(ndx + n, l)]
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=17)
kmeans = MiniBatchKMeans(n_clusters=100, random_state=17) # K-Means has a constraint, n_clusters <= n_samples to fit
pac = PassiveAggressiveClassifier(random_state=17)
for x, y in zip(batch(X_train, n=100), batch(y_train, n=100)):
    kmeans.partial_fit(x, y)      # update the centroids with this batch (y is ignored by K-Means)
    x_dist = kmeans.transform(x)  # distances of the batch to the current centroids
    pac.partial_fit(x_dist, y, classes=[0, 1])  # update the classifier; we need to tell it all the classes up front
    print(pac.score(kmeans.transform(X_test), y_test))
# 0.909 after 100 samples
# 0.951 after 200 samples
# 0.951 after 300 samples
# 0.944 after 400 samples
# 0.902 after 426 samples
# VS
kmeans = MiniBatchKMeans(n_clusters=100, random_state=17)
pac = PassiveAggressiveClassifier(random_state=17)
pac.fit(kmeans.fit_transform(X_train), y_train)
pac.score(kmeans.transform(X_test), y_test)
# should be ~0.951
You might have noticed all the random_states in the code. If you're wondering why I added these, it's to make the code samples reproducible. Tutorials frequently don't do this, and that leaves room for cherry-picking, where the author presents only the best results, and when trying to replicate them, the reader either can't or it takes a lot of time. But know this: you can play around with the values of random_state and get wildly different results. For example, when running the snippet with the original features and distances to the 3 centroids, the one with the 0.727 score, with a random seed of 41 instead of 17, you can get an accuracy score of 0.944. So yeah, random_state, or however else the random seed is called in your framework of choice, is an important aspect to keep in mind, especially when doing research.
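If you want to see this seed sensitivity for yourself, here is a rough sketch that reruns the "original features plus distances to 3 centroids" setup with a few arbitrary seeds (only the model seeds change; the train/test split stays fixed):
from sklearn.cluster import KMeans
from sklearn.svm import LinearSVC
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import numpy as np

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=17)

for seed in (17, 41, 2021):
    kmeans = KMeans(n_clusters=3, random_state=seed)
    svm = LinearSVC(random_state=seed)
    train_dist = kmeans.fit_transform(X_train)         # distances of training samples to the 3 centroids
    svm.fit(np.hstack([X_train, train_dist]), y_train)
    test_dist = kmeans.transform(X_test)
    print(seed, svm.score(np.hstack([X_test, test_dist]), y_test))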