Spaces:

sklearn-docs
/

plot-k-means-digits

Sleeping

App Files Files Community

rashmi commited on Apr 6, 2023

Commit

c487a91

1 Parent(s): 8abd0e1

Create app.py

Browse files

Files changed (1) hide show

app.py +183 -0

app.py ADDED Viewed

	@@ -0,0 +1,183 @@

+import gradio as gr
+import pandas as pd
+import numpy as np
+from time import time
+from sklearn import metrics
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import StandardScaler
+from sklearn.cluster import KMeans
+from sklearn.decomposition import PCA
+from huggingface_hub import login
+from datasets import load_dataset
+import matplotlib.pyplot as plt
+# https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_digits.html#sphx-glr-auto-examples-cluster-plot-kmeans-digits-py
+def display_plot(data, n_digits):
+    reduced_data = PCA(n_components=2).fit_transform(data)
+    kmeans = KMeans(init="k-means++", n_clusters=n_digits, n_init=4)
+    kmeans.fit(reduced_data)
+    # Step size of the mesh. Decrease to increase the quality of the VQ.
+    h = 0.02  # point in the mesh [x_min, x_max]x[y_min, y_max].
+    # Plot the decision boundary. For that, we will assign a color to each
+    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
+    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
+    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
+    # Obtain labels for each point in mesh. Use last trained model.
+    Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
+    # Put the result into a color plot
+    Z = Z.reshape(xx.shape)
+    fig = plt.figure()
+    plt.clf()
+    plt.imshow(
+        Z,
+        interpolation="nearest",
+        extent=(xx.min(), xx.max(), yy.min(), yy.max()),
+        cmap=plt.cm.Paired,
+        aspect="auto",
+        origin="lower",
+    )
+    plt.plot(reduced_data[:, 0], reduced_data[:, 1], "k.", markersize=2)
+    # Plot the centroids as a white X
+    centroids = kmeans.cluster_centers_
+    plt.scatter(
+        centroids[:, 0],
+        centroids[:, 1],
+        marker="x",
+        s=169,
+        linewidths=3,
+        color="w",
+        zorder=10,
+    )
+    plt.title(
+        "K-means clustering on the digits dataset (PCA-reduced data)\n"
+        "Centroids are marked with white cross"
+    )
+    plt.xlim(x_min, x_max)
+    plt.ylim(y_min, y_max)
+    plt.xticks(())
+    plt.yticks(())
+    return fig
+def bench_k_means(kmeans, name, data, labels):
+    """Benchmark to evaluate the KMeans initialization methods.
+    Parameters
+    ----------
+    kmeans : KMeans instance
+        A :class:`~sklearn.cluster.KMeans` instance with the initialization
+        already set.
+    name : str
+        Name given to the strategy. It will be used to show the results in a
+        table.
+    data : ndarray of shape (n_samples, n_features)
+        The data to cluster.
+    labels : ndarray of shape (n_samples,)
+        The labels used to compute the clustering metrics which requires some
+        supervision.
+    """
+    t0 = time()
+    estimator = make_pipeline(StandardScaler(), kmeans).fit(data)
+    fit_time = time() - t0
+    results = [name, fit_time, estimator[-1].inertia_]
+    # Define the metrics which require only the true labels and estimator
+    # labels
+    clustering_metrics = [
+        metrics.homogeneity_score,
+        metrics.completeness_score,
+        metrics.v_measure_score,
+        metrics.adjusted_rand_score,
+        metrics.adjusted_mutual_info_score,
+    ]
+    results += [m(labels, estimator[-1].labels_) for m in clustering_metrics]
+    # The silhouette score requires the full dataset
+    results += [
+        metrics.silhouette_score(
+            data,
+            estimator[-1].labels_,
+            metric="euclidean",
+            sample_size=300,
+        )
+    ]
+    return results
+title = "A demo of K-Means clustering on the handwritten digits data"
+def do_submit(kmeans_n_digit,random_n_digit, pca_n_digit):
+    # Load the dataset
+    dataset = load_dataset("sklearn-docs/digits", header=None)
+    # convert dataset to pandas
+    df = dataset['train'].to_pandas()
+    data = df.iloc[:, :64]
+    labels = df.iloc[:, 64]
+    kmeans = KMeans(init="k-means++", n_clusters=int(kmeans_n_digit), n_init=4, random_state=0)
+    results = bench_k_means(kmeans=kmeans, name="k-means++", data=data, labels=labels)
+    df = pd.DataFrame(results).T
+    numeric_cols = ['time','inertia','homo','compl','v-meas','ARI','AMI','silhouette']
+    df.columns = ['init'] + numeric_cols
+    kmeans = KMeans(init="random", n_clusters=int(random_n_digit), n_init=4, random_state=0)
+    results = bench_k_means(kmeans=kmeans, name="random", data=data, labels=labels)
+    df.loc[len(df.index)] = results
+    pca = PCA(n_components=int(pca_n_digit)).fit(data)
+    kmeans = KMeans(init=pca.components_, n_clusters=int(pca_n_digit), n_init=1)
+    results = bench_k_means(kmeans=kmeans, name="PCA-based", data=data, labels=labels)
+    df.loc[len(df.index)] = results
+    df[df.columns[1:]] = df.iloc[:,1:].astype(float).round(3)
+    df = df.T #Transpose for display
+    df.columns = df.iloc[0,:].tolist()
+    df = df.iloc[1:,:].reset_index()
+    df.columns = ['metrics', 'k-means++', 'random', 'PCA-based']
+    return display_plot(data, kmeans_n_digit), df
+#Theme from - https://huggingface.co/spaces/trl-lib/stack-llama/blob/main/app.py
+theme = gr.themes.Monochrome(
+    primary_hue="indigo",
+    secondary_hue="blue",
+    neutral_hue="slate",
+    radius_size=gr.themes.sizes.radius_sm,
+    font=[gr.themes.GoogleFont("Open Sans"), "ui-sans-serif", "system-ui", "sans-serif"],
+)
+with gr.Blocks(title=title, theme=theme) as demo:
+    gr.Markdown(f"## {title}")
+    gr.Markdown("[Scikit-learn Example](https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_digits.html#sphx-glr-auto-examples-cluster-plot-kmeans-digits-py)")
+    gr.Markdown("In this example we compare the various initialization strategies for K-means in terms of runtime and quality of the results.")
+    gr.Markdown("As the ground truth is known here, we also apply different cluster quality metrics to judge the goodness of fit of the cluster labels to the ground truth.")
+    gr.Markdown("Cluster quality metrics evaluated (see [Clustering performance evaluation](https://scikit-learn.org/stable/modules/clustering.html#clustering-evaluation) \
+                for definitions and discussions of the metrics):")
+    with gr.Row():
+        with gr.Column(scale=0.5):
+            kmeans_n_digit = gr.Slider(minimum=2, maximum=10, label="KMeans n_digits",  step=1, value=10)
+            random_n_digit = gr.Slider(minimum=2, maximum=10, label="Random n_digits",  step=1, value=10)
+            pca_n_digit = gr.Slider(minimum=2, maximum=10, label="PCA n_digits",step=1, value=10)
+            plt_out = gr.Plot()
+        with gr.Column(scale=0.5):
+            sample_df = pd.DataFrame(np.zeros((9,4)),columns=['metrics', 'k-means++', 'random', 'PCA-based'])
+            output = gr.Dataframe(sample_df, label="Output Table")
+    with gr.Row():
+            sub_btn = gr.Button("Submit")
+            sub_btn.click(fn=do_submit, inputs=[kmeans_n_digit,random_n_digit, pca_n_digit], outputs=[plt_out, output])
+demo.launch()