Source code for pypots.nn.functional.clustering

"""
Evaluation metrics related to clustering.
"""

# Created by Wenjie Du <wenjay.du@gmail.com>
# License: BSD-3-Clause

import numpy as np
from sklearn import metrics


def calc_rand_index(
    class_predictions: np.ndarray,
    targets: np.ndarray,
) -> float:
    """Compute the Rand Index between predicted clusters and ground truth.

    The Rand Index measures how similar two clusterings of the same samples
    are. Refer to :cite:`rand1971RandIndex`.

    Parameters
    ----------
    class_predictions :
        Cluster assignments produced by a clusterer.

    targets :
        Ground-truth (correct) cluster assignments.

    Returns
    -------
    RI :
        The Rand index.

    Notes
    -----
    The computation is delegated to ``sklearn.metrics.rand_score``. Spelled
    out, it is a count over all ``n * (n - 1) / 2`` unordered sample pairs:
    a pair is a true positive (TP) when both labelings put its two samples
    in the same group, a true negative (TN) when both labelings separate
    them, and ``RI = (TP + TN) / (n * (n - 1) / 2)``.

    References
    ----------
    - L. Hubert and P. Arabie, Comparing Partitions, Journal of
      Classification 1985
      https://link.springer.com/article/10.1007%2FBF01908075
    - https://en.wikipedia.org/wiki/Simple_matching_coefficient
    - https://en.wikipedia.org/wiki/Rand_index

    """
    # scikit-learn expects the ground truth first, the predictions second.
    return metrics.rand_score(labels_true=targets, labels_pred=class_predictions)
def calc_adjusted_rand_index(
    class_predictions: np.ndarray,
    targets: np.ndarray,
) -> float:
    """Compute the adjusted Rand Index between predictions and ground truth.

    Parameters
    ----------
    class_predictions :
        Cluster assignments produced by a clusterer.

    targets :
        Ground-truth (correct) cluster assignments.

    Returns
    -------
    aRI :
        The adjusted Rand index.

    References
    ----------
    .. [1] `L. Hubert and P. Arabie, Comparing Partitions,
        Journal of Classification 1985
        <https://link.springer.com/article/10.1007%2FBF01908075>`_
    .. [2] `D. Steinley, Properties of the Hubert-Arabie adjusted Rand index,
        Psychological Methods 2004
        <https://psycnet.apa.org/record/2004-17801-007>`_
    .. [3] https://en.wikipedia.org/wiki/Rand_index#Adjusted_Rand_index

    """
    # scikit-learn expects the ground truth first, the predictions second.
    return metrics.adjusted_rand_score(
        labels_true=targets,
        labels_pred=class_predictions,
    )
def calc_nmi(
    class_predictions: np.ndarray,
    targets: np.ndarray,
) -> float:
    """Compute the Normalized Mutual Information (NMI) of two clusterings.

    Parameters
    ----------
    class_predictions :
        Cluster assignments produced by a clusterer.

    targets :
        Ground-truth (correct) cluster assignments.

    Returns
    -------
    NMI : float,
        The Normalized Mutual Information between ``targets`` and
        ``class_predictions``.

    """
    # scikit-learn expects the ground truth first, the predictions second.
    return metrics.normalized_mutual_info_score(
        labels_true=targets,
        labels_pred=class_predictions,
    )
def calc_cluster_purity(
    class_predictions: np.ndarray,
    targets: np.ndarray,
) -> float:
    """Compute the purity of a clustering against ground-truth labels.

    Parameters
    ----------
    class_predictions :
        Cluster assignments produced by a clusterer.

    targets :
        Ground-truth (correct) cluster assignments.

    Returns
    -------
    cluster_purity :
        The cluster purity: the sum of the per-column maxima of the
        contingency matrix divided by the sum of all its entries.

    Notes
    -----
    This function is from the answer https://stackoverflow.com/a/51672699 on StackOverflow.

    """
    # Contingency table built with ground truth as the first argument and
    # predictions as the second.
    counts = metrics.cluster.contingency_matrix(targets, class_predictions)
    # Largest entry of each column (reduction over axis 0), summed up.
    dominant_total = counts.max(axis=0).sum()
    return dominant_total / counts.sum()
def calc_external_cluster_validation_metrics(
    class_predictions: np.ndarray,
    targets: np.ndarray,
) -> dict:
    """Compute every external cluster validation metric available in PyPOTS.

    Parameters
    ----------
    class_predictions :
        Cluster assignments produced by a clusterer.

    targets :
        Ground-truth (correct) cluster assignments.

    Returns
    -------
    external_cluster_validation_metrics : dict
        A dictionary holding the Rand index, adjusted Rand index, NMI and
        cluster purity under the keys ``"rand_index"``,
        ``"adjusted_rand_index"``, ``"nmi"`` and ``"cluster_purity"``.

    """
    # Dict entries are evaluated top to bottom, so the metrics are computed
    # in the order they are listed.
    return {
        "rand_index": calc_rand_index(class_predictions, targets),
        "adjusted_rand_index": calc_adjusted_rand_index(class_predictions, targets),
        "nmi": calc_nmi(class_predictions, targets),
        "cluster_purity": calc_cluster_purity(class_predictions, targets),
    }
def calc_silhouette(X: np.ndarray, predicted_labels: np.ndarray) -> float:
    """Return the Silhouette Coefficient averaged over all samples.

    Parameters
    ----------
    X : array-like of shape (n_samples_a, n_features)
        A feature array, or learned latent representation, that can be used
        for clustering.

    predicted_labels : array-like of shape (n_samples)
        The predicted label of each sample.

    Returns
    -------
    silhouette_score : float
        Mean Silhouette Coefficient for all samples. In short, the higher,
        the better.

    References
    ----------
    .. [1] `Peter J. Rousseeuw (1987). "Silhouettes: a Graphical Aid to the
        Interpretation and Validation of Cluster Analysis". Computational
        and Applied Mathematics 20: 53-65.
        <https://www.sciencedirect.com/science/article/pii/0377042787901257>`_
    .. [2] `Wikipedia entry on the Silhouette Coefficient
        <https://en.wikipedia.org/wiki/Silhouette_(clustering)>`_

    """
    return metrics.silhouette_score(X, labels=predicted_labels)
def calc_chs(X: np.ndarray, predicted_labels: np.ndarray) -> float:
    """Return the Calinski and Harabasz score (the Variance Ratio Criterion).

    Parameters
    ----------
    X : array-like of shape (n_samples_a, n_features)
        A feature array, or learned latent representation, that can be used
        for clustering.

    predicted_labels : array-like of shape (n_samples)
        The predicted label of each sample.

    Returns
    -------
    calinski_harabasz_score : float
        The resulting Calinski-Harabasz score. In short, the higher, the
        better.

    References
    ----------
    .. [1] `T. Calinski and J. Harabasz, 1974. "A dendrite method for cluster
        analysis". Communications in Statistics
        <https://www.tandfonline.com/doi/abs/10.1080/03610927408827101>`_

    """
    return metrics.calinski_harabasz_score(X, labels=predicted_labels)
def calc_dbs(X: np.ndarray, predicted_labels: np.ndarray) -> float:
    """Return the Davies-Bouldin score of a clustering.

    Parameters
    ----------
    X : array-like of shape (n_samples_a, n_features)
        A feature array, or learned latent representation, that can be used
        for clustering.

    predicted_labels : array-like of shape (n_samples)
        The predicted label of each sample.

    Returns
    -------
    davies_bouldin_score : float
        The resulting Davies-Bouldin score. In short, the lower, the better.

    References
    ----------
    .. [1] `Davies, David L.; Bouldin, Donald W. (1979). "A Cluster
        Separation Measure" IEEE Transactions on Pattern Analysis and
        Machine Intelligence. PAMI-1 (2): 224-227
        <https://ieeexplore.ieee.org/document/4766909>`_

    """
    return metrics.davies_bouldin_score(X, labels=predicted_labels)
def calc_internal_cluster_validation_metrics(X: np.ndarray, predicted_labels: np.ndarray) -> dict:
    """Compute every internal cluster validation metric available in PyPOTS.

    Parameters
    ----------
    X : array-like of shape (n_samples_a, n_features)
        A feature array, or learned latent representation, that can be used
        for clustering.

    predicted_labels : array-like of shape (n_samples)
        The predicted label of each sample.

    Returns
    -------
    internal_cluster_validation_metrics : dict
        A dictionary holding the Silhouette, Calinski-Harabasz and
        Davies-Bouldin scores under the keys ``"silhouette_score"``,
        ``"calinski_harabasz_score"`` and ``"davies_bouldin_score"``.

    """
    # Dict entries are evaluated top to bottom, so the metrics are computed
    # in the order they are listed.
    return {
        "silhouette_score": calc_silhouette(X, predicted_labels),
        "calinski_harabasz_score": calc_chs(X, predicted_labels),
        "davies_bouldin_score": calc_dbs(X, predicted_labels),
    }