# Source code for mlcluster

import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.metrics import (calinski_harabasz_score, davies_bouldin_score,
                             silhouette_score)
from sklearn.model_selection import ParameterGrid
from sklearn.pipeline import Pipeline

# 2021.09.14 Created by Daniel SY wang


class ClusterScorePlot(Pipeline):
    """
    Pipeline subclass that scores and plots clusterings without ground-truth labels.

    Every combination in ``param_grid`` is applied to the pipeline with
    ``set_params``, fitted, and then scored with the requested internal
    cluster metrics; conceptually this is a Pipeline combined with a grid
    search.

    Parameters
    ----------
    steps : list
        List of (name, transform) tuples (implementing fit/transform) that are
        chained, in the order in which they are chained, with the last object
        a cluster estimator.

    param_grid : list of dict
        Parameter grid understood by ``sklearn.model_selection.ParameterGrid``
        (see the Examples section). When calling :meth:`plot` with a
        ``param_type`` other than ``'grid'`` it must instead be an already
        expanded sequence of parameter dicts.

    scoring : list
        list of 'si','ca','da' for cluster scoring,
        'si' means metrics.silhouette_score,
        'ca' means metrics.calinski_harabasz_score,
        'da' means metrics.davies_bouldin_score.

    memory : str or object with the joblib.Memory interface, default=None
        Used to cache the fitted transformers of the pipeline. By default,
        no caching is performed. If a string is given, it is the path to
        the caching directory. Enabling caching triggers a clone of
        the transformers before fitting. Therefore, the transformer
        instance given to the pipeline cannot be inspected
        directly. Use the attribute ``named_steps`` or ``steps`` to
        inspect estimators within the pipeline. Caching the
        transformers is advantageous when fitting is time consuming.

    verbose : bool, default=False
        If True, the time elapsed while fitting each step will be printed as it
        is completed.

    Attributes
    ----------
    named_steps : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.
        Read-only attribute to access any step parameter by user given name.
        Keys are step names and values are steps parameters.

    X_transformed : array-like or None
        Output of the transformer steps from the most recent :meth:`fit`
        call with every row containing NaN removed; ``None`` before the
        first fit.

    Examples
    --------
    Get the best scores and parameters
    ::

        #setup steps:
        cls = [('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler()),
            ('cluster',cluster.AgglomerativeClustering(n_clusters=5))]

        #setup param_grid:
        lr = LinearRegression()
        imp = IterativeImputer(estimator=lr,missing_values=np.nan, max_iter=10, verbose=2, imputation_order='roman',random_state=0)
        knn = KNNImputer(n_neighbors=2, add_indicator=True)
        param_grid = [
            { 'imputer': [SimpleImputer(strategy='median'),imp,knn, None],
            'scaler': [MinMaxScaler(),StandardScaler(),RobustScaler(),None],
            'cluster':[cluster.AgglomerativeClustering(n_clusters=5),
                        cluster.DBSCAN(eps=0.30, min_samples=10),
                        cluster.KMeans(n_clusters=5),
                        GaussianMixture(n_components=5)]}]

        #setup scoring:
        scoring = ['da', 'si', 'ca']

        #get the best scores:
        clustersearch=ClusterScorePlot(steps=cls,param_grid=param_grid,scoring=scoring)
        score_df=clustersearch.get_score(data)
        score_df

    Plot scatter diagrams for the best parameters
    ::

        #param_grid : setup param_grid using best_score_df param list
        best_score_df=clustersearch.get_best_score(score_df,n_best_score=3)
        param_grid=list(best_score_df['param'])

        #plot scatter using best_score_df param
        clustersearch=ClusterScorePlot(steps=cls,param_grid=param_grid,scoring=scoring)
        clustersearch.plot(data,param_type='',ncol=1)

    """

    def __init__(self, steps, param_grid, scoring, memory=None, verbose=False):
        self.param_grid = param_grid
        self.scoring = scoring
        # Filled in by ``fit``; ``predict``, ``get_score`` and ``plot`` read it.
        self.X_transformed = None
        super().__init__(steps=steps, memory=memory, verbose=verbose)

    def fit(self, X, y=None, **fit_params):
        """Fit the model

        Fit all the transforms one after the other and transform the
        data, drop every transformed row that contains NaN, then fit the
        remaining rows using the final estimator. The filtered data is kept
        in ``self.X_transformed``.

        Parameters
        ----------
        X : iterable
            Training data. Must fulfill input requirements of first step of the
            pipeline.

        y : iterable, default=None
            Training targets. Must fulfill label requirements for all steps of
            the pipeline. Note that ``y`` is passed to the final estimator
            unfiltered, even when rows of the transformed ``X`` were dropped.

        **fit_params : dict of string -> object
            Parameters passed to the ``fit`` method of each step, where
            each parameter name is prefixed such that parameter ``p`` for step
            ``s`` has key ``s__p``.

        Returns
        -------
        self : Pipeline
            This estimator
        """
        # NOTE(review): ``_check_fit_params`` and ``_fit`` are private Pipeline
        # helpers; confirm they exist with this signature in the installed
        # scikit-learn version.
        fit_params_steps = self._check_fit_params(**fit_params)
        Xt = self._fit(X, y, **fit_params_steps)
        # Remove the rows that contain NaN from the transformed array.
        self.X_transformed = Xt[~np.isnan(Xt).any(axis=1)]

        if self._final_estimator != 'passthrough':
            fit_params_last_step = fit_params_steps[self.steps[-1][0]]
            self._final_estimator.fit(
                self.X_transformed, y, **fit_params_last_step)

        return self

    def predict(self, X=None, **predict_params):
        """Return cluster labels for the data cached by the last ``fit`` call

        The final estimator is applied to ``self.X_transformed``. When the
        estimator offers ``fit_predict`` that method is used (which fits the
        estimator again on the cached data); otherwise ``predict`` is called.

        Parameters
        ----------
        X : iterable, default=None
            Ignored. Kept for signature compatibility; the labels are always
            computed for the data stored in ``self.X_transformed``.

        **predict_params : dict of string -> object
            Extra parameters. For the ``fit_predict`` branch they are
            interpreted with the step-prefixed convention (``s__p``) and only
            those addressed to the last step are forwarded; for the
            ``predict`` branch they are forwarded unchanged.

        Returns
        -------
        y_pred : array-like
        """
        last_step = self._final_estimator
        # NOTE(review): private Pipeline helper; confirm availability in the
        # installed scikit-learn version.
        fit_params_steps = self._check_fit_params(**predict_params)
        fit_params_last_step = fit_params_steps[self.steps[-1][0]]
        if hasattr(last_step, 'fit_predict'):
            return last_step.fit_predict(self.X_transformed, **fit_params_last_step)
        else:
            return last_step.predict(self.X_transformed, **predict_params)

    def get_score(self, X):
        """Calculate the cluster scores of X for every parameter combination

        Each combination of ``self.param_grid`` is applied with
        ``set_params``, the pipeline is fitted on ``X`` and the resulting
        labels are scored with every metric named in ``self.scoring``. A
        metric that fails for a combination is recorded as NaN.

        Parameters
        ----------
        X : df
            DataFrame

        Returns
        -------
        df_score : df
            DataFrame with a ``'param'`` column holding the parameter dict
            and one column per entry of ``self.scoring``.

        Raises
        ------
        ValueError
            If ``self.scoring`` contains a name other than 'si', 'ca', 'da'.
        """
        scorers = {
            'si': silhouette_score,
            'ca': calinski_harabasz_score,
            'da': davies_bouldin_score,
        }
        # Reject unknown codes up front: otherwise no score would be computed
        # for them and a stale or undefined value would end up in the table.
        unknown = [name for name in self.scoring if name not in scorers]
        if unknown:
            raise ValueError(
                f"Unknown scoring name(s) {unknown!r}; "
                f"expected any of {sorted(scorers)!r}.")

        rows = []
        for param in ParameterGrid(self.param_grid):
            self.set_params(**param)
            self.fit(X)
            labels = self.predict()

            row = [param]
            for name in self.scoring:
                try:
                    score = scorers[name](self.X_transformed, labels)
                except Exception:
                    # Best effort: a metric that cannot be computed for this
                    # labelling is reported as NaN instead of aborting the
                    # whole search.
                    score = np.nan
                row.append(score)
            rows.append(row)

        return pd.DataFrame(rows, columns=['param', *self.scoring])

    def get_best_score(self, data, n_best_score=3):
        """Return the best score DataFrame

        For every score column the ``n_best_score`` best rows are selected
        (lowest values for 'da', highest values for every other column) and
        the union of those rows is returned.

        Parameters
        ----------
        data : df
            DataFrame of param and scores

        n_best_score : int, default=3
            Number of best rows taken for each scoring method.

        Returns
        -------
        data_best_score : df
            DataFrame of the best score and param. Empty when ``data`` has no
            score column.
        """
        score_columns = [col for col in data.columns if col != 'param']

        best_index = None
        # Walk the columns from last to first, matching the order in which the
        # per-column selections have always been combined.
        for col in reversed(score_columns):
            # Davies-Bouldin: lower is better; the other metrics: higher is better.
            ranked = data.sort_values(col, ascending=(col == 'da'))
            top_index = ranked.head(n_best_score).index
            if best_index is None:
                best_index = top_index
            else:
                best_index = best_index.union(top_index)

        if best_index is None:
            return data.iloc[0:0]
        # ``best_index`` holds index labels, so select by label rather than
        # by position.
        return data.loc[best_index]

    def plot(self, X, param_type='grid', ncol=1, figsize_x=10, scale_y=1):
        """Preprocessing transform X,then TSNE transform, plot scatterplot

        One sub-plot is drawn per parameter combination: the pipeline is
        fitted with that combination, the transformed data is embedded with
        t-SNE and the points are drawn with their cluster label as hue, size
        and marker style.

        Parameters
        ----------
        X : data
            DataFrame

        param_type : string
            'grid'(default) expands ``self.param_grid`` with ``ParameterGrid``;
            any other value (e.g. '') uses ``self.param_grid`` as an already
            expanded sequence of parameter dicts, such as the ``'param'``
            column of the best-score DataFrame.

        ncol: int
            figure number in a row

        figsize_x : float
            total figure width
        scale_y : float
            scale height

        Returns
        -------
        return : None
            plot multiple figures

        """
        if param_type == 'grid':
            param_grid_list = list(ParameterGrid(self.param_grid))
        else:
            param_grid_list = self.param_grid

        # Nothing to draw; a grid with zero rows cannot be created.
        if len(param_grid_list) == 0:
            return

        nrow = math.ceil(len(param_grid_list)/ncol)
        figsize_y = nrow*figsize_x/ncol*scale_y
        # squeeze=False keeps ``axes`` a 2-D array even for a 1x1 grid, so
        # ``axes.flat`` is always available.
        fig, axes = plt.subplots(
            nrows=nrow, ncols=ncol, figsize=(figsize_x, figsize_y),
            squeeze=False)

        for param, ax in zip(param_grid_list, axes.flat):

            self.set_params(**param)
            self.fit(X)
            labels = self.predict()
            tsne = TSNE(random_state=42)
            X_tsne = tsne.fit_transform(self.X_transformed)

            ax.set_title(str(param))
            # x, y and the label vectors are passed directly, so no ``data``
            # source is needed.
            sns.scatterplot(x=X_tsne[:, 0], y=X_tsne[:, 1], hue=labels,
                            size=labels, style=labels, ax=ax)
        return