Source code for plot

import itertools
import math

import pandas as pd
import numpy as np
from sklearn.preprocessing import (MinMaxScaler, PowerTransformer,
                                   QuantileTransformer, RobustScaler,
                                   StandardScaler)
import matplotlib.pyplot as plt
import seaborn as sns

# 2021.09.14 Created by Daniel SY wang


[docs]def get_colpairlist(col_list): """ 生成列对的列表,为探究多变量间分布关系作准备. colpairlist的结构为[(x1,y1),(x2,y2),...,(xp1,yp1),(xp2,yp2),...,...] Parameters ---------- col_list : list such as ['col1','col2','col3','col4'] Returns ------ colpairlist : list of tuple column pair list for plot,such as [('col1','col2'),('col1','col3'),('col3','col4')] """ # col_list=list(data.columns)[:5] colpairlist = list(itertools.combinations(col_list, 2)) return colpairlist
[docs]def pair_plot(data, colpairlist, ncol=5, figsize_x=20, scale_y=1, plot_type='scatter'): """ 生成多个关系图的列表,探究多变量间分布关系。 绘图方法包括:散点图(scatter)、散点回归图(reg)、回归残差图(resid)、散点分布图(joint)、核密度估计图(kde) Parameters ---------- data : df pandas Dataframe colpairslist : list of tuple column pair list, such as [('col1','col2'),('col1','col3'),('1','col4'),('col2','col3'),('col2','col4')] ncol : int subfigure column numbers in a row figsize_x : float sub-figure width scale_y : float scale height plot_type : string 'scatter'(default), 'reg','resid','joint','kde' Returns ------- return : None plot multiple figure matrix """ nrow = math.ceil(len(colpairlist)/ncol) if plot_type in ['scatter', 'reg', 'resid', 'kde']: figsize_y = nrow*figsize_x/ncol*scale_y fig, axes = plt.subplots( nrows=nrow, ncols=ncol, figsize=(figsize_x, figsize_y)) for colpair, ax in zip(colpairlist, axes.flat): if plot_type == 'scatter': sns.scatterplot(data=data, x=colpair[0], y=colpair[1], ax=ax) elif plot_type == 'reg': sns.regplot(data=data, x=colpair[0], y=colpair[1], ax=ax) elif plot_type == 'resid': sns.residplot( data=data, x=colpair[0], y=colpair[1], ax=ax, lowess=True) elif plot_type == 'kde': sns.kdeplot( data=data, x=colpair[0], y=colpair[1], shade=True, ax=ax) if plot_type == 'joint': for colpair in colpairlist: sns .jointplot(data=data, x=colpair[0], y=colpair[1]) return
[docs]def hist_plot(data, ncol=5, n_bin=100, figsize_x=20, scale_y=1): """ plot histograms of DataFrame Parameters ---------- data : df pandas Dataframe ncol : int subfigure column numbers in a row figsize_x : float sub-figure width scale_y : float scale height Returns ------- return : None plot multiple figures """ nrow = math.ceil(len(list(data.columns))/ncol) figsize_y = nrow*figsize_x/ncol*scale_y data.hist(layout=(-1, ncol), figsize=(figsize_x, figsize_y), bins=n_bin)
[docs]def trans_plot(data, trans_method_list='all', ncol=5, n_bin=100, figsize_x=20, scale_y=1): """ plot histograms of each column and its transformed of DataFrame \n Parameters ---------- data : df pandas Dataframe trans_method_list : list a list of 'sqrt','cbrt','log1p','reciprocal', 'square','cube','expm1','minmax','zscore', 'robust','quant','box','yeo'. user can choice some transfomer method as a list. such as: list of 'minmax','zscore','robust','box','yeo'. default includes all method. ncol : int subfigure column numbers in a row figsize_x : float sub-figure width scale_y : float scale height Returns ------- return : None plot multiple figures """ if trans_method_list == 'all': trans_method_list = ['sqrt', 'cbrt', 'log1p', 'reciprocal', 'square', 'cube', 'expm1', 'minmax', 'zscore', 'robust', 'quant', 'box', 'yeo'] for col in data.columns: data_col = np.array(data[col]).reshape(-1, 1) df = pd.DataFrame(data=data_col, columns=[col]) for trans_method in trans_method_list: try: if trans_method == 'sqrt': temp_data = np.sqrt(data_col) elif trans_method == 'cbrt': temp_data = np.cbrt(data_col) elif trans_method == 'log1p': temp_data = np.log1p(data_col) elif trans_method == 'reciprocal': temp_data = np.reciprocal(data_col) elif trans_method == 'square': temp_data = np.square(data_col) elif trans_method == 'cube': temp_data = np.power(data_col, 3) elif trans_method == 'expm1': temp_data = np.expm1(data_col) elif trans_method == 'minmax': trans = MinMaxScaler() elif trans_method == 'zscore': trans = StandardScaler() elif trans_method == 'robust': trans = RobustScaler() elif trans_method == 'quant': trans = QuantileTransformer( n_quantiles=500, output_distribution='normal', random_state=100) elif trans_method == 'box': # The Box-Cox transformation can only be applied to strictly positive data trans = PowerTransformer(method='box-cox') elif trans_method == 'yeo': trans = PowerTransformer(method='yeo-johnson') if trans_method in ['minmax', 'zscore', 'robust', 'quant', 'box', 'yeo']: temp_data = trans.fit_transform(data_col) col_name = col+'__'+trans_method df[col_name] = temp_data except: pass hist_plot(df, ncol=ncol, n_bin=n_bin, figsize_x=figsize_x, scale_y=scale_y) return