# Source code for feature

import numpy as np
import pandas as pd
from scipy import stats
from sklearn.preprocessing import (MinMaxScaler, PowerTransformer,
                                   QuantileTransformer, RobustScaler,
                                   StandardScaler)
from statsmodels.stats.outliers_influence import variance_inflation_factor

# 2021.09.14 Created by Daniel SY wang


def get_corr(data, method='pearson', threshold=0.7):
    """Return the column pairs whose correlation exceeds a threshold.

    Intended for feature selection: strongly correlated column pairs are
    candidates for removal in order to reduce multicollinearity.

    Parameters
    ----------
    data : pandas.DataFrame
        Input data; the correlation matrix is computed with ``data.corr``.
    method : str
        'pearson' (default), 'spearman' or 'kendall'.
    threshold : float
        A pair is reported when the absolute value of its correlation
        coefficient is strictly greater than this value. Default 0.7.

    Returns
    -------
    pandas.DataFrame
        One row per reported pair with the columns ``Feature1``,
        ``Feature2`` and ``Corr`` (the signed coefficient), sorted by
        ``Feature1``. Each unordered pair appears at most once.
    """
    corr = data.corr(method=method)
    names = list(corr.columns)
    # Positional access avoids repeated label lookups and stays unambiguous
    # even when two columns share the same label.
    values = corr.values
    size = len(names)

    # Walk the strict upper triangle so every unordered pair is visited once.
    records = [
        [names[i], names[j], values[i, j]]
        for i in range(size)
        for j in range(i + 1, size)
        if abs(values[i, j]) > threshold
    ]

    df = pd.DataFrame(records, columns=['Feature1', 'Feature2', 'Corr'])
    # A stable sort keeps pairs with the same Feature1 in discovery order.
    df = df.sort_values('Feature1', ascending=True, kind='mergesort')
    return df.reset_index(drop=True)
def get_VIF(data):
    """Compute the variance inflation factor (VIF) of every column.

    Intended for feature selection: columns with a large VIF are candidates
    for removal in order to reduce multicollinearity.

    The values of ``data`` are handed to statsmodels'
    ``variance_inflation_factor`` exactly as given; this function does not
    add a constant column or otherwise preprocess the data.

    Parameters
    ----------
    data : pandas.DataFrame
        Input data, one feature per column.

    Returns
    -------
    pandas.DataFrame
        Columns ``VIF`` and ``Column`` (the original column label), sorted
        by ``VIF`` in ascending order with a fresh integer index.
    """
    # Extract the array once instead of once per column.
    exog = data.values
    vif_values = [
        variance_inflation_factor(exog, position)
        for position in range(data.shape[1])
    ]
    vif_info = pd.DataFrame({'VIF': vif_values, 'Column': data.columns})
    return vif_info.sort_values('VIF', ascending=True).reset_index(drop=True)
def normal_test(data, method='jb'):
    """Test every column of ``data`` for normality at the 5% level.

    Null hypothesis: the column is drawn from a normal distribution.

    | p-value >= 0.05: the null hypothesis is not rejected, reported as 'Normal'.
    | p-value < 0.05: the null hypothesis is rejected, reported as '-'.

    Infinite values are treated as missing, and missing values are dropped
    before testing. The input frame itself is never modified.

    Available tests (all three are meant for large samples):

    * 'jb'   -- ``scipy.stats.jarque_bera(sample)``
    * 'ks'   -- ``scipy.stats.kstest(sample, 'norm')``. No location or scale
      arguments are passed, so the sample is compared with the standard
      normal distribution.
    * 'norm' -- ``scipy.stats.normaltest(sample, axis=None)``

    Parameters
    ----------
    data : pandas.DataFrame
        Input data, one variable per column.
    method : str
        'jb' (default), 'ks' or 'norm'.

    Returns
    -------
    pandas.DataFrame
        A single column (labelled ``0``) indexed by the column labels of
        ``data``, holding 'Normal' or '-' for each column.

    Raises
    ------
    ValueError
        If ``method`` is not one of 'jb', 'ks', 'norm'.
    """
    tests = {
        'jb': stats.jarque_bera,
        'ks': lambda sample: stats.kstest(sample, 'norm'),
        'norm': lambda sample: stats.normaltest(sample, axis=None),
    }
    # Fail early: otherwise the p-value would be undefined (or stale) below.
    if method not in tests:
        raise ValueError(
            "method must be one of 'jb', 'ks', 'norm'; got %r" % (method,))
    run_test = tests[method]

    labels = []
    for col in data.columns:
        # Non-mutating replace: the caller's DataFrame must stay untouched.
        sample = data[col].replace([np.inf, -np.inf], np.nan).dropna()
        _, pval = run_test(sample.values)
        labels.append('-' if pval < 0.05 else 'Normal')

    return pd.DataFrame(data=labels, index=data.columns)
def transformer(data, trans_method_list='all', test_method='jb'):
    """Apply transformations to ``data`` and test each result for normality.

    The original data and every transformed version are passed through
    ``normal_test``; the outcomes are collected side by side.

    Infinite values in ``data`` are treated as missing on an internal copy;
    the caller's frame is not modified by this function.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric input data, one variable per column.
    trans_method_list : list of str or str
        Any selection of 'sqrt', 'cbrt', 'log1p', 'reciprocal', 'square',
        'cube', 'expm1', 'minmax', 'zscore', 'robust', 'quant', 'box',
        'yeo', for example ``['minmax', 'zscore', 'robust', 'box', 'yeo']``.
        The string 'all' (default) selects every method; any other single
        string is treated as a one-element list.
    test_method : str
        'jb' (default), 'ks' or 'norm'; forwarded to ``normal_test``.

    Returns
    -------
    pandas.DataFrame
        Indexed by the column labels of ``data``. The first column is named
        'orginal' (spelling kept for backward compatibility) and holds the
        result for the untransformed data; it is followed by one column per
        requested method, in the requested order.

    Raises
    ------
    ValueError
        If ``trans_method_list`` contains an unknown method name.

    Notes
    -----
    The Box-Cox transformation ('box') can only be applied to strictly
    positive data.
    """
    # Element-wise transformations applied directly to the frame.
    elementwise = {
        'sqrt': np.sqrt,
        'cbrt': np.cbrt,
        'log1p': np.log1p,
        # Cast first: np.reciprocal on integers performs integer division.
        'reciprocal': lambda frame: np.reciprocal(frame.astype(float)),
        'square': np.square,
        'cube': lambda frame: np.power(frame, 3),
        'expm1': np.expm1,
    }
    # Factories for the scikit-learn transformers (fitted per call).
    estimators = {
        'minmax': MinMaxScaler,
        'zscore': StandardScaler,
        'robust': RobustScaler,
        'quant': lambda: QuantileTransformer(
            n_quantiles=500, output_distribution='normal', random_state=100),
        'box': lambda: PowerTransformer(method='box-cox'),
        'yeo': lambda: PowerTransformer(method='yeo-johnson'),
    }
    all_methods = ['sqrt', 'cbrt', 'log1p', 'reciprocal', 'square', 'cube',
                   'expm1', 'minmax', 'zscore', 'robust', 'quant', 'box',
                   'yeo']

    if isinstance(trans_method_list, str):
        if trans_method_list == 'all':
            methods = list(all_methods)
        else:
            methods = [trans_method_list]
    else:
        # Accept any iterable (tuple, Index, ...), not only a list.
        methods = list(trans_method_list)

    # Reject unknown names instead of silently reusing a previous result.
    unknown = [name for name in methods
               if name not in elementwise and name not in estimators]
    if unknown:
        raise ValueError(
            'unknown transform method(s) %r; expected any of %r'
            % (unknown, all_methods))

    # Work on a cleaned copy so the scalers never see +/-inf and the
    # caller's data is left as it was.
    clean = data.replace([np.inf, -np.inf], np.nan)

    outcomes = [normal_test(clean, method=test_method)]
    for name in methods:
        if name in elementwise:
            transformed = elementwise[name](clean)
        else:
            transformed = pd.DataFrame(
                estimators[name]().fit_transform(clean),
                columns=clean.columns)
        outcomes.append(normal_test(transformed, method=test_method))

    # Single concat instead of growing the frame on every iteration.
    result = pd.concat(outcomes, axis=1)
    result.columns = ['orginal'] + methods
    return result