在上一篇文章《数据准备<4>:变量筛选-理论篇》中,我们介绍了变量筛选的三种方法:基于经验的方法、基于统计的方法和基于机器学习的方法,本文将介绍后两种方法在Python(sklearn)环境下的具体实现。
1.环境介绍
版本:python2.7
工具:Spyder
开发人:hbsygfz
数据集:sklearn中自带的cancer数据集,可参考官方介绍
from sklearn.datasets import load_breast_cancer

# Load the breast-cancer dataset bundled with scikit-learn and keep the four
# pieces that the rest of the article works with.
cancer = load_breast_cancer()
data, target = cancer.data, cancer.target
feature_names, target_names = cancer.feature_names, cancer.target_names

# Show the dataset shape, the feature names and the class labels.
print("数据集大小:{}".format(data.shape))
print("特征:{}".format(feature_names))
print("目标变量:{}".format(target_names))

# Out[1]:
# 数据集大小:(569, 30)
# 特征:['mean radius' 'mean texture' 'mean perimeter' 'mean area'
#  'mean smoothness' 'mean compactness' 'mean concavity'
#  'mean concave points' 'mean symmetry' 'mean fractal dimension'
#  'radius error' 'texture error' 'perimeter error' 'area error'
#  'smoothness error' 'compactness error' 'concavity error'
#  'concave points error' 'symmetry error' 'fractal dimension error'
#  'worst radius' 'worst texture' 'worst perimeter' 'worst area'
#  'worst smoothness' 'worst compactness' 'worst concavity'
#  'worst concave points' 'worst symmetry' 'worst fractal dimension']
# 目标变量:['malignant' 'benign']
#
# As the output shows, the cancer dataset has shape (569, 30): 569 samples
# with 30 features each, and a binary target variable whose two values are
# 'malignant' and 'benign'.
从数据上看,这30个特征均为连续型数据。结合官方介绍,了解到目标变量的分布及含义如下:malignant(恶性,取值为0)共212条样本,benign(良性,取值为1)共357条样本。
# 3. Code implementation
# 3.1 Statistics-based methods
# (0) Preparation
import pandas as pd

## 1. Load the dataset
# `data`, `feature_names` and `target` come from the load_breast_cancer()
# snippet earlier in this article.
targetCol = 'is_benign'                           # target variable
dataDf = pd.DataFrame(data, columns=list(feature_names))
targetSer = pd.Series(target, name=targetCol)
dataset = pd.concat([dataDf, targetSer], axis=1)  # features and target side by side
discreteColList = []                              # discrete variables (none in this dataset)
continuousColList = list(feature_names)           # continuous variables


## 2. Data preprocessing
def _buildRankBinTable(rowCnt, bins):
    """Build the lookup table that maps a 1-based rank to its bin number.

    The positions 1..rowCnt are cut by ``pd.qcut`` into `bins` quantile
    groups labelled 0..bins-1.

    Args:
        rowCnt: number of rows of the frame that is being binned.
        bins: number of bins, passed straight to ``pd.qcut``.

    Returns:
        DataFrame with the columns 'rank' and 'bin'.
    """
    # Series.rank() numbers rows starting at 1, so the positions have to
    # start at 1 as well; a table built from 0..rowCnt-1 has no entry for the
    # highest rank, which leaves the largest value of every column unbinned.
    rankSer = pd.Series(range(1, rowCnt + 1), name='rank')
    binSer = pd.qcut(rankSer, bins, labels=range(bins))
    binSer.name = 'bin'
    return pd.concat([rankSer, binSer], axis=1)


def _assignBins(valueSer, rankBinDf):
    """Look up the bin of every value of `valueSer`, in row order.

    Values are ranked with ``method='min'`` (tied values share one rank and
    therefore one bin) and the rank is looked up in `rankBinDf`.

    Args:
        valueSer: Series holding one continuous variable.
        rankBinDf: table produced by ``_buildRankBinTable``.

    Returns:
        Series named 'bin'. It carries the fresh 0..n-1 index created by the
        merge, not the index of `valueSer`. Ranks without a match in
        `rankBinDf` end up as NaN.
    """
    rankDf = valueSer.rank(method='min').to_frame('rank')
    # A left merge keeps the row order of the left frame.
    return pd.merge(rankDf, rankBinDf, on='rank', how='left')['bin']


def binCreate(df, bins):
    """Replace every column of `df` by its equal-frequency bin number.

    Args:
        df: DataFrame that contains only continuous variables.
        bins: number of bins per variable.

    Returns:
        DataFrame with the same columns and the same index as `df`; each
        cell holds the bin label (0..bins-1) of the original value.
    """
    rankBinDf = _buildRankBinTable(df.shape[0], bins)
    # .values drops the merge's fresh index so that the rows stay aligned
    # with `df` even when its index is not 0..m-1.
    binnedByCol = {
        col: _assignBins(df[col], rankBinDf).values for col in df.columns
    }
    return pd.DataFrame(binnedByCol, index=df.index, columns=df.columns)


def binDistStatistic(df, tag):
    """Compute the target distribution inside every bin (category).

    Args:
        df: DataFrame of binned / categorical variables plus the target
            column.
        tag: name of the target column. The positive count is taken as the
            sum of this column, so it is assumed to be coded 0/1.

    Returns:
        DataFrame with one row per (variable, bin) and the columns
        colName, bin, binAllCnt, binPosCnt, binNegCnt, binPosRto, binNegRto,
        allCnt, posCnt, negCnt, posRto, negRto, binPosCov
        (= binPosCnt / posCnt) and binNegCov (= binNegCnt / negCnt).
        The row index restarts at 0 for every variable.

    Raises:
        ValueError: if `tag` is not a column of `df`.
    """
    distCols = ['colName', 'bin', 'binAllCnt', 'binPosCnt', 'binNegCnt',
                'binPosRto', 'binNegRto']
    colList = list(df.columns)
    colList.remove(tag)  # every column except the target is a variable

    frames = []
    for col in colList:
        statDf = df.groupby(col)[tag].agg(['count', 'sum'])
        statDf.columns = ['binAllCnt', 'binPosCnt']
        # Grouping by a categorical column can list bins that hold no rows.
        statDf = statDf[statDf['binAllCnt'] > 0]
        statDf = statDf.reset_index().rename(columns={col: 'bin'})
        statDf['colName'] = col
        statDf['binNegCnt'] = statDf['binAllCnt'] - statDf['binPosCnt']
        # "* 1.0" forces float division under Python 2 as well.
        statDf['binPosRto'] = statDf['binPosCnt'] * 1.0 / statDf['binAllCnt']
        statDf['binNegRto'] = statDf['binNegCnt'] * 1.0 / statDf['binAllCnt']
        frames.append(statDf.reindex(columns=distCols))

    # Concatenate once instead of growing the result inside the loop.
    if frames:
        resDf = pd.concat(frames)
    else:
        resDf = pd.DataFrame(columns=distCols)

    # Overall figures, repeated on every row.
    rows = df.shape[0]
    posCnt = df[tag].sum()
    negCnt = rows - posCnt
    resDf['allCnt'] = rows                    # total number of samples
    resDf['posCnt'] = posCnt                  # total number of positives
    resDf['negCnt'] = negCnt                  # total number of negatives
    resDf['posRto'] = posCnt * 1.0 / rows     # overall positive ratio
    resDf['negRto'] = negCnt * 1.0 / rows     # overall negative ratio
    resDf['binPosCov'] = resDf['binPosCnt'] * 1.0 / resDf['posCnt']
    resDf['binNegCov'] = resDf['binNegCnt'] * 1.0 / resDf['negCnt']
    return resDf


def binAttrStatistic(df, cont, disc, bins):
    """Describe every bin (category): its value range and a text label.

    Args:
        df: DataFrame with the raw (unbinned) variables.
        cont: names of the continuous columns, binned into `bins` groups.
        disc: names of the discrete columns; each distinct value is a bin.
        bins: number of bins for the continuous columns.

    Returns:
        DataFrame with the columns colName, bin, minVal, maxVal and
        binInterval. For continuous variables binInterval is
        '<minVal>-<maxVal>'; for discrete variables bin and binInterval both
        hold the category value and minVal / maxVal are NaN.
        The row index restarts at 0 for every variable.
    """
    attrCols = ['colName', 'bin', 'minVal', 'maxVal', 'binInterval']
    rankBinDf = _buildRankBinTable(df.shape[0], bins)

    frames = []
    for col in cont:
        binSer = _assignBins(df[col], rankBinDf)
        # Give the values the same 0..m-1 index that binSer carries.
        valueSer = df[col].reset_index(drop=True)
        rangeDf = valueSer.groupby(binSer).agg(['min', 'max'])
        rangeDf.columns = ['minVal', 'maxVal']
        # Bins that received no row have neither a minimum nor a maximum.
        rangeDf = rangeDf.dropna(subset=['minVal', 'maxVal'], how='all')
        rangeDf = rangeDf.reset_index()
        rangeDf['colName'] = col
        rangeDf['binInterval'] = (rangeDf['minVal'].astype('str') + '-'
                                  + rangeDf['maxVal'].astype('str'))
        frames.append(rangeDf.reindex(columns=attrCols))

    for col in disc:
        uniqueVals = df[col].unique()
        catDf = pd.DataFrame({'bin': uniqueVals, 'binInterval': uniqueVals})
        catDf['colName'] = col
        frames.append(catDf.reindex(columns=attrCols))

    if not frames:
        return pd.DataFrame(columns=attrCols)
    return pd.concat(frames)


def binStatistic(df, cont, disc, tag, bins):
    """Bin the continuous variables and return the full per-bin report.

    Args:
        df: DataFrame with the raw variables and the target column.
        cont: names of the continuous columns.
        disc: names of the discrete columns.
        tag: name of the target column.
        bins: number of bins for the continuous columns.

    Returns:
        DataFrame with one row per (variable, bin): the distribution figures
        of ``binDistStatistic`` joined with the bin description of
        ``binAttrStatistic`` on colName and bin.
    """
    binnedDf = binCreate(df[cont], bins)                          # bin the continuous variables
    binData = pd.concat([binnedDf, df[disc], df[tag]], axis=1)    # add discrete variables and target
    distDf = binDistStatistic(binData, tag)                       # distribution per bin
    attrDf = binAttrStatistic(df, cont, disc, bins)               # value range per bin
    mergedDf = pd.merge(distDf, attrDf, on=['colName', 'bin'], how='left')
    return mergedDf.reindex(columns=[
        'colName', 'bin', 'binInterval', 'minVal', 'maxVal',
        'binAllCnt', 'binPosCnt', 'binNegCnt', 'binPosRto', 'binNegRto',
        'allCnt', 'posCnt', 'negCnt', 'posRto', 'negRto',
        'binPosCov', 'binNegCov',
    ])