幸福感预测 Task14:集成学习案例一(三)


完成数据缺失值、异常值处理和数据增广之后,特征从一开始的131维扩充为了272维。
# Report the dimensions of the combined feature table, then preview it.
print('shape', data.shape)
# Bare expression: the preview is only rendered when this runs as a
# notebook cell; in a plain script it has no visible effect.
data.head()
之后,还要删除一些数值特别少的和之前用过的特征,这里总共删除了9个特征。
# 272 - 9 = 263
# Drop the columns that carry very few values or were already consumed by
# earlier feature-engineering steps.
del_list = [
    'id',
    'survey_time',
    'edu_other',
    'invest_other',
    'property_other',
    'join_party',
    'province',
    'city',
    'county',
]
use_feature = [col for col in data.columns if col not in del_list]

# Remaining missing values are again filled with 0 (in place).
data.fillna(0, inplace=True)

# Number of training rows; the code below treats the first `train_shape`
# rows of `data` as the training set and the rest as the test set.
train_shape = train.shape[0]

# Column index of every feature kept after the deletion above.
features = data[use_feature].columns

X_train_263 = data[:train_shape][use_feature].values
y_train = target
X_test_263 = data[train_shape:][use_feature].values

# Notebook-style check of the result; the accompanying text says this
# leaves 263 features in total.
X_train_263.shape
处理完成之后,还有263维特征。
这里选择了最重要的49个特征,作为除了以上263维特征外的另外一组特征。
# Hand-picked list of the 49 features described as most important; they
# form a second feature set alongside the 263-dimensional one.
imp_fea_49 = [
    'equity', 'depression', 'health', 'class', 'family_status',
    'health_problem', 'class_10_after',
    'equity/province', 'equity/city', 'equity/county',
    'depression/province', 'depression/city', 'depression/county',
    'health/province', 'health/city', 'health/county',
    'class/province', 'class/city', 'class/county',
    'family_status/province', 'family_status/city', 'family_status/county',
    'family_income/province', 'family_income/city', 'family_income/county',
    'floor_area/province', 'floor_area/city', 'floor_area/county',
    'leisure_sum/province', 'leisure_sum/city', 'leisure_sum/county',
    'public_service_sum/province', 'public_service_sum/city',
    'public_service_sum/county',
    'trust_sum/province', 'trust_sum/city', 'trust_sum/county',
    'income/m', 'public_service_sum', 'class_diff', 'status_3_before',
    'age_income_mean', 'age_floor_area_mean', 'weight_jin', 'height_cm',
    'health/age', 'depression/age', 'equity/age', 'leisure_sum/age',
]

# Number of training rows; rows before this offset are used as the
# training set, rows from it onward as the test set.
train_shape = train.shape[0]

X_train_49 = data[:train_shape][imp_fea_49].values
X_test_49 = data[train_shape:][imp_fea_49].values

# Notebook-style check: shape of the 49-feature training matrix.
X_train_49.shape
选择需要进行one-hot编码的离散变量进行one-hot编码,再与其余特征合并为第三类特征,共383维。
# Discrete (categorical) columns that are one-hot encoded.
cat_fea = [
    'survey_type', 'gender', 'nationality', 'edu_status', 'political',
    'hukou', 'hukou_loc', 'work_exper', 'work_status', 'work_type',
    'work_manage', 'marital',
    's_political', 's_hukou', 's_work_exper', 's_work_status', 's_work_type',
    'f_political', 'f_work_14', 'm_political', 'm_work_14',
]
# Every retained feature that is NOT one-hot encoded.
noc_fea = [col for col in use_feature if col not in cat_fea]

# The encoder is fitted on all rows of `data` at once, so the training and
# test slices taken below share the same one-hot column layout.
onehot_data = data[cat_fea].values
enc = preprocessing.OneHotEncoder(categories='auto')
oh_data = enc.fit_transform(onehot_data).toarray()
oh_data.shape  # notebook-style check: shape after one-hot encoding

# Split the encoded matrix back into its training and test rows.
X_train_oh = oh_data[:train_shape, :]
X_test_oh = oh_data[train_shape:, :]
X_train_oh.shape  # notebook-style check: training part only

# Column order: the non-encoded features first, then the one-hot columns.
X_train_383 = np.column_stack([data[:train_shape][noc_fea].values, X_train_oh])
X_test_383 = np.column_stack([data[train_shape:][noc_fea].values, X_test_oh])
X_train_383.shape