幸福感预测 Task14:集成学习案例一(二)


使用众数处理缺失值
# Replace invalid answers (negative codes) in `data`, either with the column's
# most frequent valid answer or with a fixed default value.

# leisure_4 .. leisure_12: fill with the mode.
# Series.mode() returns a Series, and assigning a Series through .loc aligns on
# the index, which writes NaN into masked rows whose label is not in the mode's
# index. The scalar mode is therefore extracted explicitly. The mode is taken
# over the valid (non-negative) answers only, so a negative code can never be
# chosen as the fill value.
for _leisure_col in ['leisure_' + str(i) for i in range(4, 13)]:
    _invalid = data[_leisure_col] < 0
    _valid_modes = data.loc[~_invalid, _leisure_col].mode()
    if _invalid.any() and not _valid_modes.empty:
        data.loc[_invalid, _leisure_col] = _valid_modes.iloc[0]

# Fixed fill values per column (insertion order follows the original script).
_NEGATIVE_CODE_DEFAULTS = {
    'socialize': 2,  # seldom
    'relax': 4,  # often
    'learn': 1,  # never
    # social life
    'social_neighbor': 0,
    'social_friend': 0,
    'socia_outing': 1,
    # The original wrote this value into 'social_neighbor' (copy-paste slip);
    # the mask column itself is the one that has to be repaired.
    'neighbor_familiarity': 4,
    # social equity
    'equity': 4,
    # social class
    'class_10_before': 3,
    'class': 5,
    'class_10_after': 5,
    'class_14': 2,
    # work situation
    'work_status': 0,
    'work_yr': 0,
    'work_manage': 0,
    'work_type': 0,
    # social insurance
    'insur_1': 1,
    'insur_2': 1,
    'insur_3': 1,
    'insur_4': 1,
}
for _col, _default in _NEGATIVE_CODE_DEFAULTS.items():
    data.loc[data[_col] < 0, _col] = _default

# The original additionally set insur_1..insur_4 to 0 where they already
# equalled 0; those assignments were no-ops and have been dropped.
年龄分层
# Running feature count: 144 + 1 = 145
# Continue with the columns that need special handling.
# (Original note: read happiness_index.xlsx.)

# Reduce the survey timestamp to its year so the age can be computed.
# errors='coerce' turns values that do not match the format into NaT instead
# of raising.
data['survey_time'] = pd.to_datetime(
    data['survey_time'],
    format='%Y-%m-%d',
    errors='coerce',
).dt.year
data['age'] = data['survey_time'] - data['birth']

# Age buckets: 145 + 1 = 146
bins = [0, 17, 26, 34, 50, 63, 100]
data['age_bin'] = pd.cut(data['age'], bins, labels=list(range(len(bins) - 1)))
5. 数据增广
除了基本数据之外,每个属性并不是孤立存在的,属性之间的联系同样非常重要。有时候,充分挖掘数据之间的关系,会使模型的效果得到很大提升。
部分增广数据
# Derived ("augmented") features built from the columns of `data`.
# The numbers in the comments are the running column count kept by the
# original author.

# --- Marriage ---------------------------------------------------------------
# Age at first marriage (147)
data['marital_1stbir'] = data['marital_1st'] - data['birth']
# Age at the most recent marriage (148)
data['marital_nowtbir'] = data['marital_now'] - data['birth']
# Remarriage indicator (149): most recent marriage age minus first marriage
# age; a non-zero value means the respondent remarried.
data['mar'] = data['marital_nowtbir'] - data['marital_1stbir']
# Spouse's age at the current marriage (150)
data['marital_sbir'] = data['marital_now'] - data['s_birth']
# Age gap between respondent and spouse (151)
data['age_'] = data['marital_nowtbir'] - data['marital_sbir']

# --- Income ratios: 151 + 7 = 158 --------------------------------------------
# The small constants added to the denominators avoid division by zero.
# Income relative to the spouse's income
data['income/s_income'] = data['income'] / (data['s_income'] + 1)
# Combined income of respondent and spouse. The original added a stray "+ 1"
# here (copied from the ratio above), which is not part of a sum.
data['income+s_income'] = data['income'] + data['s_income']
# Share of the respondent's income in the family income
data['income/family_income'] = data['income'] / (data['family_income'] + 1)
# Share of the couple's income in the family income
data['all_income/family_income'] = (
    (data['income'] + data['s_income']) / (data['family_income'] + 1)
)
data['income/inc_exp'] = data['income'] / (data['inc_exp'] + 1)
data['family_income/m'] = data['family_income'] / (data['family_m'] + 0.01)
data['income/m'] = data['income'] / (data['family_m'] + 0.01)

# --- Income / floor-area ratios: 158 + 4 = 162 --------------------------------
data['income/floor_area'] = data['income'] / (data['floor_area'] + 0.01)
data['all_income/floor_area'] = (
    (data['income'] + data['s_income']) / (data['floor_area'] + 0.01)
)
data['family_income/floor_area'] = data['family_income'] / (data['floor_area'] + 0.01)
data['floor_area/m'] = data['floor_area'] / (data['family_m'] + 0.01)

# --- Social class differences: 162 + 3 = 165 ----------------------------------
data['class_10_diff'] = data['class_10_after'] - data['class']
data['class_diff'] = data['class'] - data['class_10_before']
data['class_14_diff'] = data['class'] - data['class_14']

# --- Aggregated indices --------------------------------------------------------
# Leisure index (166)
leisure_fea_lis = ['leisure_' + str(i) for i in range(1, 13)]
data['leisure_sum'] = data[leisure_fea_lis].sum(axis=1)  # skew
# Satisfaction index (167)
public_service_fea_lis = ['public_service_' + str(i) for i in range(1, 10)]
data['public_service_sum'] = data[public_service_fea_lis].sum(axis=1)  # skew
# Trust index (168)
trust_fea_lis = ['trust_' + str(i) for i in range(1, 14)]
data['trust_sum'] = data[trust_fea_lis].sum(axis=1)  # skew

# --- Group means and "relative to group" ratios --------------------------------
# The same 13 features are averaged per group and then divided by that average.
# Loops replace the original copy-pasted statements; column names and the order
# in which columns are created are unchanged.
_GROUP_STAT_FEATURES = [
    'income', 'family_income', 'equity', 'depression', 'floor_area', 'health',
    'class_10_diff', 'class', 'health_problem', 'family_status',
    'leisure_sum', 'public_service_sum', 'trust_sum',
]
_REGION_KEYS = ['province', 'city', 'county']

# Province / city / county means: 168 + 3 * 13 = 207
for _key in _REGION_KEYS:
    for _fea in _GROUP_STAT_FEATURES:
        data[_key + '_' + _fea + '_mean'] = (
            data.groupby([_key])[_fea].transform('mean').values
        )

# Ratio to the province / city / county mean: 207 + 3 * 13 = 246
# The original added 1 to the denominator of 'trust_sum/province' only; that
# inconsistency is removed so all ratios are computed the same way.
# NOTE(review): a group mean of exactly 0 (conceivable for class_10_diff)
# produces inf/NaN here; no handling for that is visible in this chunk.
for _key in _REGION_KEYS:
    for _fea in _GROUP_STAT_FEATURES:
        data[_fea + '/' + _key] = data[_fea] / data[_key + '_' + _fea + '_mean']

# Mean per age: 246 + 13 = 259
for _fea in _GROUP_STAT_FEATURES:
    data['age_' + _fea + '_mean'] = (
        data.groupby(['age'])[_fea].transform('mean').values
    )

# Ratio to people of the same age: 259 + 13 = 272
for _fea in _GROUP_STAT_FEATURES:
    data[_fea + '/age'] = data[_fea] / data['age_' + _fea + '_mean']