数据处理 —— 出租车 GPS 提取订单数据(三)


最后发现并行版本不如单线程跑得快,因此最终采用单线程实现,程序如下:
"""Extract taxi trip requests (orders) from raw taxi GPS records.

@author: HY
@time: 2021/11/1:16:51
"""
import datetime
import os
import pickle
import time

import pandas as pd

try:
    from tqdm import tqdm
except ImportError:
    # tqdm only draws progress bars; fall back to a pass-through so the
    # pipeline still runs in environments where it is not installed.
    def tqdm(iterable, **_kwargs):
        """Return *iterable* unchanged (stand-in for ``tqdm.tqdm``)."""
        return iterable


# Input / output locations used by the pipeline.
RAW_CSV_PATH = '20160920_taxigps.csv'
VEHICLE_PICKLE_PATH = '数据文件/单线程-车辆数据字典.pickle'
REQUEST_PICKLE_PATH = '数据文件/shenzhen_req.pickle'


def _dump_pickle(obj, path):
    """Pickle *obj* to *path*, creating the parent directory if needed."""
    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(obj, f)


class Request:
    """One passenger trip: where/when it started and where/when it ended.

    The end fields stay ``None`` until the trip is closed.
    """

    def __init__(self, start_time, s_lng, s_lat):
        self.start_time = start_time
        self.s_lng = s_lng
        self.s_lat = s_lat
        self.e_lng = None
        self.e_lat = None
        self.end_time = None

    def __repr__(self):
        return (
            f'{type(self).__name__}('
            f'start_time={self.start_time!r}, s_lng={self.s_lng!r}, '
            f's_lat={self.s_lat!r}, end_time={self.end_time!r}, '
            f'e_lng={self.e_lng!r}, e_lat={self.e_lat!r})'
        )


def solve_file(name, file):
    """Group the raw GPS rows by vehicle and pickle the result.

    Args:
        name: Label for the input; accepted for interface compatibility and
            not used by the function.
        file: DataFrame whose rows have exactly ten columns. By position,
            column 1 is the record time, 3 the plate number, 4 the longitude,
            5 the latitude and 8 the passenger state.

    Returns:
        dict mapping plate number -> list of ``[time, state, lng, lat]``
        records in file order. The same dict is pickled to
        ``VEHICLE_PICKLE_PATH``.
    """
    taxi_dic = {}
    # itertuples avoids building a Series per row, which iterrows does.
    rows = file.itertuples(index=False, name=None)
    for row in tqdm(rows, desc='处理成逐个车辆', total=len(file)):
        _, taxi_time, _, plate_number, lng, lat, _, _, state, _ = row
        taxi_dic.setdefault(plate_number, []).append(
            [taxi_time, state, lng, lat]
        )
    print('车辆数目', len(taxi_dic))
    _dump_pickle(taxi_dic, VEHICLE_PICKLE_PATH)
    return taxi_dic


def sum_request(taxi_dic):
    """Extract the trips of every vehicle and pickle them as one list.

    Args:
        taxi_dic: dict as returned by ``solve_file``.

    Returns:
        The combined list of ``Request`` objects, which is also pickled to
        ``REQUEST_PICKLE_PATH``.
    """
    req_list = []
    for records in tqdm(taxi_dic.values(), desc='处理每个车辆'):
        # Trips are detected from state transitions, so each vehicle's
        # records must be in time order first.
        ordered = sorted(records, key=lambda record: record[0])
        req_list.extend(get_request(ordered))
    # Save the order data.
    _dump_pickle(req_list, REQUEST_PICKLE_PATH)
    return req_list


def _close_request(request, record):
    """Fill the end fields of *request* from a ``(time, lng, lat)`` record."""
    request.end_time, request.e_lng, request.e_lat = record


def get_request(sort_data):
    """Turn one vehicle's time-ordered records into a list of trips.

    Args:
        sort_data: list of ``[now, state, lng, lat]`` records, sorted by
            time. State 1 means a passenger is on board, 0 means vacant.

    Returns:
        list of ``Request``. A trip starts at the first occupied record after
        a vacant period and ends at the last occupied record before the next
        vacant one. A trip still open when the data ends is closed at the
        last occupied record.

    Records whose state is neither 0 nor 1 are ignored, so a single
    anomalous status cannot swallow the current trip and every later one.
    """
    request_list = []
    one_request = None    # trip currently in progress, if any
    last_occupied = None  # (time, lng, lat) of that trip's latest record

    for now, state, lng, lat in sort_data:
        if state == 1:
            if one_request is None:
                # First occupied record after a vacant stretch: a trip starts.
                one_request = Request(now, lng, lat)
            last_occupied = (now, lng, lat)
        elif state == 0 and one_request is not None:
            # Vacant again: the trip ended at the previous occupied record.
            _close_request(one_request, last_occupied)
            request_list.append(one_request)
            one_request = None

    if one_request is not None:
        # Data ended while the vehicle was still occupied.
        _close_request(one_request, last_occupied)
        request_list.append(one_request)
    return request_list


def get_file():
    """Run the whole pipeline on the raw CSV and print the stage timings.

    Reads ``RAW_CSV_PATH`` (tab-separated, no header row), groups the rows
    by vehicle, extracts the trips and writes both pickles.
    """
    time1 = time.perf_counter()
    df = pd.read_csv(RAW_CSV_PATH, header=None, sep='\t')
    time2 = time.perf_counter()
    taxi_dic = solve_file('all', df)
    time3 = time.perf_counter()
    sum_request(taxi_dic)
    time4 = time.perf_counter()
    print(f'读数据时间{time2-time1},处理成单车辆时间{time3-time2},统计订单时间{time4-time3}')