【人工智能笔记】第五节:基于TensorFlow 2

该模型是典型的数据预测模型。它接收带时序的多参数输入,并同时预测多个结果数据。
输入维度为(批次大小,历史数据长度,输入参数数量)。这里输入参数取15个,包含:年、月、日,以及上证指数、深证指数与目标股票三者各自的开盘、最高、最低、收盘(4×3=12个)。
输出维度为(批次大小,预测数据长度,输出参数数量)。这里输出参数取12个,包含:上证指数、深证指数与目标股票三者各自的开盘、最高、最低、收盘(4×3=12个)。
训练流程:将数据整理成“历史数据+正确数据”,分批次输入网络;数据先经过编码器,再循环调用解码器并累计loss,最后反向传播更新网络权重,以达到训练的目的。
预测流程:先将历史数据输入编码器,提取特征后传递到解码器。解码器先预测一天的数据,再把该预测结果连同上一状态一起作为输入继续送入解码器,如此循环预测后续几天的数据,最后得到下面的预测结果。
预测效果(前段是已知数据,后段是预测走向):
使用涨跌幅度进行预测。下面为数据加载辅助类,可按区间读取数据用于训练与预测:
import os
import pandas
import datetime
import random
import numpy as np

# The plotting stack is only needed by GuPiaoLoader.show_image. Guarding the
# import keeps the data-loading helpers usable on machines without
# matplotlib / mpl_finance; when both are installed the module-level setup
# below runs exactly as before.
try:
    import matplotlib.pyplot as plt
    import mpl_finance as mpf
    from pandas.plotting import register_matplotlib_converters
except ImportError:
    plt = None
    mpf = None
else:
    plt.rcParams['font.sans-serif'] = ['SimHei']  # render CJK labels correctly
    plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly
    register_matplotlib_converters()

# Raw price columns read from the CSV and the ratio columns derived from them.
_PRICE_COLUMNS = ['开盘', '最高', '最低', '收盘']
_PCT_COLUMNS = ['开盘%', '最高%', '最低%', '收盘%']
# Format of the '日期' (date) column.
_DATE_FORMAT = '%Y/%m/%d'


class GuPiaoLoader():
    """Load stock CSV files and cut them into windows for training/prediction."""

    def __init__(self):
        pass

    def load_one(self, file_name):
        """Load one CSV file and add day-over-day change-ratio columns.

        Four columns ('开盘%', '最高%', '最低%', '收盘%') are appended. Each is
        ``price / base - 1`` where ``base`` is the previous row's close; the
        first row has no previous close, so its own open price is the base.

        Args:
            file_name: Anything ``pandas.read_csv`` accepts. The file must
                provide the '开盘', '最高', '最低' and '收盘' columns.

        Returns:
            The loaded DataFrame with the four ratio columns added. Every NaN
            in the file is replaced by 0.0 before the ratios are computed.
            A file without data rows yields an empty frame.
        """
        print('加载文件', file_name)
        df = pandas.read_csv(file_name)
        # Append the ratio columns (created as NaN), then zero-fill all NaN.
        df = df.reindex(columns=df.columns.tolist() + _PCT_COLUMNS)
        df = df.fillna(value=0.0)
        if df.empty:
            return df
        # Base price per row: previous close, or the open price for row 0.
        base = df['收盘'].shift(1)
        base.iloc[0] = df['开盘'].iloc[0]
        for price_col, pct_col in zip(_PRICE_COLUMNS, _PCT_COLUMNS):
            df[pct_col] = df[price_col] / base - 1
        return df

    def get_random_data(self, df_sh, df_sz, df_target, history_size, target_size, start_index=None):
        """Cut one window of ``history_size + target_size`` rows.

        The two index frames are first restricted to dates on or after the
        first date of ``df_target``; the window is then sliced from the
        restricted ``df_sh`` and the other two frames are left-joined onto it
        by date. Rows missing in either joined frame are dropped, so the
        result can be shorter than the requested window.

        Args:
            df_sh: First index frame (columns get the '_sh' suffix).
            df_sz: Second index frame (columns get the '_sz' suffix).
            df_target: Target stock frame; its row labelled 0 supplies the
                earliest date used.
            history_size: Number of rows before ``start_index``.
            target_size: Number of rows from ``start_index`` onwards.
            start_index: Position of the first target row inside the
                date-restricted ``df_sh``. Chosen at random when None.

        Returns:
            DataFrame with columns: '日期', '年', '月', '日', then the four
            ratio columns for sh, sz and the target (ratios multiplied by 10,
            year/month/day scaled as (year-2000)/20, month/12, day/31).

        Raises:
            ValueError: If ``start_index`` is None and the data is too short
                for the requested window.
        """
        columns = ['日期'] + _PCT_COLUMNS
        # NOTE(review): dates are compared as strings, which presumes a
        # zero-padded 'YYYY/MM/DD' layout -- confirm against the CSV files.
        first_day = df_target.loc[0, '日期']
        sh = df_sh.loc[df_sh['日期'] >= first_day, columns]
        sz = df_sz.loc[df_sz['日期'] >= first_day, columns]
        target = df_target.loc[:, columns]

        if start_index is None:
            upper = len(sh) - target_size
            if upper < history_size:
                raise ValueError(
                    'not enough rows (%d) for history_size=%d and target_size=%d'
                    % (len(sh), history_size, target_size))
            start_index = random.randint(history_size, upper)
        sh = sh[start_index - history_size:start_index + target_size]

        merged = pandas.merge(sh, sz, how='left', on='日期', sort=False,
                              suffixes=('_sh', '_sz'))
        merged = pandas.merge(merged, target, how='left', on='日期', sort=False)
        # Explicit copy so the in-place edits below never touch caller data.
        merged = merged.dropna().copy()

        # Scale every ratio column by 10 (scaling after the join is
        # equivalent to scaling each frame first, and touches far fewer rows).
        value_columns = [name for name in merged.columns if name != '日期']
        merged[value_columns] = merged[value_columns] * 10

        # Normalised calendar features, placed right after the date column.
        dates = pandas.to_datetime(merged['日期'], format=_DATE_FORMAT)
        merged.insert(1, '年', (dates.dt.year - 2000) / 20)
        merged.insert(2, '月', dates.dt.month / 12)
        merged.insert(3, '日', dates.dt.day / 31)
        return merged

    def get_data_to_train(self, df_sh, df_sz, df_target, batch_size, history_size, target_size, start_index=None):
        """Build one training batch.

        Args:
            df_sh, df_sz, df_target: Frames passed to ``get_random_data``.
            batch_size: Number of windows in the batch.
            history_size: Rows per sample used as model input.
            target_size: Rows per sample used as expected output.
            start_index: Forwarded to ``get_random_data``; None draws a new
                random window for every sample.

        Returns:
            Tuple ``(x, y)`` of arrays: ``x`` holds the first
            ``history_size`` rows of each window and ``y`` the following
            ``target_size`` rows, both without the date column.
        """
        x = []
        y = []
        for _ in range(batch_size):
            window = self.get_random_data(df_sh, df_sz, df_target,
                                          history_size, target_size, start_index)
            values = window.values[:, 1:]  # drop the date column
            x.append(values[:history_size, :].tolist())
            y.append(values[history_size:history_size + target_size, :].tolist())
        return np.array(x), np.array(y)

    def get_data_to_predict(self, df_sh, df_sz, df_target, history_size, target_size, start_index=None):
        """Build the model input for a prediction run.

        Args:
            df_sh, df_sz, df_target: Frames passed to ``get_random_data``.
            history_size: Number of known rows fed to the model.
            target_size: Number of future days to create time features for.
            start_index: End position of the history window; defaults to
                ``len(df_target)``.

        Returns:
            Tuple ``(x, time_step)``: ``x`` is the history window without the
            date column, as a float array with a leading axis of length 1;
            ``time_step`` is the output of ``create_time`` for the days after
            the last history row, also with a leading axis of length 1.
        """
        if start_index is None:
            # NOTE(review): the index is applied to the date-restricted sh
            # frame, so this presumes it has as many rows as df_target.
            start_index = len(df_target)
        window = self.get_random_data(df_sh, df_sz, df_target, history_size, 0, start_index)
        # Drop the date column; request float explicitly because the string
        # date column would otherwise force an object-dtype array.
        values = window.iloc[:, 1:].to_numpy(dtype=float)
        x = np.expand_dims(values[:history_size, :], axis=0)
        last_day = window.iloc[history_size - 1, :].loc['日期']
        time_step = np.expand_dims(self.create_time(last_day, target_size), axis=0)
        return x, time_step

    def data_generator(self, df_sh, df_sz, df_target, batch_size, history_size, target_size):
        """Yield random training batches from ``get_data_to_train`` forever."""
        while True:
            x, y = self.get_data_to_train(df_sh, df_sz, df_target,
                                          batch_size, history_size, target_size)
            yield x, y

    def create_time(self, start_time, target_size):
        """Create normalised date features for the days after ``start_time``.

        Args:
            start_time: Last known date as a 'YYYY/MM/DD' string.
            target_size: Number of following weekdays to generate.

        Returns:
            Array with one ``[year, month, day]`` row per generated day,
            scaled as (year-2000)/20, month/12 and day/31. Saturdays and
            Sundays are skipped; public holidays are not.
        """
        current = datetime.datetime.strptime(start_time, _DATE_FORMAT)
        one_day = datetime.timedelta(days=1)
        rows = []
        for _ in range(target_size):
            current += one_day
            # weekday() is a method: 5 and 6 are Saturday and Sunday.
            while current.weekday() >= 5:
                current += one_day
            rows.append([(current.year - 2000) / 20,
                         current.month / 12,
                         current.day / 31])
        return np.array(rows)

    def _to_price_table(self, all_data, start_price=50):
        """Turn scaled change ratios into ``[index, open, high, low, close]`` rows.

        The last four columns of ``all_data`` are read as open/high/low/close
        ratios scaled by 10. Prices are rebuilt cumulatively: each row is
        priced relative to the previous row's close, starting at
        ``start_price``.
        """
        ratios = np.asarray(all_data, dtype=float)[:, -4:] * 0.1
        # cumprod multiplies left to right, i.e. the same running product a
        # row-by-row loop would compute.
        closes = np.cumprod(np.concatenate(([float(start_price)], 1 + ratios[:, 3])))
        previous_close = closes[:-1]
        return np.column_stack([
            np.arange(len(ratios)),
            previous_close * (1 + ratios[:, 0]),
            previous_close * (1 + ratios[:, 1]),
            previous_close * (1 + ratios[:, 2]),
            closes[1:],
        ])

    def show_image(self, history_data, target_data=None):
        """Show a candlestick chart of the known and (optionally) predicted data.

        Args:
            history_data: 2-D array whose last four columns are the scaled
                open/high/low/close ratios of the known days.
            target_data: Optional array of the same layout for the predicted
                days; drawn after the history in different colours.

        Raises:
            ImportError: If matplotlib or mpl_finance is not installed.
        """
        if plt is None or mpf is None:
            raise ImportError('show_image requires matplotlib and mpl_finance')
        all_data = history_data
        if target_data is not None:
            all_data = np.append(history_data, target_data, axis=0)
        all_values = self._to_price_table(all_data)

        fig, ax = plt.subplots(facecolor=(0.5, 0.5, 0.5))
        fig.subplots_adjust(bottom=0.2)
        plt.title("股票K线图")
        plt.xlabel("时间")
        plt.ylabel("股价变化(%)")
        if target_data is not None:
            split = len(history_data)
            # Known days in red/green, predicted days in yellow/blue.
            mpf.candlestick_ohlc(ax, all_values[:split], width=0.5, colorup='r', colordown='g')
            mpf.candlestick_ohlc(ax, all_values[split:], width=0.5, colorup='y', colordown='b')
        else:
            mpf.candlestick_ohlc(ax, all_values, width=0.5, colorup='r', colordown='g')
        plt.show()