2018阿里广告点击率预估模型---DIN,Tensorflow2.0代码实践( 三 )


"""
# Build per-user samples: every prefix of a user's interaction list becomes a
# history, and the next item (plus one sampled negative) becomes the target.
for reviewerID, hist in reviews_df.groupby('reviewerID'):
    # Every item the user interacted with is a positive sample.
    pos_list = hist['asin'].tolist()
    max_sl = max(max_sl, len(pos_list))
    # Set copy of the positives so the rejection test below is O(1) per draw.
    pos_set = set(pos_list)

    def gen_neg():
        """Return a random item id that is not among this user's positives."""
        # Start from a known positive so the loop draws at least once.
        neg = pos_list[0]
        while neg in pos_set:
            neg = random.randint(0, item_count - 1)
        return neg

    # One negative per positive (1:1 ratio).
    neg_list = [gen_neg() for _ in pos_list]
    last_idx = len(pos_list) - 1
    for i in range(1, len(pos_list)):
        # History for step i: everything the user interacted with before it.
        hist = pos_list[:i]
        sl = len(hist)
        # Row format: (user id, target item id, history, history length, label 1/0).
        if i != last_idx:
            train_set.append((reviewerID, pos_list[i], hist, sl, 1))
            train_set.append((reviewerID, neg_list[i], hist, sl, 0))
        else:
            # The user's final interaction is held out for the test set.
            test_set.append((reviewerID, pos_list[i], hist, sl, 1))
            test_set.append((reviewerID, neg_list[i], hist, sl, 0))

# Shuffle both splits.
random.shuffle(train_set)
random.shuffle(test_set)

# Each user with at least two interactions contributes exactly two test rows
# (one positive, one negative), so the expected size is 2 * user_count.
# NOTE(review): assumes user_count equals the number of reviewerID groups and
# that every user has >= 2 interactions - confirm against the preprocessing step.
assert len(test_set) == 2 * user_count

# Write everything to dataset.pkl (the 'dataset' directory must already exist).
with open('dataset/dataset.pkl', 'wb') as f:
    pickle.dump(train_set, f, pickle.HIGHEST_PROTOCOL)
    pickle.dump(test_set, f, pickle.HIGHEST_PROTOCOL)
    pickle.dump(cate_list, f, pickle.HIGHEST_PROTOCOL)
    pickle.dump((user_count, item_count, cate_count, max_sl), f, pickle.HIGHEST_PROTOCOL)

【2018阿里广告点击率预估模型---DIN,Tensorflow2.0代码实践】模型构建
1、定义模型所需的各种层
class DIN(tf.keras.Model):
    """Keras model holding the layers of the DIN click-through-rate network."""

    def __init__(self, user_num, item_num, cate_num, cate_list, hidden_units):
        """Instantiate every layer used by the model.

        :param user_num: number of users (only referenced by the disabled
            user embedding noted below)
        :param item_num: number of items; vocabulary size of ``item_embed``
        :param cate_num: number of item categories; vocabulary size of
            ``cate_embed``
        :param cate_list: item category list, stored as an int32 tensor
        :param hidden_units: output width of both embeddings and of
            ``self.dense``
        """
        super().__init__()
        layers = tf.keras.layers

        def build_embedding(vocab_size, layer_name):
            # Both embeddings share the same initializer and L2 penalty; each
            # call creates its own regularizer instance.
            return layers.Embedding(
                input_dim=vocab_size,
                output_dim=hidden_units,
                embeddings_initializer='random_uniform',
                embeddings_regularizer=tf.keras.regularizers.l2(0.01),
                name=layer_name,
            )

        self.cate_list = tf.convert_to_tensor(cate_list, dtype=tf.int32)
        self.hidden_units = hidden_units

        # A user embedding is intentionally left disabled:
        # self.user_embed = tf.keras.layers.Embedding(
        #     input_dim=user_num, output_dim=hidden_units,
        #     embeddings_initializer='random_uniform',
        #     embeddings_regularizer=tf.keras.regularizers.l2(0.01),
        #     name='user_embed')
        self.item_embed = build_embedding(item_num, 'item_embed')
        self.cate_embed = build_embedding(cate_num, 'cate_embed')

        self.dense = layers.Dense(hidden_units)
        self.bn1 = layers.BatchNormalization()
        self.concat = layers.Concatenate(axis=-1)

        # 80 -> 40 -> 1 stack; presumably the attention scoring MLP, judging by
        # the ``att_`` prefix - its use is outside this block.
        self.att_dense1 = layers.Dense(80, activation='sigmoid')
        self.att_dense2 = layers.Dense(40, activation='sigmoid')
        self.att_dense3 = layers.Dense(1)

        self.bn2 = layers.BatchNormalization()
        self.concat2 = layers.Concatenate(axis=-1)

        # Second 80 -> 40 -> 1 stack, each hidden layer paired with a PReLU
        # (a Dice activation was the commented-out alternative).
        self.dense1 = layers.Dense(80, activation='sigmoid')
        self.activation1 = layers.PReLU()
        # self.activation1 = Dice()
        self.dense2 = layers.Dense(40, activation='sigmoid')
        self.activation2 = layers.PReLU()
        # self.activation2 = Dice()
        self.dense3 = layers.Dense(1, activation=None)

2、根据模型图,首先对 User 和 Ad 两部分进行构建。在该数据集中,Ad 部分需要联合 Goods ID 和 Cate ID。【因为该数据集中不存在 User 的 age 等属性信息,所以并不需要对 User 自身属性进行构建】