Commit 6f6edb9

review rec
1 parent 0e721f8 commit 6f6edb9

16 files changed: 7893 additions & 0 deletions

Review_REC/D-Attn/config.py

Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
"""
@file   : config.py
@time   : 2024-07-16
"""
import argparse


def set_args():
    parser = argparse.ArgumentParser(description='Review-based recommendation')
    parser.add_argument('--output_dir', type=str, default='./output', help='directory where the trained model is saved')
    parser.add_argument('--train_data', type=str, default='./data/train.csv', help='training data')
    parser.add_argument('--dev_data', type=str, default='./data/valid.csv', help='validation set')
    parser.add_argument('--word2vec_file', type=str, default='./data/glove.6B.300d.txt', help='word-embedding file')

    parser.add_argument('--batch_size', type=int, default=64, help='batch size')
    parser.add_argument('--num_epochs', type=int, default=50, help='number of training epochs')

    parser.add_argument('--lowest_review_count', type=int, default=2, help='minimum number of reviews per user/item')
    parser.add_argument('--review_length', type=int, default=40, help='maximum words kept per review')
    parser.add_argument('--review_count', type=int, default=10, help='maximum reviews kept per user/item')

    parser.add_argument('--learning_rate', type=float, default=2e-5, help='learning rate')
    parser.add_argument('--l2_regularization', type=float, default=1e-6, help='weight decay strength')
    parser.add_argument('--learning_rate_decay', type=float, default=0.99, help='learning-rate decay factor')

    parser.add_argument('--kernel_count', type=int, default=100, help='number of convolution kernels')

    parser.add_argument('--kernel_size', type=int, default=3, help='convolution kernel size')
    parser.add_argument('--dropout_prob', type=float, default=0.5, help='dropout rate')
    parser.add_argument('--cnn_out_dim', type=int, default=50, help='CNN output dimension')

    parser.add_argument('--logging_steps', type=int, default=5, help='log the loss every this many steps')
    parser.add_argument('--seed', default=2024, type=int, help='random seed')

    parser.add_argument('--pointer_count', type=int, default=2)
    parser.add_argument('--fm_hidden', type=int, default=10, help='hidden dimension of the factorization machine')

    parser.add_argument('--filters_num', type=int, default=100, help='number of convolution filters')
    parser.add_argument('--id_emb_size', type=int, default=32, help='user/item feature dimension')
    args = parser.parse_args()
    return args
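
Because `set_args()` calls `parser.parse_args()` with no explicit argument list, it reads options from `sys.argv`. A minimal sketch of inspecting the parsed defaults; resetting `sys.argv` is only needed in an interactive session such as a notebook:

import sys

# In a notebook/REPL, argparse would otherwise choke on the kernel's own flags;
# in a normal `python script.py` run, calling set_args() directly is enough.
sys.argv = [sys.argv[0]]

from config import set_args

args = set_args()
print(args.batch_size, args.review_count, args.review_length)  # 64 10 40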

Review_REC/D-Attn/data_helper.py

Lines changed: 97 additions & 0 deletions
@@ -0,0 +1,97 @@
"""
@file   : data_helper.py
@time   : 2024-07-16
"""
import torch
import pandas as pd
from config import set_args
from torch.utils.data import Dataset


args = set_args()


def load_embedding(word2vec_file):
    with open(word2vec_file, encoding='utf-8') as f:
        word_emb = list()
        word_dict = dict()
        word_emb.append([0])
        word_dict['<UNK>'] = 0
        for line in f.readlines():
            tokens = line.split(' ')
            word_emb.append([float(i) for i in tokens[1:]])
            word_dict[tokens[0]] = len(word_dict)
        word_emb[0] = [0] * len(word_emb[1])  # <UNK> gets an all-zero vector of the same dimension
    return word_emb, word_dict


class Review_REDataset(Dataset):
    def __init__(self, data_path, word_dict, retain_rui=True):
        self.word_dict = word_dict
        self.PAD_WORD_idx = self.word_dict['<UNK>']
        self.retain_rui = retain_rui  # whether to keep the review shared by the user and item in the final sample
        self.lowest_r_count = args.lowest_review_count  # minimum number of reviews written by a single user/item
        self.review_length = args.review_length
        self.review_count = args.review_count

        df = pd.read_csv(data_path, header=None, names=['userID', 'itemID', 'review', 'rating'])
        df['review'] = df['review'].apply(self._review2id)  # tokenize -> word ids
        # print(df.head())
        '''
           userID  itemID                                             review  rating
        0    3748     934  [366, 1780, 6381, 79575, 10268, 0, 1590, 17427...       4
        1    4795    2280  [3538, 1575, 9038, 1138, 0, 8391, 12971, 2685,...       5
        '''
        self.sparse_idx = set()  # indices of sparse samples, stored here and removed at the end
        user_reviews = self._get_reviews(df)  # collect each user's review list
        # print(user_reviews.size())  # torch.Size([51764, 10, 40])
        item_reviews = self._get_reviews(df, 'itemID', 'userID')
        # print(item_reviews.size())  # torch.Size([51764, 10, 40])

        rating = torch.Tensor(df['rating'].to_list()).view(-1, 1)

        self.user_reviews = user_reviews[[idx for idx in range(user_reviews.shape[0]) if idx not in self.sparse_idx]]
        self.item_reviews = item_reviews[[idx for idx in range(item_reviews.shape[0]) if idx not in self.sparse_idx]]
        self.rating = rating[[idx for idx in range(rating.shape[0]) if idx not in self.sparse_idx]]

    def __getitem__(self, idx):
        return self.user_reviews[idx], self.item_reviews[idx], self.rating[idx]

    def __len__(self):
        return self.rating.shape[0]

    def _get_reviews(self, df, lead='userID', costar='itemID'):
        # For each training sample, aggregate all reviews of the lead (user or item)
        reviews_by_lead = dict(list(df[[costar, 'review']].groupby(df[lead])))  # all reviews grouped per user/item

        lead_reviews = []
        for idx, (lead_id, costar_id) in enumerate(zip(df[lead], df[costar])):  # e.g. (userID, itemID)
            df_data = reviews_by_lead[lead_id]  # all reviews of this lead, as a DataFrame
            if self.retain_rui:
                reviews = df_data['review'].to_list()  # all reviews of the lead, as a list
            else:
                reviews = df_data['review'][df_data[costar] != costar_id].to_list()  # excluding the review shared by lead and costar

            if len(reviews) < self.lowest_r_count:
                self.sparse_idx.add(idx)
            reviews = self._adjust_review_list(reviews, self.review_length, self.review_count)
            lead_reviews.append(reviews)
        return torch.LongTensor(lead_reviews)

    def _adjust_review_list(self, reviews, r_length, r_count):
        reviews = reviews[:r_count] + [[self.PAD_WORD_idx] * r_length] * (r_count - len(reviews))  # fix the number of reviews
        reviews = [r[:r_length] + [0] * (r_length - len(r)) for r in reviews]  # fix the length of each review
        return reviews

    def _review2id(self, review):
        # Tokenize a review string and map the tokens to word ids
        if not isinstance(review, str):
            return []
        wids = []
        for word in review.split():
            if word in self.word_dict:
                wids.append(self.word_dict[word])  # map word to its id
            else:
                wids.append(self.PAD_WORD_idx)  # unknown words map to <UNK>
        return wids
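
A minimal usage sketch, assuming `./data/glove.6B.300d.txt` and the `train.csv` produced by `run_data_process.py` exist at the default paths:

from torch.utils.data import DataLoader

from config import set_args
from data_helper import load_embedding, Review_REDataset

args = set_args()
word_emb, word_dict = load_embedding(args.word2vec_file)
train_set = Review_REDataset(args.train_data, word_dict)
train_loader = DataLoader(train_set, batch_size=args.batch_size, shuffle=True)

user_reviews, item_reviews, rating = next(iter(train_loader))
# user_reviews, item_reviews: LongTensor (batch_size, review_count, review_length)
# rating: FloatTensor (batch_size, 1)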

Review_REC/D-Attn/model.py

Lines changed: 129 additions & 0 deletions
@@ -0,0 +1,129 @@
"""
@file   : model.py
@time   : 2024-07-16
"""

import torch
import torch.nn as nn
from config import set_args
import torch.nn.functional as F

args = set_args()


class LocalAttention(nn.Module):
    def __init__(self, seq_len, win_size, emb_size, filters_num):
        super(LocalAttention, self).__init__()
        self.att_conv = nn.Sequential(
            nn.Conv2d(1, 1, kernel_size=(win_size, emb_size), padding=((win_size - 1) // 2, 0)),
            nn.Sigmoid()
        )
        self.cnn = nn.Conv2d(1, filters_num, kernel_size=(1, emb_size))

    def forward(self, x):
        # print(x.size())  # torch.Size([64, 10, 300])
        score = self.att_conv(x.unsqueeze(1)).squeeze(1)
        # print(score.size())  # torch.Size([64, 10, 1])
        out = x.mul(score)

        out = out.unsqueeze(1)  # torch.Size([64, 1, 10, 300])
        out = torch.tanh(self.cnn(out)).squeeze(3)
        # print(out.size())  # torch.Size([64, 100, 10])
        out = F.max_pool1d(out, out.size(2)).squeeze(2)
        # print(out.size())  # torch.Size([64, 100])
        return out


class GlobalAttention(nn.Module):
    def __init__(self, seq_len, emb_size, filters_size=[2, 3, 4], filters_num=100):
        super(GlobalAttention, self).__init__()
        self.att_conv = nn.Sequential(
            nn.Conv2d(1, 1, kernel_size=(seq_len, emb_size)),
            nn.Sigmoid()
        )
        self.convs = nn.ModuleList([nn.Conv2d(1, filters_num, (k, emb_size)) for k in filters_size])

    def forward(self, x):
        x = x.unsqueeze(1)
        score = self.att_conv(x)
        x = x.mul(score)
        conv_outs = [torch.tanh(cnn(x).squeeze(3)) for cnn in self.convs]
        conv_outs = [F.max_pool1d(out, out.size(2)).squeeze(2) for out in conv_outs]
        return conv_outs


class Net(nn.Module):
    def __init__(self, word_emb):
        super(Net, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(torch.Tensor(word_emb))
        emb_size = self.embedding.embedding_dim
        self.local_att = LocalAttention(args.review_count, win_size=5, emb_size=emb_size, filters_num=args.filters_num)
        self.global_att = GlobalAttention(args.review_count, emb_size=emb_size, filters_num=args.filters_num)

        fea_dim = args.filters_num * 4  # one local output + three global outputs, filters_num each
        self.fc = nn.Sequential(
            nn.Linear(fea_dim, fea_dim),
            nn.Dropout(0.5),
            nn.ReLU(),
            nn.Linear(fea_dim, args.id_emb_size),
        )
        self.dropout = nn.Dropout(0.5)
        self.reset_para()

    def forward(self, docs):
        docs = self.embedding(docs)  # (batch_size, review_count, review_length, 300)
        docs = docs.sum(dim=-2)  # sum word vectors per review -> (batch_size, review_count, word_dim)
        local_fea = self.local_att(docs)  # torch.Size([64, 100])

        global_fea = self.global_att(docs)
        r_fea = torch.cat([local_fea] + global_fea, 1)
        r_fea = self.dropout(r_fea)
        r_fea = self.fc(r_fea)
        return torch.stack([r_fea], 1)

    def reset_para(self):
        cnns = [self.local_att.cnn, self.local_att.att_conv[0]]
        for cnn in cnns:
            nn.init.xavier_uniform_(cnn.weight, gain=1)
            nn.init.uniform_(cnn.bias, -0.1, 0.1)
        for cnn in self.global_att.convs:
            nn.init.xavier_uniform_(cnn.weight, gain=1)
            nn.init.uniform_(cnn.bias, -0.1, 0.1)
        nn.init.uniform_(self.fc[0].weight, -0.1, 0.1)
        nn.init.uniform_(self.fc[-1].weight, -0.1, 0.1)


class FactorizationMachine(nn.Module):
    def __init__(self, in_dim, k):
        super(FactorizationMachine, self).__init__()
        # Small random init: with an all-zero v the pairwise term has an exactly
        # zero gradient at initialization, so v would never receive an update.
        self.v = nn.Parameter(torch.randn(2 * in_dim, k) * 0.01)
        self.linear = nn.Linear(2 * in_dim, 1)

    def forward(self, x):
        linear_part = self.linear(x)  # input shape (batch_size, 2 * in_dim), output shape (batch_size, 1)
        inter_part1 = torch.mm(x, self.v)
        inter_part2 = torch.mm(x ** 2, self.v ** 2)
        pair_interactions = torch.sum(inter_part1 ** 2 - inter_part2, dim=1)
        output = linear_part.t() + 0.5 * pair_interactions
        return output.view(-1, 1)  # output shape (batch_size, 1)


class D_ATTN(nn.Module):
    def __init__(self, word_emb):
        super(D_ATTN, self).__init__()
        self.user_net = Net(word_emb)
        self.item_net = Net(word_emb)
        self.fm = FactorizationMachine(in_dim=args.id_emb_size, k=args.fm_hidden)

    def forward(self, user_reviews, item_reviews):
        u_fea = self.user_net(user_reviews)
        i_fea = self.item_net(item_reviews)
        i_fea = i_fea.squeeze(1)
        u_fea = u_fea.squeeze(1)
        # print(u_fea.size())  # torch.Size([64, 32])
        # print(i_fea.size())  # torch.Size([64, 32])

        prediction = self.fm(torch.cat([u_fea, i_fea], dim=-1))
        return prediction
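
A minimal forward-pass sketch with a random toy embedding table in place of GloVe, just to check tensor shapes; the vocabulary size of 1000 is an arbitrary stand-in:

import torch

from config import set_args
from model import D_ATTN

args = set_args()

# Toy stand-in for the pretrained GloVe table: 1000 words, 300-d vectors.
word_emb = torch.randn(1000, 300).tolist()
model = D_ATTN(word_emb)

batch_size = 4
user_reviews = torch.randint(0, 1000, (batch_size, args.review_count, args.review_length))
item_reviews = torch.randint(0, 1000, (batch_size, args.review_count, args.review_length))

prediction = model(user_reviews, item_reviews)
print(prediction.shape)  # torch.Size([4, 1])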

Review_REC/D-Attn/run_data_process.py

Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,70 @@
"""
@file   : run_data_process.py
@time   : 2024-07-16
"""
import argparse
import os
import sys
import time
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk.tokenize import WordPunctTokenizer
import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)


def load_vocab(path):
    all_vocab = []
    with open(path, 'r', encoding='utf8') as f:
        for line in f.readlines():
            line = line.strip()
            all_vocab.append(line)
    return all_vocab


def process_dataset(json_path, select_cols, train_rate, csv_path):
    df = pd.read_json(json_path, lines=True)
    df = df[select_cols]
    df.columns = ['userID', 'itemID', 'review', 'rating']
    df['userID'] = df.groupby(df['userID']).ngroup()  # ngroup: assign consecutive group numbers
    df['itemID'] = df.groupby(df['itemID']).ngroup()
    stop_words = load_vocab('./data/stopwords.txt')
    punctuations = load_vocab('./data/punctuations.txt')

    df = df.drop(df[[not isinstance(x, str) or len(x) == 0 for x in df['review']]].index)  # erase null reviews

    def clean_review(review):
        review = review.lower()
        for p in punctuations:
            review = review.replace(p, ' ')  # replace punctuation with spaces
        review = WordPunctTokenizer().tokenize(review)  # split into words
        review = [word for word in review if word not in stop_words]  # remove stop words
        # review = [nltk.WordNetLemmatizer().lemmatize(word) for word in review]  # reduce words to their root form
        return ' '.join(review)

    df['review'] = df['review'].apply(clean_review)
    train, valid = train_test_split(df, test_size=1 - train_rate, random_state=3)  # random split into train and the rest
    valid, test = train_test_split(valid, test_size=0.5, random_state=4)  # split the rest evenly into valid and test
    print(f'Split and saved dataset as csv: train {len(train)}, valid {len(valid)}, test {len(test)}')
    # Split and saved dataset as csv: train 51764, valid 6470, test 6471
    print(f'Total: {len(df)} reviews, {len(df.groupby("userID"))} users, {len(df.groupby("itemID"))} items.')
    # Total: 64705 reviews, 5541 users, 3568 items.
    return train, valid, test


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', dest='data_path',
                        default='./data/reviews_Digital_Music_5.json',
                        help='Path to the raw review dataset in JSON-lines format.')
    parser.add_argument('--select_cols', dest='select_cols', nargs='+',
                        default=['reviewerID', 'asin', 'reviewText', 'overall'],
                        help='Selected columns of the above dataset.')
    # 'reviewerID' - reviewer ID, 'asin' - product ID, 'reviewText' - review text, 'overall' - overall rating
    parser.add_argument('--train_rate', dest='train_rate', type=float, default=0.8)  # type=float so CLI values are not parsed as strings
    parser.add_argument('--save_dir', dest='save_dir', default='./music')  # note: passed as csv_path but unused; outputs go to ./data/
    args = parser.parse_args()
    train, valid, test = process_dataset(args.data_path, args.select_cols, args.train_rate, args.save_dir)
    train.to_csv('./data/train.csv', index=False, header=False)
    valid.to_csv('./data/valid.csv', index=False, header=False)
    test.to_csv('./data/test.csv', index=False, header=False)
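
A quick sanity check on the files written above; `data_helper.py` reads them back with `header=None` and these column names, so the CSVs deliberately carry no header row:

import pandas as pd

df = pd.read_csv('./data/train.csv', header=None,
                 names=['userID', 'itemID', 'review', 'rating'])
print(df.shape)    # e.g. (51764, 4) for the Digital Music split printed above
print(df.head(2))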
