forked from ARQMath/ARQMathCode
-
Notifications
You must be signed in to change notification settings - Fork 0
/
post_reader_record.py
146 lines (123 loc) · 5.84 KB
/
post_reader_record.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import argparse
import os
from typing import List

from .Entities.Post import Question, Answer
from .Entity_Parser_Record.comment_parser_record import CommentParserRecord
from .Entity_Parser_Record.post_link_parser_record import PostLinkParserRecord
from .Entity_Parser_Record.post_parser_record import PostParserRecord
from .Entity_Parser_Record.user_parser_record import UserParserRecord
from .Entity_Parser_Record.vote_parser_record import VoteParserRecord
from .Visualization.generate_html_file import HtmlGenerator
class DataReaderRecord:
    """
    Data reader for the MSE ARQMath dataset that parses each data file record by record.

    The constructor reads every data file under the dataset root and passes the comment,
    post-link, vote and user maps to the post parser so the related records are linked.
    The query methods are examples of how to work with the loaded data. To generate the
    html file for given threads (questions), call get_html_pages with the list of question ids.
    """

    def __init__(self, root_file_path):
        """
        Read all the data files of the MSE ARQMath dataset and link the related data together.

        :param root_file_path: The root directory of the MSE ARQMath dataset (str or path-like).
        """
        # os.path.join tolerates a trailing separator and path-like roots, and does not turn an
        # empty root into a path under the filesystem root the way `root + "/name"` did.
        # NOTE(review): the posts file name uses underscores while the others use dots - confirm
        # against the dataset release. Unversioned dumps would presumably use Posts.xml,
        # Badges.xml, Comments.xml, Votes.xml, Users.xml and PostLinks.xml instead.
        post_file_path = os.path.join(root_file_path, "Posts_V1_0.xml")
        badges_file_path = os.path.join(root_file_path, "Badges.V1.0.xml")
        comments_file_path = os.path.join(root_file_path, "Comments.V1.0.xml")
        votes_file_path = os.path.join(root_file_path, "Votes.V1.0.xml")
        users_file_path = os.path.join(root_file_path, "Users.V1.0.xml")
        post_links_file_path = os.path.join(root_file_path, "PostLinks.V1.0.xml")

        print("reading users")
        self.user_parser = UserParserRecord(users_file_path, badges_file_path)
        print("reading comments")
        self.comment_parser = CommentParserRecord(comments_file_path)
        print("reading votes")
        self.vote_parser = VoteParserRecord(votes_file_path)
        print("reading post links")
        self.post_link_parser = PostLinkParserRecord(post_links_file_path)
        print("reading posts")
        # Posts are read last because the post parser receives the maps built by the other parsers.
        self.post_parser = PostParserRecord(post_file_path, self.comment_parser.map_of_comments_for_post,
                                            self.post_link_parser.map_related_posts,
                                            self.post_link_parser.map_duplicate_posts,
                                            self.vote_parser.map_of_votes, self.user_parser.map_of_user)

    def get_list_of_questions_posted_in_a_year(self, year):
        """
        Return the questions whose creation date falls in the given year.

        :param year: The year to match, as an int or a numeric string such as "2015".
        :return: List of matching questions; questions with no creation date are skipped.
        """
        if isinstance(year, str):
            # A year read from the command line or a file arrives as text; compare it numerically.
            try:
                year = int(year)
            except ValueError:
                return []

        lst_of_question = []
        for question in self.post_parser.map_questions.values():
            creation_date = question.creation_date
            if not creation_date:
                continue
            # The year is taken as the first "-"-separated field of the part before "T".
            creation_year = int(creation_date.split("T")[0].split("-")[0])
            if creation_year == year:
                lst_of_question.append(question)
        return lst_of_question

    def get_answers_for_question(self, question_id) -> List[Answer]:
        """
        Return the answers attached to a question.

        :param question_id: Id of the question, as used for the keys of the question map.
        :return: The question's answers attribute, or an empty list if the id is unknown.
        """
        question = self.post_parser.map_questions.get(question_id)
        if question is None:
            return []
        return question.answers

    def get_user(self, user_id):
        """
        Look up a user by id.

        :param user_id: Id of the user, as used for the keys of the user map.
        :return: The user record, or None if the id is unknown.
        """
        return self.user_parser.map_of_user.get(user_id)

    def get_answers_posted_by_user(self, user_id) -> List[Answer]:
        """
        Return every answer whose owner is the given user.

        :param user_id: Id of the user whose answers are wanted.
        :return: List of matching answers; answers with no owner id never match.
        """
        lst_of_answers = []
        for lst_answer in self.post_parser.map_answers.values():
            for answer in lst_answer:
                owner_id = answer.owner_user_id
                if owner_id is not None and owner_id == user_id:
                    lst_of_answers.append(answer)
        return lst_of_answers

    def get_question_of_tag(self, tag) -> List[Question]:
        """
        Return the questions that carry the given tag.

        :param tag: The tag to look for, e.g. "calculus".
        :return: List of questions whose tags contain the tag; questions without tags are skipped.
        """
        lst_of_questions = []
        for question in self.post_parser.map_questions.values():
            lst_tags = question.tags
            # A question without tags (None) cannot match and must not raise a TypeError.
            if lst_tags is not None and tag in lst_tags:
                lst_of_questions.append(question)
        return lst_of_questions

    def get_html_pages(self, lst_of_questions_id, result_directory):
        """
        Generate the html files for the given questions.

        :param lst_of_questions_id: List of question ids for which html should be generated.
        :param result_directory: Directory passed to the html generator for its output.
        :return: None
        """
        HtmlGenerator.questions_to_html(lst_of_questions_id, self, result_directory)
def main():
    """
    Command-line example: load the dataset located at -ds and run a few sample queries.

    Reads the dataset with DataReaderRecord, looks up the questions tagged "calculus" and the
    answers posted by user 132, and generates the html files for questions 1 and 5 in
    "../html_files".
    """
    parser = argparse.ArgumentParser(
        description='By setting the file path for the MSE ARQMath Dataset, '
                    'one can read the related data and iterate through the questions.')
    parser.add_argument('-ds', type=str, help="File path for the MSE ARQMath Dataset.", required=True)
    args = parser.parse_args()

    dr = DataReaderRecord(args.ds)
    # Usage examples: the results are not used further here, they only show how to query the reader.
    lst_questions = dr.get_question_of_tag("calculus")
    lst_answers = dr.get_answers_posted_by_user(132)
    dr.get_html_pages([1, 5], "../html_files")


if __name__ == "__main__":
    main()