Skip to content

Commit ed928e9

Browse files
committed
update time
1 parent f0b1ee2 commit ed928e9

File tree

8 files changed

+40
-40
lines changed

8 files changed

+40
-40
lines changed

tools/live_bench/create_dataset.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,6 @@
77
dataset = LiveBench()
88
dataset.capture(websites=website, driver_kwargs={"headless": True}, screen_shoter="single_screen", shoter_kwargs={"screen_size": (1024, 1024)}, qa_generator="gpt4v", scorer="claude", checker="gemini")
99

10-
website = load_websites_from_file("/data/pufanyi/project/lmms-eval/temp/images")
11-
dataset.capture(websites=website, screen_shoter="human", qa_generator="gpt4v", scorer="claude", checker="gemini", driver_kwargs={}, shoter_kwargs={}, generator_kwargs={})
12-
dataset.upload()
10+
# website = load_websites_from_file("/data/pufanyi/project/lmms-eval/temp/images")
11+
# dataset.capture(websites=website, screen_shoter="human", qa_generator="gpt4v", scorer="claude", checker="gemini", driver_kwargs={}, shoter_kwargs={}, generator_kwargs={})
12+
# dataset.upload()

tools/live_bench/data_summary.ipynb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
}
2626
],
2727
"source": [
28-
"data = load_dataset(\"lmms-lab/LiveBench\", \"2024-07\")"
28+
"data = load_dataset(\"lmms-lab/LiveBench\", \"2024-08\")"
2929
]
3030
},
3131
{
@@ -298,7 +298,7 @@
298298
}
299299
],
300300
"source": [
301-
"data.push_to_hub(\"lmms-lab/LiveBench\", \"2024-07\", split=\"test\")"
301+
"data.push_to_hub(\"lmms-lab/LiveBench\", \"2024-08\", split=\"test\")"
302302
]
303303
},
304304
{

tools/live_bench/example.ipynb

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -293,7 +293,7 @@
293293
"text": [
294294
"Map: 100%|██████████| 9/9 [00:00<00:00, 243.32 examples/s]?, ?it/s]\n",
295295
"Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 194.00ba/s]\n",
296-
"Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00, 2.07s/it]\n"
296+
"Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00, 2.08s/it]\n"
297297
]
298298
}
299299
],
@@ -354,7 +354,7 @@
354354
"source": [
355355
"import datasets\n",
356356
"\n",
357-
"data = datasets.load_dataset(\"lmms-lab/LiveBench\", \"2024-07\")"
357+
"data = datasets.load_dataset(\"lmms-lab/LiveBench\", \"2024-08\")"
358358
]
359359
},
360360
{
@@ -446,7 +446,7 @@
446446
}
447447
],
448448
"source": [
449-
"data.push_to_hub(\"lmms-lab/LiveBench\", \"2024-07\")"
449+
"data.push_to_hub(\"lmms-lab/LiveBench\", \"2024-08\")"
450450
]
451451
},
452452
{

tools/live_bench/filter.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
"metadata": {},
2525
"outputs": [],
2626
"source": [
27-
"data = load_dataset(\"lmms-lab/LiveBench\", \"2024-07\")"
27+
"data = load_dataset(\"lmms-lab/LiveBench\", \"2024-08\")"
2828
]
2929
},
3030
{

tools/live_bench/live_bench/view.ipynb

Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
"source": [
2121
"from datasets import load_dataset\n",
2222
"\n",
23-
"dataset = load_dataset(\"lmms-lab/LiveBench\", \"2024-07\")"
23+
"dataset = load_dataset(\"lmms-lab/LiveBench\", \"2024-08\")"
2424
]
2525
},
2626
{
@@ -103,7 +103,7 @@
103103
" <td>Basic Understanding</td>\n",
104104
" <td>claude</td>\n",
105105
" <td>gemini</td>\n",
106-
" <td>2024-07-20 14:02:22</td>\n",
106+
" <td>2024-08-20 14:02:22</td>\n",
107107
" <td>single_screen</td>\n",
108108
" <td>(1024, 1024)</td>\n",
109109
" <td>10</td>\n",
@@ -121,7 +121,7 @@
121121
" <td>Deeper Implications</td>\n",
122122
" <td>claude</td>\n",
123123
" <td>gemini</td>\n",
124-
" <td>2024-07-20 14:02:22</td>\n",
124+
" <td>2024-08-20 14:02:22</td>\n",
125125
" <td>single_screen</td>\n",
126126
" <td>(1024, 1024)</td>\n",
127127
" <td>10</td>\n",
@@ -139,7 +139,7 @@
139139
" <td>Contextual Analysis</td>\n",
140140
" <td>claude</td>\n",
141141
" <td>gemini</td>\n",
142-
" <td>2024-07-20 14:02:22</td>\n",
142+
" <td>2024-08-20 14:02:22</td>\n",
143143
" <td>single_screen</td>\n",
144144
" <td>(1024, 1024)</td>\n",
145145
" <td>10</td>\n",
@@ -157,7 +157,7 @@
157157
" <td>Deeper Implications</td>\n",
158158
" <td>claude</td>\n",
159159
" <td>None</td>\n",
160-
" <td>2024-07-20 14:02:22</td>\n",
160+
" <td>2024-08-20 14:02:22</td>\n",
161161
" <td>single_screen</td>\n",
162162
" <td>(1024, 1024)</td>\n",
163163
" <td>10</td>\n",
@@ -175,7 +175,7 @@
175175
" <td>Contextual Analysis</td>\n",
176176
" <td>claude</td>\n",
177177
" <td>gemini</td>\n",
178-
" <td>2024-07-20 14:02:22</td>\n",
178+
" <td>2024-08-20 14:02:22</td>\n",
179179
" <td>single_screen</td>\n",
180180
" <td>(1024, 1024)</td>\n",
181181
" <td>8</td>\n",
@@ -211,7 +211,7 @@
211211
" <td>Contextual Analysis</td>\n",
212212
" <td>gpt4v</td>\n",
213213
" <td>gemini</td>\n",
214-
" <td>2024-07-21 20:23:39</td>\n",
214+
" <td>2024-08-21 20:23:39</td>\n",
215215
" <td>human</td>\n",
216216
" <td>None</td>\n",
217217
" <td>6</td>\n",
@@ -229,7 +229,7 @@
229229
" <td>Contextual Analysis</td>\n",
230230
" <td>gpt4v</td>\n",
231231
" <td>gemini</td>\n",
232-
" <td>2024-07-21 20:27:57</td>\n",
232+
" <td>2024-08-21 20:27:57</td>\n",
233233
" <td>human</td>\n",
234234
" <td>None</td>\n",
235235
" <td>6</td>\n",
@@ -247,7 +247,7 @@
247247
" <td>Contextual Analysis</td>\n",
248248
" <td>gpt4v</td>\n",
249249
" <td>gemini</td>\n",
250-
" <td>2024-07-21 20:27:57</td>\n",
250+
" <td>2024-08-21 20:27:57</td>\n",
251251
" <td>human</td>\n",
252252
" <td>None</td>\n",
253253
" <td>7</td>\n",
@@ -265,7 +265,7 @@
265265
" <td>Contextual Analysis</td>\n",
266266
" <td>gpt4v</td>\n",
267267
" <td>gemini</td>\n",
268-
" <td>2024-07-21 20:27:57</td>\n",
268+
" <td>2024-08-21 20:27:57</td>\n",
269269
" <td>human</td>\n",
270270
" <td>None</td>\n",
271271
" <td>6</td>\n",
@@ -283,7 +283,7 @@
283283
" <td>Basic Understanding</td>\n",
284284
" <td>gpt4v</td>\n",
285285
" <td>gemini</td>\n",
286-
" <td>2024-07-21 20:27:57</td>\n",
286+
" <td>2024-08-21 20:27:57</td>\n",
287287
" <td>human</td>\n",
288288
" <td>None</td>\n",
289289
" <td>5</td>\n",
@@ -362,17 +362,17 @@
362362
"319 Scoring Criteria (Total 10 points):\\n\\n- Ident... Basic Understanding \n",
363363
"\n",
364364
" data_generator checker date_time screen_shoter screen_size \\\n",
365-
"0 claude gemini 2024-07-20 14:02:22 single_screen (1024, 1024) \n",
366-
"1 claude gemini 2024-07-20 14:02:22 single_screen (1024, 1024) \n",
367-
"2 claude gemini 2024-07-20 14:02:22 single_screen (1024, 1024) \n",
368-
"3 claude None 2024-07-20 14:02:22 single_screen (1024, 1024) \n",
369-
"4 claude gemini 2024-07-20 14:02:22 single_screen (1024, 1024) \n",
365+
"0 claude gemini 2024-08-20 14:02:22 single_screen (1024, 1024) \n",
366+
"1 claude gemini 2024-08-20 14:02:22 single_screen (1024, 1024) \n",
367+
"2 claude gemini 2024-08-20 14:02:22 single_screen (1024, 1024) \n",
368+
"3 claude None 2024-08-20 14:02:22 single_screen (1024, 1024) \n",
369+
"4 claude gemini 2024-08-20 14:02:22 single_screen (1024, 1024) \n",
370370
".. ... ... ... ... ... \n",
371-
"315 gpt4v gemini 2024-07-21 20:23:39 human None \n",
372-
"316 gpt4v gemini 2024-07-21 20:27:57 human None \n",
373-
"317 gpt4v gemini 2024-07-21 20:27:57 human None \n",
374-
"318 gpt4v gemini 2024-07-21 20:27:57 human None \n",
375-
"319 gpt4v gemini 2024-07-21 20:27:57 human None \n",
371+
"315 gpt4v gemini 2024-08-21 20:23:39 human None \n",
372+
"316 gpt4v gemini 2024-08-21 20:27:57 human None \n",
373+
"317 gpt4v gemini 2024-08-21 20:27:57 human None \n",
374+
"318 gpt4v gemini 2024-08-21 20:27:57 human None \n",
375+
"319 gpt4v gemini 2024-08-21 20:27:57 human None \n",
376376
"\n",
377377
" score reason scorer_name \n",
378378
"0 10 The answer is correct and can be directly veri... claude \n",

tools/live_bench/refine_all_results.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55

66
if __name__ == "__main__":
7-
hf_data = load_dataset("lmms-lab/LiveBench", "2024-07", split="test")
7+
hf_data = load_dataset("lmms-lab/LiveBench", "2024-08", split="test")
88
finalizer = QuestionFinalizer()
99

1010
def load_results():
@@ -32,5 +32,5 @@ def load_results():
3232
final_data[item].append(value)
3333
# final_data = Dataset.from_generator(load_results)
3434
final_data = Dataset.from_dict(final_data, features=hf_data.features)
35-
final_data.save_to_disk("logs/2024-07-final")
36-
final_data.push_to_hub("lmms-lab/LiveBench", "2024-07")
35+
final_data.save_to_disk("logs/2024-08-final")
36+
final_data.push_to_hub("lmms-lab/LiveBench", "2024-08")

tools/live_bench/script/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,5 +11,5 @@ python upload_results.py -f <log_folder> -m <model_name> [-F]
1111
Example:
1212

1313
```sh
14-
python upload_results.py -f logs/0706_0959_model_outputs_gpt4v_model_args_c974bc -m gpt-4o -F
14+
python upload_results.py -f logs/0806_0959_model_outputs_gpt4v_model_args_c974bc -m gpt-4o -F
1515
```

tools/live_bench/script/modify.ipynb

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
"source": [
1818
"import datasets\n",
1919
"\n",
20-
"data = datasets.load_dataset(\"lmms-lab/LiveBenchResults\", \"2024-07\")"
20+
"data = datasets.load_dataset(\"lmms-lab/LiveBenchResults\", \"2024-08\")"
2121
]
2222
},
2323
{
@@ -35,7 +35,7 @@
3535
"metadata": {},
3636
"outputs": [],
3737
"source": [
38-
"df.to_csv(\"2024-07.csv\", index=False)"
38+
"df.to_csv(\"2024-08.csv\", index=False)"
3939
]
4040
},
4141
{
@@ -46,7 +46,7 @@
4646
"source": [
4747
"import pandas as pd\n",
4848
"\n",
49-
"df = pd.read_csv(\"2024-07.csv\")"
49+
"df = pd.read_csv(\"2024-08.csv\")"
5050
]
5151
},
5252
{
@@ -234,7 +234,7 @@
234234
}
235235
],
236236
"source": [
237-
"data.push_to_hub(\"lmms-lab/LiveBenchResults\", \"2024-07\", split=\"test\")"
237+
"data.push_to_hub(\"lmms-lab/LiveBenchResults\", \"2024-08\", split=\"test\")"
238238
]
239239
},
240240
{
@@ -243,7 +243,7 @@
243243
"metadata": {},
244244
"outputs": [],
245245
"source": [
246-
"data = datasets.load_dataset(\"lmms-lab/LiveBenchDetailedResults\", \"2024-07\")"
246+
"data = datasets.load_dataset(\"lmms-lab/LiveBenchDetailedResults\", \"2024-08\")"
247247
]
248248
},
249249
{
@@ -426,7 +426,7 @@
426426
}
427427
],
428428
"source": [
429-
"data.push_to_hub(\"lmms-lab/LiveBenchDetailedResults\", \"2024-07\")"
429+
"data.push_to_hub(\"lmms-lab/LiveBenchDetailedResults\", \"2024-08\")"
430430
]
431431
},
432432
{

0 commit comments

Comments
 (0)