From d19cdd86260c41dfe73a1283dbce9dfa8893c8f3 Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 23 Feb 2024 14:32:40 +0800 Subject: [PATCH] add scripts --- .../.ipynb_checkpoints/zero3-checkpoint.json | 28 + scripts/convert_gqa_for_eval.py | 18 + scripts/convert_mmbench_for_submission.py | 27 + scripts/convert_mmvet_for_eval.py | 18 + scripts/convert_seed_for_submission.py | 74 +++ scripts/convert_sqa_to_llava.py | 88 +++ scripts/convert_sqa_to_llava_base_prompt.py | 334 +++++++++++ scripts/convert_vizwiz_for_submission.py | 47 ++ scripts/convert_vqav2_for_submission.py | 56 ++ scripts/extract_mm_projector.py | 47 ++ scripts/finetune.sh | 48 ++ scripts/finetune_full_schedule.sh | 48 ++ scripts/finetune_lora.sh | 49 ++ scripts/finetune_qlora.sh | 50 ++ scripts/finetune_sqa.sh | 36 ++ scripts/merge_lora_weights.py | 22 + scripts/pretrain.sh | 46 ++ scripts/pretrain_xformers.sh | 44 ++ scripts/sqa_eval_batch.sh | 13 + scripts/sqa_eval_gather.sh | 18 + .../.ipynb_checkpoints/finetune-checkpoint.sh | 55 ++ .../finetune_lora-checkpoint.sh | 53 ++ .../.ipynb_checkpoints/pretrain-checkpoint.sh | 54 ++ .../pretrain_baselines-checkpoint.sh | 9 + .../train_baselines-checkpoint.sh | 15 + .../train_baselines_phi_jia-checkpoint.sh | 18 + scripts/tiny_llava/LOGS/LOG.md | 561 ++++++++++++++++++ scripts/tiny_llava/LOGS/REPORT.md | 12 + scripts/tiny_llava/LOGS/organized_log.md | 215 +++++++ scripts/tiny_llava/docs/LOG.md | 0 .../eval/.ipynb_checkpoints/gqa-checkpoint.sh | 50 ++ .../.ipynb_checkpoints/gqa_v1-checkpoint.sh | 46 ++ .../mmbench_cn-checkpoint.sh | 24 + .../.ipynb_checkpoints/pope-checkpoint.sh | 27 + .../.ipynb_checkpoints/pope_v1-checkpoint.sh | 25 + .../eval/.ipynb_checkpoints/sqa-checkpoint.sh | 29 + .../.ipynb_checkpoints/sqa_v1-checkpoint.sh | 28 + .../.ipynb_checkpoints/textvqa-checkpoint.sh | 25 + .../textvqa_v1-checkpoint.sh | 24 + .../.ipynb_checkpoints/vizwiz-checkpoint.sh | 29 + .../vizwiz_v1-checkpoint.sh | 28 + .../.ipynb_checkpoints/vqav2-checkpoint.sh | 56 ++ scripts/tiny_llava/eval/gqa.sh | 53 ++ scripts/tiny_llava/eval/gqa_v1.sh | 46 ++ scripts/tiny_llava/eval/imagenet.sh | 25 + scripts/tiny_llava/eval/mmbench.sh | 23 + scripts/tiny_llava/eval/mmbench_cn.sh | 24 + scripts/tiny_llava/eval/mme.sh | 27 + scripts/tiny_llava/eval/mmvet.sh | 20 + scripts/tiny_llava/eval/pope.sh | 18 + scripts/tiny_llava/eval/pope_v1.sh | 25 + scripts/tiny_llava/eval/sqa.sh | 31 + scripts/tiny_llava/eval/sqa_v1.sh | 28 + scripts/tiny_llava/eval/textvqa.sh | 30 + scripts/tiny_llava/eval/textvqa_v1.sh | 24 + scripts/tiny_llava/eval/vizwiz.sh | 24 + scripts/tiny_llava/eval/vizwiz_v1.sh | 28 + scripts/tiny_llava/eval/vqav2.sh | 55 ++ scripts/tiny_llava/finetune.sh | 56 ++ scripts/tiny_llava/finetune_lora.sh | 55 ++ scripts/tiny_llava/finetune_lora_llm_open.sh | 0 scripts/tiny_llava/finetune_lora_type3.sh | 54 ++ scripts/tiny_llava/finetune_resamplers.sh | 0 scripts/tiny_llava/finetune_type4.sh | 55 ++ scripts/tiny_llava/pretrain.sh | 55 ++ scripts/tiny_llava/pretrain_baselines.sh | 9 + scripts/tiny_llava/pretrain_llm_open.sh | 0 scripts/tiny_llava/pretrain_type3.sh | 54 ++ scripts/tiny_llava/pretrain_type4.sh | 54 ++ scripts/tiny_llava/train_baselines.sh | 31 + scripts/tiny_llava/train_baselines_phi_jia.sh | 18 + scripts/zero2.json | 23 + scripts/zero3.json | 28 + scripts/zero3_offload.json | 56 ++ 74 files changed, 3523 insertions(+) create mode 100644 scripts/.ipynb_checkpoints/zero3-checkpoint.json create mode 100644 scripts/convert_gqa_for_eval.py create mode 100644 
scripts/convert_mmbench_for_submission.py create mode 100644 scripts/convert_mmvet_for_eval.py create mode 100644 scripts/convert_seed_for_submission.py create mode 100644 scripts/convert_sqa_to_llava.py create mode 100644 scripts/convert_sqa_to_llava_base_prompt.py create mode 100644 scripts/convert_vizwiz_for_submission.py create mode 100644 scripts/convert_vqav2_for_submission.py create mode 100644 scripts/extract_mm_projector.py create mode 100644 scripts/finetune.sh create mode 100644 scripts/finetune_full_schedule.sh create mode 100644 scripts/finetune_lora.sh create mode 100644 scripts/finetune_qlora.sh create mode 100644 scripts/finetune_sqa.sh create mode 100644 scripts/merge_lora_weights.py create mode 100644 scripts/pretrain.sh create mode 100644 scripts/pretrain_xformers.sh create mode 100644 scripts/sqa_eval_batch.sh create mode 100644 scripts/sqa_eval_gather.sh create mode 100644 scripts/tiny_llava/.ipynb_checkpoints/finetune-checkpoint.sh create mode 100644 scripts/tiny_llava/.ipynb_checkpoints/finetune_lora-checkpoint.sh create mode 100755 scripts/tiny_llava/.ipynb_checkpoints/pretrain-checkpoint.sh create mode 100644 scripts/tiny_llava/.ipynb_checkpoints/pretrain_baselines-checkpoint.sh create mode 100644 scripts/tiny_llava/.ipynb_checkpoints/train_baselines-checkpoint.sh create mode 100644 scripts/tiny_llava/.ipynb_checkpoints/train_baselines_phi_jia-checkpoint.sh create mode 100644 scripts/tiny_llava/LOGS/LOG.md create mode 100644 scripts/tiny_llava/LOGS/REPORT.md create mode 100644 scripts/tiny_llava/LOGS/organized_log.md create mode 100644 scripts/tiny_llava/docs/LOG.md create mode 100644 scripts/tiny_llava/eval/.ipynb_checkpoints/gqa-checkpoint.sh create mode 100644 scripts/tiny_llava/eval/.ipynb_checkpoints/gqa_v1-checkpoint.sh create mode 100644 scripts/tiny_llava/eval/.ipynb_checkpoints/mmbench_cn-checkpoint.sh create mode 100644 scripts/tiny_llava/eval/.ipynb_checkpoints/pope-checkpoint.sh create mode 100644 scripts/tiny_llava/eval/.ipynb_checkpoints/pope_v1-checkpoint.sh create mode 100644 scripts/tiny_llava/eval/.ipynb_checkpoints/sqa-checkpoint.sh create mode 100644 scripts/tiny_llava/eval/.ipynb_checkpoints/sqa_v1-checkpoint.sh create mode 100644 scripts/tiny_llava/eval/.ipynb_checkpoints/textvqa-checkpoint.sh create mode 100644 scripts/tiny_llava/eval/.ipynb_checkpoints/textvqa_v1-checkpoint.sh create mode 100644 scripts/tiny_llava/eval/.ipynb_checkpoints/vizwiz-checkpoint.sh create mode 100644 scripts/tiny_llava/eval/.ipynb_checkpoints/vizwiz_v1-checkpoint.sh create mode 100644 scripts/tiny_llava/eval/.ipynb_checkpoints/vqav2-checkpoint.sh create mode 100644 scripts/tiny_llava/eval/gqa.sh create mode 100644 scripts/tiny_llava/eval/gqa_v1.sh create mode 100644 scripts/tiny_llava/eval/imagenet.sh create mode 100644 scripts/tiny_llava/eval/mmbench.sh create mode 100644 scripts/tiny_llava/eval/mmbench_cn.sh create mode 100644 scripts/tiny_llava/eval/mme.sh create mode 100644 scripts/tiny_llava/eval/mmvet.sh create mode 100644 scripts/tiny_llava/eval/pope.sh create mode 100644 scripts/tiny_llava/eval/pope_v1.sh create mode 100644 scripts/tiny_llava/eval/sqa.sh create mode 100644 scripts/tiny_llava/eval/sqa_v1.sh create mode 100644 scripts/tiny_llava/eval/textvqa.sh create mode 100644 scripts/tiny_llava/eval/textvqa_v1.sh create mode 100644 scripts/tiny_llava/eval/vizwiz.sh create mode 100644 scripts/tiny_llava/eval/vizwiz_v1.sh create mode 100644 scripts/tiny_llava/eval/vqav2.sh create mode 100644 scripts/tiny_llava/finetune.sh create mode 100644 
scripts/tiny_llava/finetune_lora.sh create mode 100644 scripts/tiny_llava/finetune_lora_llm_open.sh create mode 100644 scripts/tiny_llava/finetune_lora_type3.sh create mode 100644 scripts/tiny_llava/finetune_resamplers.sh create mode 100644 scripts/tiny_llava/finetune_type4.sh create mode 100755 scripts/tiny_llava/pretrain.sh create mode 100644 scripts/tiny_llava/pretrain_baselines.sh create mode 100644 scripts/tiny_llava/pretrain_llm_open.sh create mode 100644 scripts/tiny_llava/pretrain_type3.sh create mode 100644 scripts/tiny_llava/pretrain_type4.sh create mode 100644 scripts/tiny_llava/train_baselines.sh create mode 100644 scripts/tiny_llava/train_baselines_phi_jia.sh create mode 100644 scripts/zero2.json create mode 100644 scripts/zero3.json create mode 100644 scripts/zero3_offload.json diff --git a/scripts/.ipynb_checkpoints/zero3-checkpoint.json b/scripts/.ipynb_checkpoints/zero3-checkpoint.json new file mode 100644 index 0000000..8ff461f --- /dev/null +++ b/scripts/.ipynb_checkpoints/zero3-checkpoint.json @@ -0,0 +1,28 @@ +{ + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "train_micro_batch_size_per_gpu": "auto", + "train_batch_size": "auto", + "gradient_accumulation_steps": "auto", + "zero_optimization": { + "stage": 3, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1e9, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_gather_16bit_weights_on_model_save": true + } +} \ No newline at end of file diff --git a/scripts/convert_gqa_for_eval.py b/scripts/convert_gqa_for_eval.py new file mode 100644 index 0000000..18f2a8e --- /dev/null +++ b/scripts/convert_gqa_for_eval.py @@ -0,0 +1,18 @@ +import os +import json +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument("--src", type=str) +parser.add_argument("--dst", type=str) +args = parser.parse_args() + +all_answers = [] +for line_idx, line in enumerate(open(args.src)): + res = json.loads(line) + question_id = res['question_id'] + text = res['text'].rstrip('.').lower() + all_answers.append({"questionId": question_id, "prediction": text}) + +with open(args.dst, 'w') as f: + json.dump(all_answers, f) diff --git a/scripts/convert_mmbench_for_submission.py b/scripts/convert_mmbench_for_submission.py new file mode 100644 index 0000000..fd4b673 --- /dev/null +++ b/scripts/convert_mmbench_for_submission.py @@ -0,0 +1,27 @@ +import os +import json +import argparse +import pandas as pd + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--annotation-file", type=str, required=True) + parser.add_argument("--result-dir", type=str, required=True) + parser.add_argument("--upload-dir", type=str, required=True) + parser.add_argument("--experiment", type=str, required=True) + + return parser.parse_args() + +if __name__ == "__main__": + args = get_args() + + df = pd.read_table(args.annotation_file) + + cur_df = df.copy() + cur_df = cur_df.drop(columns=['hint', 'category', 'source', 'image', 'comment', 'l2-category']) + cur_df.insert(6, 'prediction', None) + for pred in open(os.path.join(args.result_dir, f"{args.experiment}.jsonl")): + pred = json.loads(pred) + cur_df.loc[df['index'] == pred['question_id'], 'prediction'] = pred['text'] + + 
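+    # Write the trimmed table, now carrying the filled-in 'prediction' column,
+    # to an .xlsx named after the experiment for upload to the MMBench server.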
cur_df.to_excel(os.path.join(args.upload_dir, f"{args.experiment}.xlsx"), index=False, engine='openpyxl') diff --git a/scripts/convert_mmvet_for_eval.py b/scripts/convert_mmvet_for_eval.py new file mode 100644 index 0000000..9afaa39 --- /dev/null +++ b/scripts/convert_mmvet_for_eval.py @@ -0,0 +1,18 @@ +import os +import json +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument("--src", type=str) +parser.add_argument("--dst", type=str) +args = parser.parse_args() + +cur_result = {} + +for line in open(args.src): + data = json.loads(line) + qid = data['question_id'] + cur_result[f'v1_{qid}'] = data['text'] + +with open(args.dst, 'w') as f: + json.dump(cur_result, f, indent=2) diff --git a/scripts/convert_seed_for_submission.py b/scripts/convert_seed_for_submission.py new file mode 100644 index 0000000..1a87f43 --- /dev/null +++ b/scripts/convert_seed_for_submission.py @@ -0,0 +1,74 @@ +import os +import json +import argparse + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--annotation-file", type=str) + parser.add_argument("--result-file", type=str) + parser.add_argument("--result-upload-file", type=str) + return parser.parse_args() + + +def eval_single(result_file, eval_only_type=None): + results = {} + for line in open(result_file): + row = json.loads(line) + results[row['question_id']] = row + + type_counts = {} + correct_counts = {} + for question_data in data['questions']: + if eval_only_type is not None and question_data['data_type'] != eval_only_type: continue + data_type = question_data['question_type_id'] + type_counts[data_type] = type_counts.get(data_type, 0) + 1 + try: + question_id = int(question_data['question_id']) + except: + question_id = question_data['question_id'] + if question_id not in results: + correct_counts[data_type] = correct_counts.get(data_type, 0) + continue + row = results[question_id] + if row['text'] == question_data['answer']: + correct_counts[data_type] = correct_counts.get(data_type, 0) + 1 + + total_count = 0 + total_correct = 0 + for data_type in sorted(type_counts.keys()): + accuracy = correct_counts[data_type] / type_counts[data_type] * 100 + if eval_only_type is None: + print(f"{ques_type_id_to_name[data_type]}: {accuracy:.2f}%") + + total_count += type_counts[data_type] + total_correct += correct_counts[data_type] + + total_accuracy = total_correct / total_count * 100 + if eval_only_type is None: + print(f"Total accuracy: {total_accuracy:.2f}%") + else: + print(f"{eval_only_type} accuracy: {total_accuracy:.2f}%") + + return results + +if __name__ == "__main__": + args = get_args() + data = json.load(open(args.annotation_file)) + ques_type_id_to_name = {id:n for n,id in data['question_type'].items()} + + results = eval_single(args.result_file) + eval_single(args.result_file, eval_only_type='image') + eval_single(args.result_file, eval_only_type='video') + + with open(args.result_upload_file, 'w') as fp: + for question in data['questions']: + qid = question['question_id'] + if qid in results: + result = results[qid] + else: + result = results[int(qid)] + fp.write(json.dumps({ + 'question_id': qid, + 'prediction': result['text'] + }) + '\n') diff --git a/scripts/convert_sqa_to_llava.py b/scripts/convert_sqa_to_llava.py new file mode 100644 index 0000000..4c9a756 --- /dev/null +++ b/scripts/convert_sqa_to_llava.py @@ -0,0 +1,88 @@ +import json +import os +import fire +import re +from convert_sqa_to_llava_base_prompt import build_prompt_chatbot + + +def convert_to_llava(base_dir, split, 
prompt_format="QCM-LEA"): + split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[split] + problems = json.load(open(os.path.join(base_dir, "problems.json"))) + + split_problems = build_prompt_chatbot( + problems, split_indices, prompt_format, + use_caption=False, is_test=False) + + target_format = [] + for prob_id, (input, output) in split_problems.items(): + if input.startswith('Question: '): + input = input.replace('Question: ', '') + if output.startswith('Answer: '): + output = output.replace('Answer: ', '') + + raw_prob_data = problems[prob_id] + if raw_prob_data['image'] is None: + target_format.append({ + "id": prob_id, + "conversations": [ + {'from': 'human', 'value': f"{input}"}, + {'from': 'gpt', 'value': f"{output}"}, + ], + }) + + else: + target_format.append({ + "id": prob_id, + "image": os.path.join(prob_id, raw_prob_data['image']), + "conversations": [ + {'from': 'human', 'value': f"{input}\n"}, + {'from': 'gpt', 'value': f"{output}"}, + ], + }) + + print(f'Number of samples: {len(target_format)}') + + with open(os.path.join(base_dir, f"llava_{split}_{prompt_format}.json"), "w") as f: + json.dump(target_format, f, indent=2) + + +def convert_to_jsonl(base_dir, split, prompt_format="QCM-LEPA"): + split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[split] + problems = json.load(open(os.path.join(base_dir, "problems.json"))) + + split_problems = build_prompt_chatbot( + problems, split_indices, prompt_format, + use_caption=False, is_test=False) + + writer = open(os.path.join(base_dir, f"scienceqa_{split}_{prompt_format}.jsonl"), "w") + for prob_id, (input, output) in split_problems.items(): + if input.startswith('Question: '): + input = input.replace('Question: ', '') + if output.startswith('Answer: '): + output = output.replace('Answer: ', '') + + raw_prob_data = problems[prob_id] + if raw_prob_data['image'] is None: + data = { + "id": prob_id, + "instruction": f"{input}", + "output": f"{output}", + } + + else: + data = { + "id": prob_id, + "image": os.path.join(prob_id, raw_prob_data['image']), + "instruction": f"{input}\n", + "output": f"{output}", + } + writer.write(json.dumps(data) + '\n') + writer.close() + + +def main(task, **kwargs): + globals()[task](**kwargs) + + +if __name__ == "__main__": + fire.Fire(main) diff --git a/scripts/convert_sqa_to_llava_base_prompt.py b/scripts/convert_sqa_to_llava_base_prompt.py new file mode 100644 index 0000000..cd7d0ee --- /dev/null +++ b/scripts/convert_sqa_to_llava_base_prompt.py @@ -0,0 +1,334 @@ +def get_question_text(problem): + question = problem['question'] + return question + + +def get_context_text(problem, use_caption): + txt_context = problem['hint'] + img_context = problem['caption'] if use_caption else "" + context = " ".join([txt_context, img_context]).strip() + if context == "": + context = "N/A" + return context + + +def get_choice_text(probelm, options): + choices = probelm['choices'] + choice_list = [] + for i, c in enumerate(choices): + choice_list.append("({}) {}".format(options[i], c)) + choice_txt = " ".join(choice_list) + #print(choice_txt) + return choice_txt + + +def get_answer(problem, options): + return options[problem['answer']] + + +def get_lecture_text(problem): + # \\n: GPT-3 can generate the lecture with more tokens. 
+ lecture = problem['lecture'].replace("\n", "\\n") + return lecture + + +def get_solution_text(problem): + # \\n: GPT-3 can generate the solution with more tokens + solution = problem['solution'].replace("\n", "\\n") + return solution + + +def create_one_example_chatbot(format, question, context, choice, answer, lecture, solution, test_example=True): + + input_format, output_format = format.split("-") + + ## Inputs + if input_format == "CQM": + input = f"Context: {context}\nQuestion: {question}\nOptions: {choice}\n" + elif input_format == "QCM": + input = f"Question: {question}\nContext: {context}\nOptions: {choice}\n" + # upper bound experiment + elif input_format == "QCML": + input = f"Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {lecture}\n" + elif input_format == "QCME": + input = f"Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {solution}\n" + elif input_format == "QCMLE": + input = f"Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {lecture} {solution}\n" + + elif input_format == "QCLM": + input = f"Question: {question}\nContext: {context}\nBECAUSE: {lecture}\nOptions: {choice}\n" + elif input_format == "QCEM": + input = f"Question: {question}\nContext: {context}\nBECAUSE: {solution}\nOptions: {choice}\n" + elif input_format == "QCLEM": + input = f"Question: {question}\nContext: {context}\nBECAUSE: {lecture} {solution}\nOptions: {choice}\n" + + # Outputs + if test_example: + output = "Answer:" + elif output_format == 'A': + output = f"Answer: The answer is {answer}." + + elif output_format == 'AL': + output = f"Answer: The answer is {answer}. BECAUSE: {solution}" + elif output_format == 'AE': + output = f"Answer: The answer is {answer}. BECAUSE: {lecture}" + elif output_format == 'ALE': + output = f"Answer: The answer is {answer}. BECAUSE: {lecture} {solution}" + elif output_format == 'AEL': + output = f"Answer: The answer is {answer}. BECAUSE: {solution} {lecture}" + + elif output_format == 'LA': + output = f"Answer: {lecture} The answer is {answer}." + elif output_format == 'EA': + output = f"Answer: {solution} The answer is {answer}." + elif output_format == 'LEA': + output = f"Answer: {lecture} {solution} The answer is {answer}." + elif output_format == 'ELA': + output = f"Answer: {solution} {lecture} The answer is {answer}." + elif output_format == 'LEPA': + output = '' + if len(lecture.strip()) > 0: + output += f"LECTURE: {lecture}\n" + if len(solution.strip()) > 0: + output += f"SOLUTION: {solution}\n" + output += '###\n' + output += f"ANSWER: {answer}." 
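+    # Example shape (hypothetical values) for prompt_format="QCM-LEA":
+    #   input  -> "Question: ...\nContext: ...\nOptions: (A) ... (B) ...\n"
+    #   output -> "Answer: <lecture> <solution> The answer is A."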
+ + input = input.replace(" ", " ").strip() + output = output.replace(" ", " ").strip() + if input.endswith("BECAUSE:"): + input = input.replace("BECAUSE:", "").strip() + if output.endswith("BECAUSE:"): + output = output.replace("BECAUSE:", "").strip() + return input, output + + +def create_one_example(format, question, context, choice, answer, lecture, solution, test_example=True): + + input_format, output_format = format.split("-") + + ## Inputs + if input_format == "CQM": + input = f"Context: {context}\nQuestion: {question}\nOptions: {choice}\n" + elif input_format == "QCM": + input = f"Question: {question}\nContext: {context}\nOptions: {choice}\n" + # upper bound experiment + elif input_format == "QCML": + input = f"Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {lecture}\n" + elif input_format == "QCME": + input = f"Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {solution}\n" + elif input_format == "QCMLE": + input = f"Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {lecture} {solution}\n" + + elif input_format == "QCLM": + input = f"Question: {question}\nContext: {context}\nBECAUSE: {lecture}\nOptions: {choice}\n" + elif input_format == "QCEM": + input = f"Question: {question}\nContext: {context}\nBECAUSE: {solution}\nOptions: {choice}\n" + elif input_format == "QCLEM": + input = f"Question: {question}\nContext: {context}\nBECAUSE: {lecture} {solution}\nOptions: {choice}\n" + + # Outputs + if test_example: + output = "Answer:" + elif output_format == 'A': + output = f"Answer: The answer is {answer}." + + elif output_format == 'AL': + output = f"Answer: The answer is {answer}. BECAUSE: {solution}" + elif output_format == 'AE': + output = f"Answer: The answer is {answer}. BECAUSE: {lecture}" + elif output_format == 'ALE': + output = f"Answer: The answer is {answer}. BECAUSE: {lecture} {solution}" + elif output_format == 'AEL': + output = f"Answer: The answer is {answer}. BECAUSE: {solution} {lecture}" + + elif output_format == 'LA': + output = f"Answer: {lecture} The answer is {answer}." + elif output_format == 'EA': + output = f"Answer: {solution} The answer is {answer}." + elif output_format == 'LEA': + output = f"Answer: {lecture} {solution} The answer is {answer}." + elif output_format == 'ELA': + output = f"Answer: {solution} {lecture} The answer is {answer}." 
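+    # Unlike the chatbot variant above, this function returns a single string:
+    # the formatted question block immediately followed by the answer text.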
+ + text = input + output + text = text.replace(" ", " ").strip() + if text.endswith("BECAUSE:"): + text = text.replace("BECAUSE:", "").strip() + return text + + + +def create_one_example_gpt4(format, question, context, choice, answer, lecture, solution, test_example=True): + + input_format, output_format = format.split("-") + + ## Inputs + if input_format == "CQM": + input = f"Context: {context}\nQuestion: {question}\nOptions: {choice}\n" + elif input_format == "QCM": + input = f"Question: {question}\nContext: {context}\nOptions: {choice}\n" + # upper bound experiment + elif input_format == "QCML": + input = f"Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {lecture}\n" + elif input_format == "QCME": + input = f"Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {solution}\n" + elif input_format == "QCMLE": + input = f"Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {lecture} {solution}\n" + + elif input_format == "QCLM": + input = f"Question: {question}\nContext: {context}\nBECAUSE: {lecture}\nOptions: {choice}\n" + elif input_format == "QCEM": + input = f"Question: {question}\nContext: {context}\nBECAUSE: {solution}\nOptions: {choice}\n" + elif input_format == "QCLEM": + input = f"Question: {question}\nContext: {context}\nBECAUSE: {lecture} {solution}\nOptions: {choice}\n" + + # Outputs + if test_example: + output = "Answer:" + elif output_format == 'A': + output = f"Answer: The answer is {answer}." + + elif output_format == 'AL': + output = f"Answer: The answer is {answer}. BECAUSE: {solution}" + elif output_format == 'AE': + output = f"Answer: The answer is {answer}. BECAUSE: {lecture}" + elif output_format == 'ALE': + output = f"Answer: The answer is {answer}. BECAUSE: {lecture} {solution}" + elif output_format == 'AEL': + output = f"Answer: The answer is {answer}. BECAUSE: {solution} {lecture}" + + elif output_format == 'LA': + output = f"Answer: {lecture} The answer is {answer}." + elif output_format == 'EA': + output = f"Answer: {solution} The answer is {answer}." + elif output_format == 'LEA': + output = f"Answer: {lecture} {solution} The answer is {answer}." + elif output_format == 'ELA': + output = f"Answer: {solution} {lecture} The answer is {answer}." 
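+    # Here the same fields are returned as a (user, assistant) chat message
+    # pair rather than one concatenated string.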
+ + input = input.replace(" ", " ").strip() + output = output.replace(" ", " ").strip() + if output.endswith("BECAUSE:"): + output = output.replace("BECAUSE:", "").strip() + + user_prompt = {"role": "user", "content": f"Can you explain {input}?"} + assistant_prompt = {"role": "assistant", "content": f"{output}"} + + return user_prompt, assistant_prompt + + +def build_prompt_chatbot(problems, shot_qids, prompt_format, use_caption=False, options=["A", "B", "C", "D", "E"], is_test=False): + examples = {} + + for qid in shot_qids: + question = get_question_text(problems[qid]) + context = get_context_text(problems[qid], use_caption) + choice = get_choice_text(problems[qid], options) + answer = get_answer(problems[qid], options) + lecture = get_lecture_text(problems[qid]).replace('\\n', '\n') + solution = get_solution_text(problems[qid]).replace('\\n', '\n') + + train_example = create_one_example_chatbot(prompt_format, + question, + context, + choice, + answer, + lecture, + solution, + test_example=is_test) + examples[qid] = train_example + return examples + + +def build_prompt(problems, shot_qids, test_qid, args): + + examples = [] + + # n-shot training examples + for qid in shot_qids: + question = get_question_text(problems[qid]) + context = get_context_text(problems[qid], args.use_caption) + choice = get_choice_text(problems[qid], args.options) + answer = get_answer(problems[qid], args.options) + lecture = get_lecture_text(problems[qid]) + solution = get_solution_text(problems[qid]) + + train_example = create_one_example(args.prompt_format, + question, + context, + choice, + answer, + lecture, + solution, + test_example=False) + examples.append(train_example) + + # test example + question = get_question_text(problems[test_qid]) + context = get_context_text(problems[test_qid], args.use_caption) + choice = get_choice_text(problems[test_qid], args.options) + answer = get_answer(problems[test_qid], args.options) + lecture = get_lecture_text(problems[test_qid]) + solution = get_solution_text(problems[test_qid]) + + test_example = create_one_example(args.prompt_format, + question, + context, + choice, + answer, + lecture, + solution, + test_example=True) + examples.append(test_example) + + # create the prompt input + prompt_input = '\n\n'.join(examples) + + return prompt_input + + +def build_prompt_gpt4(problems, shot_qids, test_qid, args): + + prompt_array = [{"role": "system", "content": "You are a helpful assistant."}] + + # n-shot training examples + for qid in shot_qids: + question = get_question_text(problems[qid]) + context = get_context_text(problems[qid], args.use_caption) + choice = get_choice_text(problems[qid], args.options) + answer = get_answer(problems[qid], args.options) + lecture = get_lecture_text(problems[qid]) + solution = get_solution_text(problems[qid]) + + user_prompt, assistant_prompt = create_one_example_gpt4(args.prompt_format, + question, + context, + choice, + answer, + lecture, + solution, + test_example=False) + prompt_array.append(user_prompt) + prompt_array.append(assistant_prompt) + + # test example + question = get_question_text(problems[test_qid]) + context = get_context_text(problems[test_qid], args.use_caption) + choice = get_choice_text(problems[test_qid], args.options) + answer = get_answer(problems[test_qid], args.options) + lecture = get_lecture_text(problems[test_qid]) + solution = get_solution_text(problems[test_qid]) + + user_prompt, assistant_prompt = create_one_example_gpt4(args.prompt_format, + question, + context, + choice, + answer, + lecture, + 
solution, + test_example=True) + prompt_array.append(user_prompt) + prompt_array.append(assistant_prompt) + + return prompt_array \ No newline at end of file diff --git a/scripts/convert_vizwiz_for_submission.py b/scripts/convert_vizwiz_for_submission.py new file mode 100644 index 0000000..e43cd45 --- /dev/null +++ b/scripts/convert_vizwiz_for_submission.py @@ -0,0 +1,47 @@ +import os +import argparse +import json + +from tinyllava.eval.m4c_evaluator import EvalAIAnswerProcessor + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('--annotation-file', type=str, required=True) + parser.add_argument('--result-file', type=str, required=True) + parser.add_argument('--result-upload-file', type=str, required=True) + return parser.parse_args() + + +if __name__ == '__main__': + + args = parse_args() + + os.makedirs(os.path.dirname(args.result_upload_file), exist_ok=True) + + results = [] + error_line = 0 + for line_idx, line in enumerate(open(args.result_file)): + try: + results.append(json.loads(line)) + except: + error_line += 1 + results = {x['question_id']: x['text'] for x in results} + test_split = [json.loads(line) for line in open(args.annotation_file)] + split_ids = set([x['question_id'] for x in test_split]) + + print(f'total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}') + + all_answers = [] + + answer_processor = EvalAIAnswerProcessor() + + for x in test_split: + assert x['question_id'] in results + all_answers.append({ + 'image': x['image'], + 'answer': answer_processor(results[x['question_id']]) + }) + + with open(args.result_upload_file, 'w') as f: + json.dump(all_answers, f) diff --git a/scripts/convert_vqav2_for_submission.py b/scripts/convert_vqav2_for_submission.py new file mode 100644 index 0000000..cedd291 --- /dev/null +++ b/scripts/convert_vqav2_for_submission.py @@ -0,0 +1,56 @@ +import os +import argparse +import json + +from tinyllava.eval.m4c_evaluator import EvalAIAnswerProcessor + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('--dir', type=str, default="./playground/data/eval/vqav2") + parser.add_argument('--ckpt', type=str, required=True) + parser.add_argument('--split', type=str, required=True) + return parser.parse_args() + + +if __name__ == '__main__': + + args = parse_args() + + src = os.path.join(args.dir, 'answers', args.split, args.ckpt, 'merge.jsonl') + test_split = os.path.join(args.dir, 'llava_vqav2_mscoco_test2015.jsonl') + dst = os.path.join(args.dir, 'answers_upload', args.split, f'{args.ckpt}.json') + os.makedirs(os.path.dirname(dst), exist_ok=True) + + results = [] + error_line = 0 + for line_idx, line in enumerate(open(src)): + try: + results.append(json.loads(line)) + except: + error_line += 1 + + results = {x['question_id']: x['text'] for x in results} + test_split = [json.loads(line) for line in open(test_split)] + split_ids = set([x['question_id'] for x in test_split]) + + print(f'total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}') + + all_answers = [] + + answer_processor = EvalAIAnswerProcessor() + + for x in test_split: + if x['question_id'] not in results: + all_answers.append({ + 'question_id': x['question_id'], + 'answer': '' + }) + else: + all_answers.append({ + 'question_id': x['question_id'], + 'answer': answer_processor(results[x['question_id']]) + }) + + with open(dst, 'w') as f: + json.dump(all_answers, open(dst, 'w')) diff --git a/scripts/extract_mm_projector.py b/scripts/extract_mm_projector.py new 
file mode 100644 index 0000000..af7e6a6 --- /dev/null +++ b/scripts/extract_mm_projector.py @@ -0,0 +1,47 @@ +""" +This is just a utility that I use to extract the projector for quantized models. +It is NOT necessary at all to train, or run inference/serve demos. +Use this script ONLY if you fully understand its implications. +""" + + +import os +import argparse +import torch +import json +from collections import defaultdict + + +def parse_args(): + parser = argparse.ArgumentParser(description='Extract MMProjector weights') + parser.add_argument('--model-path', type=str, help='model folder') + parser.add_argument('--output', type=str, help='output file') + args = parser.parse_args() + return args + + +if __name__ == '__main__': + args = parse_args() + + keys_to_match = ['mm_projector'] + ckpt_to_key = defaultdict(list) + try: + model_indices = json.load(open(os.path.join(args.model_path, 'pytorch_model.bin.index.json'))) + for k, v in model_indices['weight_map'].items(): + if any(key_match in k for key_match in keys_to_match): + ckpt_to_key[v].append(k) + except FileNotFoundError: + # Smaller models or model checkpoints saved by DeepSpeed. + v = 'pytorch_model.bin' + for k in torch.load(os.path.join(args.model_path, v), map_location='cpu').keys(): + if any(key_match in k for key_match in keys_to_match): + ckpt_to_key[v].append(k) + + loaded_weights = {} + + for ckpt_name, weight_keys in ckpt_to_key.items(): + ckpt = torch.load(os.path.join(args.model_path, ckpt_name), map_location='cpu') + for k in weight_keys: + loaded_weights[k] = ckpt[k] + + torch.save(loaded_weights, args.output) diff --git a/scripts/finetune.sh b/scripts/finetune.sh new file mode 100644 index 0000000..c36c3ea --- /dev/null +++ b/scripts/finetune.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +# IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5! + +# Uncomment and set the following variables correspondingly to run this script: + +################## VICUNA ################## +# PROMPT_VERSION=v1 +# MODEL_VERSION="vicuna-v1-3-7b" +################## VICUNA ################## + +################## LLaMA-2 ################## +# PROMPT_VERSION="llava_llama_2" +# MODEL_VERSION="llama-2-7b-chat" +################## LLaMA-2 ################## + +deepspeed llava/train/train_mem.py \ + --deepspeed ./scripts/zero2.json \ + --model_name_or_path ./checkpoints/$MODEL_VERSION \ + --version $PROMPT_VERSION \ + --data_path ./playground/data/llava_instruct_80k.json \ + --image_folder /path/to/coco/train2017 \ + --vision_tower openai/clip-vit-large-patch14 \ + --pretrain_mm_mlp_adapter ./checkpoints/llava-$MODEL_VERSION-pretrain/mm_projector.bin \ + --mm_vision_select_layer -2 \ + --mm_use_im_start_end False \ + --mm_use_im_patch_token False \ + --bf16 True \ + --output_dir ./checkpoints/llava-$MODEL_VERSION-finetune \ + --num_train_epochs 1 \ + --per_device_train_batch_size 16 \ + --per_device_eval_batch_size 4 \ + --gradient_accumulation_steps 1 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 50000 \ + --save_total_limit 1 \ + --learning_rate 2e-5 \ + --weight_decay 0. 
\ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --tf32 True \ + --model_max_length 2048 \ + --gradient_checkpointing True \ + --dataloader_num_workers 4 \ + --lazy_preprocess True \ + --report_to wandb diff --git a/scripts/finetune_full_schedule.sh b/scripts/finetune_full_schedule.sh new file mode 100644 index 0000000..2ae157c --- /dev/null +++ b/scripts/finetune_full_schedule.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +# IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5! + +# Uncomment and set the following variables correspondingly to run this script: + +################## VICUNA ################## +# PROMPT_VERSION=v1 +# MODEL_VERSION="vicuna-v1-3-7b" +################## VICUNA ################## + +################## LLaMA-2 ################## +# PROMPT_VERSION="llava_llama_2" +# MODEL_VERSION="llama-2-7b-chat" +################## LLaMA-2 ################## + +deepspeed llava/train/train_mem.py \ + --deepspeed ./scripts/zero2.json \ + --model_name_or_path ./checkpoints/$MODEL_VERSION \ + --version $PROMPT_VERSION \ + --data_path ./playground/data/llava_instruct_158k.json \ + --image_folder /path/to/coco/train2017 \ + --vision_tower openai/clip-vit-large-patch14 \ + --pretrain_mm_mlp_adapter ./checkpoints/llava-$MODEL_VERSION-pretrain/mm_projector.bin \ + --mm_vision_select_layer -2 \ + --mm_use_im_start_end False \ + --mm_use_im_patch_token False \ + --bf16 True \ + --output_dir ./checkpoints/llava-$MODEL_VERSION-finetune \ + --num_train_epochs 3 \ + --per_device_train_batch_size 16 \ + --per_device_eval_batch_size 4 \ + --gradient_accumulation_steps 1 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 50000 \ + --save_total_limit 1 \ + --learning_rate 2e-5 \ + --weight_decay 0. \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --tf32 True \ + --model_max_length 2048 \ + --gradient_checkpointing True \ + --dataloader_num_workers 4 \ + --lazy_preprocess True \ + --report_to wandb diff --git a/scripts/finetune_lora.sh b/scripts/finetune_lora.sh new file mode 100644 index 0000000..0456106 --- /dev/null +++ b/scripts/finetune_lora.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +# IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5! 
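+# Note: after LoRA fine-tuning, the adapter can be merged back into the base
+# model with scripts/merge_lora_weights.py (added in this patch), e.g. with
+# hypothetical output paths:
+#   python scripts/merge_lora_weights.py --model-path ./checkpoints/llava-$MODEL_VERSION-finetune_lora \
+#     --model-base ./checkpoints/$MODEL_VERSION --save-model-path ./checkpoints/llava-$MODEL_VERSION-merged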
+ +# Uncomment and set the following variables correspondingly to run this script: + +################## VICUNA ################## +# PROMPT_VERSION=v1 +# MODEL_VERSION="vicuna-v1-3-7b" +################## VICUNA ################## + +################## LLaMA-2 ################## +# PROMPT_VERSION="llava_llama_2" +# MODEL_VERSION="llama-2-7b-chat" +################## LLaMA-2 ################## + +deepspeed llava/train/train_mem.py \ + --deepspeed ./scripts/zero2.json \ + --lora_enable True \ + --model_name_or_path ./checkpoints/$MODEL_VERSION \ + --version $PROMPT_VERSION \ + --data_path ./playground/data/llava_instruct_80k.json \ + --image_folder /path/to/coco/train2017 \ + --vision_tower openai/clip-vit-large-patch14 \ + --pretrain_mm_mlp_adapter ./checkpoints/llava-$MODEL_VERSION-pretrain/mm_projector.bin \ + --mm_vision_select_layer -2 \ + --mm_use_im_start_end False \ + --mm_use_im_patch_token False \ + --bf16 True \ + --output_dir ./checkpoints/llava-$MODEL_VERSION-finetune_lora \ + --num_train_epochs 1 \ + --per_device_train_batch_size 16 \ + --per_device_eval_batch_size 4 \ + --gradient_accumulation_steps 1 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 50000 \ + --save_total_limit 1 \ + --learning_rate 2e-5 \ + --weight_decay 0. \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --tf32 True \ + --model_max_length 2048 \ + --gradient_checkpointing True \ + --lazy_preprocess True \ + --dataloader_num_workers 4 \ + --report_to wandb diff --git a/scripts/finetune_qlora.sh b/scripts/finetune_qlora.sh new file mode 100644 index 0000000..05744d9 --- /dev/null +++ b/scripts/finetune_qlora.sh @@ -0,0 +1,50 @@ +#!/bin/bash + +# IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5! + +# Uncomment and set the following variables correspondingly to run this script: + +################## VICUNA ################## +# PROMPT_VERSION=v1 +# MODEL_VERSION="vicuna-v1-3-7b" +################## VICUNA ################## + +################## LLaMA-2 ################## +# PROMPT_VERSION="llava_llama_2" +# MODEL_VERSION="llama-2-7b-chat" +################## LLaMA-2 ################## + +deepspeed llava/train/train_mem.py \ + --deepspeed ./scripts/zero2.json \ + --lora_enable True \ + --bits 4 \ + --model_name_or_path ./checkpoints/$MODEL_VERSION \ + --version $PROMPT_VERSION \ + --data_path ./playground/data/llava_instruct_80k.json \ + --image_folder /path/to/coco/train2017 \ + --vision_tower openai/clip-vit-large-patch14 \ + --pretrain_mm_mlp_adapter ./checkpoints/llava-$MODEL_VERSION-pretrain/mm_projector.bin \ + --mm_vision_select_layer -2 \ + --mm_use_im_start_end False \ + --mm_use_im_patch_token False \ + --bf16 True \ + --output_dir ./checkpoints/llava-$MODEL_VERSION-finetune_lora \ + --num_train_epochs 1 \ + --per_device_train_batch_size 16 \ + --per_device_eval_batch_size 4 \ + --gradient_accumulation_steps 1 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 50000 \ + --save_total_limit 1 \ + --learning_rate 2e-5 \ + --weight_decay 0. 
\ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --tf32 True \ + --model_max_length 2048 \ + --gradient_checkpointing True \ + --lazy_preprocess True \ + --dataloader_num_workers 4 \ + --report_to wandb diff --git a/scripts/finetune_sqa.sh b/scripts/finetune_sqa.sh new file mode 100644 index 0000000..146a8cb --- /dev/null +++ b/scripts/finetune_sqa.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +# IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5! + +deepspeed llava/train/train_mem.py \ + --deepspeed ./scripts/zero2.json \ + --model_name_or_path lmsys/vicuna-13b-v1.3 \ + --version $PROMPT_VERSION \ + --data_path /Data/ScienceQA/data/scienceqa/llava_train_QCM-LEA.json \ + --image_folder /Data/ScienceQA/data/scienceqa/images/train \ + --vision_tower openai/clip-vit-large-patch14 \ + --pretrain_mm_mlp_adapter ./checkpoints/huggingface/liuhaotian/llava-pretrain-vicuna-13b-v1.3/mm_projector.bin \ + --mm_vision_select_layer -2 \ + --mm_use_im_start_end False \ + --mm_use_im_patch_token False \ + --bf16 True \ + --output_dir ./checkpoints/llava-vicuna-13b-v1.3-pretrain_lcs558k_plain-ScienceQA_QCM_LEA-12e \ + --num_train_epochs 12 \ + --per_device_train_batch_size 16 \ + --per_device_eval_batch_size 4 \ + --gradient_accumulation_steps 1 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 50000 \ + --save_total_limit 1 \ + --learning_rate 2e-5 \ + --weight_decay 0. \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --tf32 True \ + --model_max_length 2048 \ + --gradient_checkpointing True \ + --dataloader_num_workers 4 \ + --lazy_preprocess True \ + --report_to wandb diff --git a/scripts/merge_lora_weights.py b/scripts/merge_lora_weights.py new file mode 100644 index 0000000..b97d8aa --- /dev/null +++ b/scripts/merge_lora_weights.py @@ -0,0 +1,22 @@ +import argparse +from tinyllava.model.builder import load_pretrained_model +from tinyllava.mm_utils import get_model_name_from_path + + +def merge_lora(args): + model_name = get_model_name_from_path(args.model_path) + tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, device_map='cpu') + + model.save_pretrained(args.save_model_path) + tokenizer.save_pretrained(args.save_model_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--model-path", type=str, required=True) + parser.add_argument("--model-base", type=str, required=True) + parser.add_argument("--save-model-path", type=str, required=True) + + args = parser.parse_args() + + merge_lora(args) diff --git a/scripts/pretrain.sh b/scripts/pretrain.sh new file mode 100644 index 0000000..cb70599 --- /dev/null +++ b/scripts/pretrain.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +# IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5! 
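+# Note: this stage tunes only the multimodal projector (--tune_mm_mlp_adapter True);
+# the resulting mm_projector.bin under the output_dir is what the finetune
+# scripts load via --pretrain_mm_mlp_adapter.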
+ +# Uncomment and set the following variables correspondingly to run this script: + +# MODEL_VERSION=vicuna-v1-3-7b +# MODEL_VERSION=llama-2-7b-chat + +########### DO NOT CHANGE ########### +########### USE THIS FOR BOTH ########### +PROMPT_VERSION=plain +########### DO NOT CHANGE ########### + +deepspeed llava/train/train_mem.py \ + --deepspeed ./scripts/zero2.json \ + --model_name_or_path ./checkpoints/$MODEL_VERSION \ + --version $PROMPT_VERSION \ + --data_path /path/to/pretrain_data.json \ + --image_folder /path/to/images \ + --vision_tower openai/clip-vit-large-patch14 \ + --tune_mm_mlp_adapter True \ + --mm_vision_select_layer -2 \ + --mm_use_im_start_end False \ + --mm_use_im_patch_token False \ + --bf16 True \ + --output_dir ./checkpoints/llava-$MODEL_VERSION-pretrain \ + --num_train_epochs 1 \ + --per_device_train_batch_size 16 \ + --per_device_eval_batch_size 4 \ + --gradient_accumulation_steps 1 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 24000 \ + --save_total_limit 1 \ + --learning_rate 2e-3 \ + --weight_decay 0. \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --tf32 True \ + --model_max_length 2048 \ + --gradient_checkpointing True \ + --dataloader_num_workers 4 \ + --lazy_preprocess True \ + --report_to wandb diff --git a/scripts/pretrain_xformers.sh b/scripts/pretrain_xformers.sh new file mode 100644 index 0000000..17c6fa4 --- /dev/null +++ b/scripts/pretrain_xformers.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +# Uncomment and set the following variables correspondingly to run this script: + +# MODEL_VERSION=vicuna-v1-3-7b +# MODEL_VERSION=llama-2-7b-chat + +########### DO NOT CHANGE ########### +########### USE THIS FOR BOTH ########### +PROMPT_VERSION=plain +########### DO NOT CHANGE ########### + +deepspeed llava/train/train_xformers.py \ + --deepspeed ./scripts/zero2.json \ + --model_name_or_path ./checkpoints/$MODEL_VERSION \ + --version $PROMPT_VERSION \ + --data_path /path/to/pretrain_data.json \ + --image_folder /path/to/images \ + --vision_tower openai/clip-vit-large-patch14 \ + --tune_mm_mlp_adapter True \ + --mm_vision_select_layer -2 \ + --mm_use_im_start_end False \ + --mm_use_im_patch_token False \ + --bf16 False \ + --output_dir ./checkpoints/llava-$MODEL_VERSION-pretrain \ + --num_train_epochs 1 \ + --per_device_train_batch_size 4 \ + --per_device_eval_batch_size 4 \ + --gradient_accumulation_steps 4 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 24000 \ + --save_total_limit 1 \ + --learning_rate 2e-3 \ + --weight_decay 0. 
\ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --tf32 False \ + --model_max_length 2048 \ + --gradient_checkpointing True \ + --dataloader_num_workers 4 \ + --lazy_preprocess True \ + --report_to wandb diff --git a/scripts/sqa_eval_batch.sh b/scripts/sqa_eval_batch.sh new file mode 100644 index 0000000..ad857ae --- /dev/null +++ b/scripts/sqa_eval_batch.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +CHUNKS=8 +for IDX in {0..7}; do + CUDA_VISIBLE_DEVICES=$IDX python -m llava.eval.model_vqa_science \ + --model-path liuhaotian/llava-lcs558k-scienceqa-vicuna-13b-v1.3 \ + --question-file ~/haotian/datasets/ScienceQA/data/scienceqa/llava_test_QCM-LEA.json \ + --image-folder ~/haotian/datasets/ScienceQA/data/scienceqa/images/test \ + --answers-file ./test_llava-13b-chunk$CHUNKS_$IDX.jsonl \ + --num-chunks $CHUNKS \ + --chunk-idx $IDX \ + --conv-mode llava_v1 & +done diff --git a/scripts/sqa_eval_gather.sh b/scripts/sqa_eval_gather.sh new file mode 100644 index 0000000..d44904d --- /dev/null +++ b/scripts/sqa_eval_gather.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +CHUNKS=8 +output_file="test_llava-13b.jsonl" + +# Clear out the output file if it exists. +> "$output_file" + +# Loop through the indices and concatenate each file. +for idx in $(seq 0 $((CHUNKS-1))); do + cat "./test_llava-13b-chunk${idx}.jsonl" >> "$output_file" +done + +python llava/eval/eval_science_qa.py \ + --base-dir ~/haotian/datasets/ScienceQA/data/scienceqa \ + --result-file ./test_llava-13b.jsonl \ + --output-file ./test_llava-13b_output.json \ + --output-result ./test_llava-13b_result.json diff --git a/scripts/tiny_llava/.ipynb_checkpoints/finetune-checkpoint.sh b/scripts/tiny_llava/.ipynb_checkpoints/finetune-checkpoint.sh new file mode 100644 index 0000000..9d68242 --- /dev/null +++ b/scripts/tiny_llava/.ipynb_checkpoints/finetune-checkpoint.sh @@ -0,0 +1,55 @@ +#!/bin/bash +if [ $# -ne 4 ]; then + echo "Usage: $0 " + exit 1 +fi + +# Assign the arguments to variables +LLM_VERSION="$1" +VT_VERSION="$2" +DATA_PATH="$3" +IMAGE_PATH="$4" + +# LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0 +# VT_VERSION=openai/clip-vit-base-patch16 +# DATA_PATH=/root/autodl-tmp/data/pretraining_data/LLaVA-Data/text_files/llava_v1_5_mix665k.json +# IMAGE_PATH=/root/autodl-tmp/data/pretraining_data/LLaVA-Data +VT_VARIANT="${VT_VERSION#*/}" + + +deepspeed llava/train/train_mem.py \ + --deepspeed ./scripts/zero3.json \ + --model_name_or_path $LLM_VERSION \ + --version v1 \ + --data_path $DATA_PATH \ + --image_folder $IMAGE_PATH\ + --vision_tower $VT_VERSION \ + --pretrain_mm_mlp_adapter ./checkpoints/tiny-llava-v1-1.1B-${VT_VARIANT}-pretrain/mm_projector.bin \ + --mm_projector_type mlp2x_gelu \ + --mm_vision_select_layer -2 \ + --mm_use_im_start_end False \ + --mm_use_im_patch_token False \ + --image_aspect_ratio pad \ + --group_by_modality_length True \ + --fp16 True \ + --output_dir ./checkpoints/tiny-llava-v1-1.1B-${VT_VARIANT}-finetune \ + --num_train_epochs 1 \ + --per_device_train_batch_size 8 \ + --per_device_eval_batch_size 4 \ + --gradient_accumulation_steps 1 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 5000 \ + --save_total_limit 1 \ + --learning_rate 1e-5 \ + --weight_decay 0. 
\ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --tf32 False \ + --model_max_length 2048 \ + --gradient_checkpointing True \ + --dataloader_num_workers 30 \ + --lazy_preprocess True \ + --report_to wandb \ + --run_name llava-finetune-tinyllama1.1B-${VT_VARIANT} diff --git a/scripts/tiny_llava/.ipynb_checkpoints/finetune_lora-checkpoint.sh b/scripts/tiny_llava/.ipynb_checkpoints/finetune_lora-checkpoint.sh new file mode 100644 index 0000000..ad669f0 --- /dev/null +++ b/scripts/tiny_llava/.ipynb_checkpoints/finetune_lora-checkpoint.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +if [ $# -ne 4 ]; then + echo "Usage: $0 " + exit 1 +fi + +# Assign the arguments to variables +LLM_VERSION="$1" +VT_VERSION="$2" +DATA_PATH="$3" +IMAGE_PATH="$4" + + +VT_VARIANT="${VT_VERSION#*/}" +LLM_VARIANT="${LLM_VERSION#*/}" + +deepspeed llava/train/train_mem.py \ + --lora_enable True --lora_r 128 --lora_alpha 256 \ + --deepspeed ./scripts/zero3.json \ + --model_name_or_path ./checkpoints/tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}-pretrain \ + --version tiny_llama \ + --data_path $DATA_PATH \ + --image_folder $IMAGE_PATH \ + --vision_tower $VT_VERSION \ + --mm_projector_type mlp2x_gelu \ + --mm_vision_select_layer -2 \ + --mm_use_im_start_end False \ + --mm_use_im_patch_token False \ + --image_aspect_ratio pad \ + --group_by_modality_length True \ + --fp16 True \ + --output_dir ./checkpoints/tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}-finetune-lora \ + --num_train_epochs 1 \ + --per_device_train_batch_size 8 \ + --per_device_eval_batch_size 4 \ + --gradient_accumulation_steps 1 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 5000 \ + --save_total_limit 1 \ + --learning_rate 1e-5 \ + --weight_decay 0. \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --tf32 False \ + --model_max_length 2048 \ + --gradient_checkpointing True \ + --dataloader_num_workers 30 \ + --lazy_preprocess True \ + --report_to wandb \ + --run_name tiny-llava-v1.5-finetune-lora-${LLM_VARIANT}-${VT_VARIANT} diff --git a/scripts/tiny_llava/.ipynb_checkpoints/pretrain-checkpoint.sh b/scripts/tiny_llava/.ipynb_checkpoints/pretrain-checkpoint.sh new file mode 100755 index 0000000..9a6dc60 --- /dev/null +++ b/scripts/tiny_llava/.ipynb_checkpoints/pretrain-checkpoint.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +if [ $# -ne 4 ]; then + echo "Usage: $0 " + exit 1 +fi + +# Assign the arguments to variables +LLM_VERSION="$1" +VT_VERSION="$2" +DATA_PATH="$3" +IMAGE_PATH="$4" + +echo "$VT_VERSION" +# LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0 +# VT_VERSION=openai/clip-vit-base-patch16 +# DATA_PATH=/root/autodl-tmp/data/llava/blip_laion_cc_sbu_558k.json +# IMAGE_PATH=/root/autodl-tmp/data/llava/llava_pretrain/images +VT_VARIANT="${VT_VERSION#*/}" + +deepspeed llava/train/train_mem.py \ + --deepspeed ./scripts/zero2.json \ + --model_name_or_path $LLM_VERSION \ + --version plain \ + --data_path $DATA_PATH\ + --image_folder $IMAGE_PATH \ + --vision_tower $VT_VERSION \ + --mm_projector_type mlp2x_gelu \ + --tune_mm_mlp_adapter True \ + --mm_vision_select_layer -2 \ + --mm_use_im_start_end False \ + --mm_use_im_patch_token False \ + --fp16 True \ + --output_dir ./checkpoints/tiny-llava-v1-1.1B-${VT_VARIANT}-pretrain \ + --num_train_epochs 1 \ + --per_device_train_batch_size 32 \ + --per_device_eval_batch_size 4 \ + --gradient_accumulation_steps 1 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 2400 \ + --save_total_limit 1 \ + --learning_rate 1e-3 \ + 
--weight_decay 0. \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --tf32 False \ + --model_max_length 2048 \ + --gradient_checkpointing True \ + --dataloader_num_workers 30 \ + --lazy_preprocess True \ + --report_to wandb \ + --run_name llava-pretrain-tinyllama1.1B-${VT_VARIANT} diff --git a/scripts/tiny_llava/.ipynb_checkpoints/pretrain_baselines-checkpoint.sh b/scripts/tiny_llava/.ipynb_checkpoints/pretrain_baselines-checkpoint.sh new file mode 100644 index 0000000..9179c8f --- /dev/null +++ b/scripts/tiny_llava/.ipynb_checkpoints/pretrain_baselines-checkpoint.sh @@ -0,0 +1,9 @@ +LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0 +VT_VERSIONS=(openai/clip-vit-large-patch14-336 openai/clip-vit-large-patch14 openai/clip-vit-base-patch16 openai/clip-vit-large-patch32) +DATA_PATH=/root/autodl-tmp/data/llava/blip_laion_cc_sbu_558k.json +IMAGE_PATH=/root/autodl-tmp/data/llava/llava_pretrain/images + + +for VT_VERSION in "${VT_VERSIONS[@]}"; do + bash scripts/tiny_llava/pretrain.sh "$LLM_VERSION" "$VT_VERSION" "$DATA_PATH" "$IMAGE_PATH" +done \ No newline at end of file diff --git a/scripts/tiny_llava/.ipynb_checkpoints/train_baselines-checkpoint.sh b/scripts/tiny_llava/.ipynb_checkpoints/train_baselines-checkpoint.sh new file mode 100644 index 0000000..c01c2a5 --- /dev/null +++ b/scripts/tiny_llava/.ipynb_checkpoints/train_baselines-checkpoint.sh @@ -0,0 +1,15 @@ +LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0 +VT_VERSIONS=(openai/clip-vit-large-patch14 openai/clip-vit-base-patch16) +DATA_PATH=/root/autodl-tmp/data/llava/blip_laion_cc_sbu_558k.json +FINETUNE_DATA_PATH=/root/autodl-tmp/data/text_files/llava_v1_5_mix665k.json +IMAGE_PATH=/root/autodl-tmp/data/llava/llava_pretrain/images +FINETUNE_IMAGE_PATH=/root/autodl-tmp/data + +# bash scripts/tiny_llava/finetune.sh "$LLM_VERSION" openai/clip-vit-large-patch14-336 "$FINETUNE_DATA_PATH" "$FINETUNE_IMAGE_PATH" +for VT_VERSION in "${VT_VERSIONS[@]}"; do + # bash scripts/tiny_llava/pretrain.sh "$LLM_VERSION" "$VT_VERSION" "$DATA_PATH" "$IMAGE_PATH" + bash scripts/tiny_llava/finetune.sh "$LLM_VERSION" "$VT_VERSION" "$FINETUNE_DATA_PATH" "$FINETUNE_IMAGE_PATH" +done + +bash scripts/tiny_llava/pretrain.sh "$LLM_VERSION" openai/clip-vit-base-patch32 "$DATA_PATH" "$IMAGE_PATH" +bash scripts/tiny_llava/finetune.sh "$LLM_VERSION" openai/clip-vit-base-patch32 "$FINETUNE_DATA_PATH" "$FINETUNE_IMAGE_PATH" \ No newline at end of file diff --git a/scripts/tiny_llava/.ipynb_checkpoints/train_baselines_phi_jia-checkpoint.sh b/scripts/tiny_llava/.ipynb_checkpoints/train_baselines_phi_jia-checkpoint.sh new file mode 100644 index 0000000..d85efef --- /dev/null +++ b/scripts/tiny_llava/.ipynb_checkpoints/train_baselines_phi_jia-checkpoint.sh @@ -0,0 +1,18 @@ +LLM_VERSION=susnato/phi-1_5_dev +# LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0 +VT_VERSIONS=(openai/clip-vit-large-patch14-336) +#DATA_PATH=/root/autodl-tmp/data/text_files/really_cleaned_share-captioner_coco_lcs_sam_1246k_1107.json +DATA_PATH=/root/autodl-tmp/data/llava/blip_laion_cc_sbu_558k.json +#FINETUNE_DATA_PATH=/root/autodl-tmp/data/text_files/cleaned_sharegpt4v_mix665k_cap23k_coco-ap9k_lcs3k_sam9k_div2k.json +FINETUNE_DATA_PATH=/root/autodl-tmp/data/text_files/llava_v1_5_mix665k.json +#IMAGE_PATH=/root/autodl-tmp/data/ +#FINETUNE_IMAGE_PATH=/root/autodl-tmp/data +IMAGE_PATH=/root/autodl-tmp/data/llava/llava_pretrain/images +FINETUNE_IMAGE_PATH=/root/autodl-tmp/data + + +for VT_VERSION in "${VT_VERSIONS[@]}"; do + bash scripts/tiny_llava/pretrain.sh "$LLM_VERSION" 
"$VT_VERSION" "$DATA_PATH" "$IMAGE_PATH" + bash scripts/tiny_llava/finetune_lora.sh "$LLM_VERSION" "$VT_VERSION" "$FINETUNE_DATA_PATH" "$FINETUNE_IMAGE_PATH" +done + diff --git a/scripts/tiny_llava/LOGS/LOG.md b/scripts/tiny_llava/LOGS/LOG.md new file mode 100644 index 0000000..2a81851 --- /dev/null +++ b/scripts/tiny_llava/LOGS/LOG.md @@ -0,0 +1,561 @@ +# 实验1:unlock-vit-from-12-tune-entire-model +## 实验时间:2024年1月30日23点10分 +## 实验重要参数: + * LLM: TinyLlama/TinyLlama-1.1B-Chat-v1.0 + * VT: openai/clip-vit-large-patch14-336 + * CM: MLP + * LoRA: No + * Unlock ViT From: 12 + * pretrain lr&batch size: 2e-5 256 + * finetune lr&batch size: 2e-5 128 + * data: standard llava-1.5 + * data type: fp16 +## 训练策略: +预训练:将ViT, MLP和LLM同时打开,跟随ShareGPT4V的论文,ViT从第12层开始打开 +微调:与LLaVA一致 +## 实验结果: + * GQA: 58.28 + * SQA: 57.06 + * TextVQA: 43.17 + * VQAv2: 74.02 + * VizWiz: + * MMVet: + * POPE: adversarial: 0.835 random: 0.876 popular: 0.869 +## 实验分析: +本次实验中TextVQA和baseline(46.37)的效果差很多,我认为有可能是因为微调CLIP使CLIP的泛化性受到损伤,而TextVQA这个任务是非常细粒度的任务,导致效果减少最大。如果要提升效果,应当从更好的数据(ShareGPT4V尝试) + +# 实验2:unlock-vit-from-18-tune-entire-model +## 实验时间:2024年1月31日13点37分 +## 实验重要参数: + * LLM: TinyLlama/TinyLlama-1.1B-Chat-v1.0 + * VT: openai/clip-vit-large-patch14-336 + * CM: MLP + * LoRA: No + * Unlock ViT From: 18 + * pretrain lr&batch size: 2e-5 256 + * finetune lr&batch size: 2e-5 128 + * data: standard llava-1.5 + * data type: fp16 +## 训练策略: +预训练:将ViT, MLP和LLM同时打开,ViT从第18层开始打开 +微调:与LLaVA一致 +## 实验结果: + * GQA: 58.32 + * SQA: 54.24 + * TextVQA: 43.44 + * VQAv2: 73.89 + * VizWiz: + * POPE: adversarial: 0.840 random: 0.876 popular: 0.870 +## 实验分析: +本次实验中TextVQA和SQA与baseline(46.37, 59.4)的效果差很多,和上组实验的分析相同,应该是实验数据对CLIP的泛化性损伤了。 + +# 实验3:unlock-vit-from-21-tune-entire-model +## 实验时间:2024年1月30日13点37分 +## 实验重要参数: + * LLM: TinyLlama/TinyLlama-1.1B-Chat-v1.0 + * VT: openai/clip-vit-large-patch14-336 + * CM: MLP + * LoRA: No + * Unlock ViT From: 21 + * pretrain lr&batch size: 2e-5 256 + * finetune lr&batch size: 2e-5 128 + * data: standard llava-1.5 + * data type: fp16 +## 训练策略: +预训练:将ViT, MLP和LLM同时打开,ViT从第21层开始打开 +微调:与LLaVA一致 +## 实验结果: + * GQA: 58.17 + * SQA: 58.25 + * TextVQA: 43.93 + * VQAv2: + * VizWiz: + * POPE: adversarial: 0.838 random: 0.875 popular: 0.867 +## 实验分析: +SQA的表现与从18打开相比略好,但12, 18, 21之间没有观察到可见规律,需要看看第15层打开时什么情况 + +# 实验4:standard-llava-transformers-4.36.1 +## 实验时间:2024年2月1日13点37分 +## 实验重要参数: + * LLM: TinyLlama/TinyLlama-1.1B-Chat-v1.0 + * VT: openai/clip-vit-large-patch14-336 + * CM: MLP + * LoRA: No + * Unlock ViT From: No + * pretrain lr&batch size: 1e-3 256 + * finetune lr&batch size: 2e-5 128 + * data: standard llava-1.5 + * data type: fp16 +## 训练策略: +预训练:与LLaVA一致 +微调:与LLaVA一致 +## 实验结果: + * GQA: 58.05 + * SQA: 60.24 + * TextVQA: 45.83 + * VQAv2: + * VizWiz: +## 实验分析: +这个仅是baseline在transformer版本升级后的复现,不应该有什么变化,但是由于tokenizers版本的升级,会出现mismatch,更新train.py的代码后(use_fast=True或使用LLaVA-1.6的补丁)可以兼容升级。 +TextVQA的成绩略有下降 + +# 实验5:sharegpt4v-unlock-vit-from-18-tune-entire-model +## 实验时间:2024年2月2日10点40分 +## 实验重要参数: + * LLM: TinyLlama/TinyLlama-1.1B-Chat-v1.0 + * VT: openai/clip-vit-large-patch14-336 + * CM: MLP + * LoRA: No + * Unlock ViT From: 18 + * mm_mlp_pretrain: standard-llava-transformers-4.36.1's pretrain mlp + * pretrain lr&batch size: 2e-5 256 (2 gradient accumulation steps) + * finetune lr&batch size: 2e-5 128 + * data: sharegpt4v + * data type: fp16 +## 训练策略: +预训练:MLP使用standard-llava-transformers-4.36.1初始化,在sharegpt4v的pretrain数据上对齐 +微调:与ShareGPT4V一致 +## 实验结果: + * GQA: 59.43 + * SQA: 58.7 + * TextVQA: 48.22 + * VQAv2: + * VizWiz: +## 
+
+# Experiment 6: sharegpt4v-unlock-vit-from-12-tune-entire-model
+## Time: 2024-02-02 20:00
+## Key parameters:
+ * LLM: TinyLlama/TinyLlama-1.1B-Chat-v1.0
+ * VT: openai/clip-vit-large-patch14-336
+ * CM: MLP
+ * LoRA: No
+ * Unlock ViT From: 12
+ * mm_mlp_pretrain: standard-llava-transformers-4.36.1's pretrain mlp
+ * pretrain lr&batch size: 2e-5 256 (2 gradient accumulation steps)
+ * finetune lr&batch size: 2e-5 128
+ * data: sharegpt4v
+ * data type: fp16
+## Training strategy:
+Pretraining: the MLP is initialized from standard-llava-transformers-4.36.1 and aligned on the sharegpt4v pretrain data.
+Finetuning: same as ShareGPT4V.
+## Results:
+ * GQA: 59.43
+ * SQA: 58.80
+ * TextVQA: 48.05
+ * VQAv2: 75.24
+ * VizWiz: 34.74
+ * MMVet: 25.1
+ * POPE: adversarial: 0.839 random: 0.880 popular: 0.858
+## Analysis:
+The ShareGPT4V paper claims that unlocking the ViT from layer 12 gives the best results; this experiment verifies that claim.
+
+# Experiment 7: sharegpt4v-unlock-vit-from-15-tune-entire-model
+## Time: 2024-02-02 20:00
+## Key parameters:
+ * LLM: TinyLlama/TinyLlama-1.1B-Chat-v1.0
+ * VT: openai/clip-vit-large-patch14-336
+ * CM: MLP
+ * LoRA: No
+ * Unlock ViT From: 15
+ * mm_mlp_pretrain: standard-llava-transformers-4.36.1's pretrain mlp
+ * pretrain lr&batch size: 2e-5 256 (2 gradient accumulation steps)
+ * finetune lr&batch size: 2e-5 128
+ * data: sharegpt4v
+ * data type: fp16
+## Training strategy:
+Pretraining: the MLP is initialized from standard-llava-transformers-4.36.1 and aligned on the sharegpt4v pretrain data.
+Finetuning: same as ShareGPT4V.
+## Results:
+ * GQA: 59.43
+ * SQA: 58.95
+ * TextVQA: 48.18
+ * VQAv2: 75.23
+ * VizWiz:
+ * MMVet: 24
+ * POPE: adversarial: 0.840 random: 0.880 popular: 0.860
+## Analysis:
+Ablation: together with unlocking from layers 12 and 18 this forms one ablation group; layer 21 could be added if time allows.
+
+# Experiment 8: moe-mlp-unlock-vit-from-12-tune-entire-model
+## Time: 2024-02-02 20:00
+## Key parameters:
+ * LLM: TinyLlama/TinyLlama-1.1B-Chat-v1.0
+ * VT: openai/clip-vit-large-patch14-336
+ * CM: MLP
+ * LoRA: No
+ * Unlock ViT From: 12
+ * pretrain: initialized from sharegpt4v-unlock-vit-from-12-tune-entire-model
+ * finetune lr&batch size: 2e-5 128
+ * data: sharegpt4v
+ * data type: fp16
+## Training strategy:
+Finetuning: same as ShareGPT4V.
+## Results:
+ * GQA:
+ * SQA:
+ * TextVQA:
+ * VQAv2:
+ * VizWiz:
+## Analysis:
+FAILED
+
+
+# Experiment 9: stablelm-standard-data
+## Time: 2024-02-03 22:29
+## Key parameters:
+ * LLM: stabilityai/stablelm-2-zephyr-1_6b
+ * VT: openai/clip-vit-large-patch14-336
+ * CM: MLP
+ * LoRA: No
+ * Unlock ViT From: No
+ * pretrain lr&batch size: 1e-3 256
+ * finetune lr&batch size: 2e-5 128
+ * data: standard llava-1.5
+ * data type: bf16
+## Training strategy:
+Pretraining: same as LLaVA-1.5.
+Finetuning: same as LLaVA-1.5.
+## Results:
+ * GQA: 58.86
+ * SQA: 62.82
+ * TextVQA: 49.52
+ * VQAv2: 74.9
+ * VizWiz:
+ * MMVet: 25.0
+ * POPE: adversarial: 0.840 random: 0.872 popular: 0.863
+## Analysis:
+
+
+# Experiment 10: stablelm-sharegpt4v-unlock-vit-from-12
+## Time: 2024-02-04 14:02
+## Key parameters:
+ * LLM: stabilityai/stablelm-2-zephyr-1_6b
+ * VT: openai/clip-vit-large-patch14-336
+ * CM: MLP
+ * LoRA: No
+ * Unlock ViT From: 12
+ * pretrain lr&batch size: 2e-5 256
+ * finetune lr&batch size: 2e-5 128
+ * data: sharegpt4v
+ * data type: bf16
+## Training strategy:
+Pretraining: same as ShareGPT4V.
+Finetuning: same as ShareGPT4V.
+## Results:
+ * GQA: 60.26
+ * SQA: 63.06
+ * TextVQA: 51.6
+ * VQAv2: 76.34
+ * VizWiz: 36.34
+ * MMVet: 29.3
+ * POPE: adversarial: 0.844 random: 0.864 popular: 0.855
+## Analysis:
+
+# Experiment 11: stablelm-sharegpt4v
+## Time: 2024-02-06 16:30
+## Key parameters:
+ * LLM: stabilityai/stablelm-2-zephyr-1_6b
+ * VT: openai/clip-vit-large-patch14-336
+ * CM: MLP
+ * LoRA: No
+ * Unlock ViT From: No
+ * pretrain lr&batch size: 1e-3 256
+ * finetune lr&batch size: 2e-5 128
+ * data: sharegpt4v
+ * data type: bf16
+## Training strategy:
+Pretraining: same as LLaVA-1.5.
+Finetuning: same as ShareGPT4V.
+## Results:
+ * GQA: 59.67
+ * SQA: 63.41
+ * TextVQA: 50.38
+ * VQAv2: 75.89
+ * VizWiz:
+ * MMVet: 27.4
+ * POPE: adversarial: 0.847 random: 0.878 popular: 0.869
+## Analysis:
+
+
+# Experiment 12: minicpm-standard-data
+## Time: 2024-02-09 14:45
+## Key parameters:
+ * LLM: openbmb/MiniCPM-2B-dpo-bf16
+ * VT: openai/clip-vit-large-patch14-336
+ * CM: MLP
+ * LoRA: No
+ * Unlock ViT From: No
+ * pretrain lr&batch size: 1e-3 256
+ * finetune lr&batch size: 2e-5 128
+ * data: standard-data
+ * data type: bf16
+## Training strategy:
+Pretraining: same as LLaVA-1.5.
+Finetuning: same as LLaVA-1.5.
+## Results:
+ * GQA:
+ * SQA:
+ * TextVQA:
+ * VQAv2:
+ * VizWiz:
+## Analysis:
+FAILED
+
+# Experiment 13: minicpm-sharegpt4v-unlock-vit-from-12
+## Time: 2024-02-09 14:45
+## Key parameters:
+ * LLM: openbmb/MiniCPM-2B-dpo-bf16
+ * VT: openai/clip-vit-large-patch14-336
+ * CM: MLP
+ * LoRA: No
+ * Unlock ViT From: 12
+ * pretrain lr&batch size: 1e-3 256
+ * finetune lr&batch size: 2e-5 128
+ * data: sharegpt4v
+ * data type: bf16
+## Training strategy:
+Pretraining: same as ShareGPT4V.
+Finetuning: same as ShareGPT4V.
+## Results:
+ * GQA:
+ * SQA:
+ * TextVQA:
+ * VQAv2:
+ * VizWiz:
+## Analysis:
+FAILED
+
+# Experiment 14: tinyllama-standard-data-siglip
+## Time: 2024-02-09 14:45
+## Key parameters:
+ * LLM: TinyLlama/TinyLlama-1.1B-Chat-v1.0
+ * VT: google/siglip-so400m-patch14-384
+ * CM: MLP
+ * LoRA: No
+ * Unlock ViT From: None
+ * pretrain lr&batch size: 1e-3 256
+ * finetune lr&batch size: 2e-5 128
+ * data: LLaVA-1.5
+ * data type: fp16
+## Training strategy:
+Pretraining: LLaVA-1.5.
+Finetuning: LLaVA-1.5.
+## Results:
+ * GQA: 58.63
+ * SQA: 60.24
+ * TextVQA: 49.06
+ * VQAv2: 75.8
+ * VizWiz:
+ * MMVet: 24.1
+ * POPE: adversarial: 0.847 random: 0.875 popular: 0.862
+## Analysis:
+This experiment replaces CLIP with SigLIP, which has 729 visual tokens (384 resolution). Results seem to improve? The other benchmarks still need checking. We now have to decide which language and vision models are worth further experiments;
+I think they should be TinyLlama, StableLM, and Phi, but Phi has never trained successfully. For efficiency, the four TinyLlama and StableLM versions should be trained first, which is expected to take about 24 hours.
+
+# Experiment 15: phi-standard-data-siglip-lora
+## Time: 2024-02-09 14:45
+## Key parameters:
+ * LLM: microsoft/phi-2
+ * VT: google/siglip-so400m-patch14-384
+ * CM: MLP
+ * LoRA: Yes
+ * Unlock ViT From: None
+ * pretrain lr&batch size: 1e-3 256
+ * finetune lr&batch size: 2e-4 128 LoRA & 2e-5 mlp
+ * data: LLaVA-1.5
+ * data type: fp16
+## Training strategy:
+Pretraining: LLaVA-1.5.
+Finetuning: LLaVA-1.5.
+## Results:
+ * GQA: 58.64
+ * SQA: 67.13
+ * TextVQA: 49.96
+ * VQAv2:
+ * VizWiz:
+ * MMVet:
+ * POPE:
+## Analysis:
+
+
+# Experiment 16: stablelm-standard-data-siglip
+## Time: 2024-02-13 10:02
+## Key parameters:
+ * LLM: stabilityai/stablelm-2-zephyr-1_6b
+ * VT: google/siglip-so400m-patch14-384
+ * CM: MLP
+ * LoRA: No
+ * Unlock ViT From: None
+ * pretrain lr&batch size: 1e-3 256
+ * finetune lr&batch size: 2e-5 128
+ * data: LLaVA-1.5
+ * data type: bf16
+## Training strategy:
+Pretraining: LLaVA-1.5.
+Finetuning: LLaVA-1.5.
+## Results:
+ * GQA: 61.13
+ * SQA: 62.77
+ * TextVQA: 54.09
+ * VQAv2: 78.14
+ * VizWiz:
+ * MMVet: 29.5
+ * POPE: adversarial: 0.853 random: 0.880 popular: 0.874
+## Analysis:
+
+
+# Experiment 17: stablelm-sharegpt4v-unlock-vit-from-12-siglip
+## Time: 2024-02-13 10:02
+## Key parameters:
+ * LLM: stabilityai/stablelm-2-zephyr-1_6b
+ * VT: google/siglip-so400m-patch14-384
+ * CM: MLP
+ * LoRA: No
+ * Unlock ViT From: 12
+ * pretrain lr&batch size: 1e-3 256
+ * finetune lr&batch size: 2e-5 128
+ * data: ShareGPT4V
+ * data type: bf16
+## Training strategy:
+Pretraining: ShareGPT4V.
+Finetuning: ShareGPT4V.
+## Results:
+ * GQA: 61.93
+ * SQA: 64.70
+ * TextVQA: 56.39
+ * VQAv2: 78.91
+ * VizWiz:
+ * MMVet: 32.6
+ * POPE: adversarial: 0.851 random: 0.878 popular: 0.867
+## Analysis:
+
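+For reference, the SigLIP runs above differ from the CLIP runs only in the vision-tower argument handed to the shared training scripts. A minimal launch sketch (the VERSION tag "v1-siglip" is an illustrative name, not one used in these runs; the data paths are the ones used by the baseline scripts):
+
+```bash
+#!/bin/bash
+# Illustrative TinyLlama + SigLIP run through the scripts added in this patch.
+LLM=TinyLlama/TinyLlama-1.1B-Chat-v1.0
+VT=google/siglip-so400m-patch14-384        # any CLIP/SigLIP checkpoint can be swapped in here
+PRETRAIN_DATA=/root/autodl-tmp/data/llava/blip_laion_cc_sbu_558k.json
+PRETRAIN_IMAGES=/root/autodl-tmp/data/llava/llava_pretrain/images
+FINETUNE_DATA=/root/autodl-tmp/data/text_files/llava_v1_5_mix665k.json
+FINETUNE_IMAGES=/root/autodl-tmp/data
+TAG=v1-siglip                              # hypothetical checkpoint tag (VERSION argument)
+
+bash scripts/tiny_llava/pretrain.sh "$LLM" "$VT" "$PRETRAIN_DATA" "$PRETRAIN_IMAGES" "$TAG"
+bash scripts/tiny_llava/finetune.sh "$LLM" "$VT" "$FINETUNE_DATA" "$FINETUNE_IMAGES" "$TAG"
+```
+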
+# Experiment 18: tinyllama-sharegpt4v-unlock-vit-from-12-siglip
+## Time: 2024-02-09 14:45
+## Key parameters:
+ * LLM: TinyLlama/TinyLlama-1.1B-Chat-v1.0
+ * VT: google/siglip-so400m-patch14-384
+ * CM: MLP
+ * LoRA: No
+ * Unlock ViT From: 12
+ * pretrain lr&batch size: 1e-3 256
+ * finetune lr&batch size: 2e-5 128
+ * data: ShareGPT4V
+ * data type: fp16
+## Training strategy:
+Pretraining: ShareGPT4V.
+Finetuning: ShareGPT4V.
+## Results:
+ * GQA: 60.25
+ * SQA: 60.14
+ * TextVQA: 51.68
+ * VQAv2: 76.89
+ * VizWiz:
+ * MMVet: 25.8
+ * POPE: adversarial: 0.847 random: 0.875 popular: 0.862
+## Analysis:
+
+# Experiment 19: phi-standard-data-siglip
+## Time: 2024-02-15 14:45
+## Key parameters:
+ * LLM: microsoft/phi-2
+ * VT: google/siglip-so400m-patch14-384
+ * CM: MLP
+ * LoRA: No
+ * Unlock ViT From: None
+ * pretrain lr&batch size: 1e-3 256
+ * finetune lr&batch size: 2e-5 128
+ * data: LLaVA-1.5
+ * data type: fp16
+## Training strategy:
+Pretraining: LLaVA-1.5.
+Finetuning: LLaVA-1.5.
+## Results:
+ * GQA: 61.34
+ * SQA: 69.91
+ * TextVQA: 55.64
+ * VQAv2: 79.2
+ * VizWiz: 38.45
+ * MMVet: 32.1
+ * POPE: adversarial: 0.857 random: 0.885 popular: 0.871
+ * LLaVAW: 67.9
+## Analysis:
+
+# Experiment 20: phi-sharegpt4v-unlock-vit-from-12-siglip
+## Time: 2024-02-16 14:45
+## Key parameters:
+ * LLM: microsoft/phi-2
+ * VT: google/siglip-so400m-patch14-384
+ * CM: MLP
+ * LoRA: No
+ * Unlock ViT From: 12
+ * pretrain lr&batch size: 1e-3 256
+ * finetune lr&batch size: 2e-5 128
+ * data: ShareGPT4V
+ * data type: fp16
+## Training strategy:
+Pretraining: ShareGPT4V.
+Finetuning: ShareGPT4V.
+## Results:
+ * GQA: 61.97
+ * SQA: 69.06
+ * TextVQA: 59.13
+ * VQAv2: 79.93
+ * VizWiz: 34.42? weird
+ * MMVet: 32.0
+ * POPE: adversarial: 0.856 random: 0.873 popular: 0.863
+ * LLaVAW: 75.8
+## Analysis:
+
+# Experiment 21: phi-standard-data
+## Time: 2024-02-17 14:45
+## Key parameters:
+ * LLM: microsoft/phi-2
+ * VT: openai/clip-vit-large-patch14-336
+ * CM: MLP
+ * LoRA: No
+ * Unlock ViT From: No
+ * pretrain lr&batch size: 1e-3 256
+ * finetune lr&batch size: 2e-5 128
+ * data: LLaVA-1.5
+ * data type: fp16
+## Training strategy:
+Pretraining: LLaVA-1.5.
+Finetuning: LLaVA-1.5.
+## Results:
+ * GQA:
+ * SQA:
+ * TextVQA:
+ * VQAv2:
+ * VizWiz:
+## Analysis:
+
+
+# Experiment 22: phi-sharegpt4v-unlock-vit-from-12
+## Time: 2024-02-17 14:45
+## Key parameters:
+ * LLM: microsoft/phi-2
+ * VT: openai/clip-vit-large-patch14-336
+ * CM: MLP
+ * LoRA: No
+ * Unlock ViT From: 12
+ * pretrain lr&batch size: 1e-3 256
+ * finetune lr&batch size: 2e-5 128
+ * data: ShareGPT4V
+ * data type: fp16
+## Training strategy:
+Pretraining: ShareGPT4V.
+Finetuning: ShareGPT4V.
+## Results:
+ * GQA:
+ * SQA:
+ * TextVQA:
+ * VQAv2:
+ * VizWiz:
+## Analysis:
+
+
diff --git a/scripts/tiny_llava/LOGS/REPORT.md b/scripts/tiny_llava/LOGS/REPORT.md
new file mode 100644
index 0000000..b47ae4e
--- /dev/null
+++ b/scripts/tiny_llava/LOGS/REPORT.md
@@ -0,0 +1,12 @@
+# 1. Effect of data on model performance
+We trained v1 and v1.1 versions; the main difference is that v1 uses the data released with LLaVA-1.5, while v1.1 uses the ShareGPT4V data.
+The ShareGPT4V data is of higher quality than the LLaVA-1.5 data, and it therefore yields better results.
+
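+In script terms, switching from v1 to v1.1 only changes which annotation JSONs are passed to the training scripts; the entry points stay the same. A minimal sketch (the ShareGPT4V file names are the ones referenced, commented out, in train_baselines_phi_jia.sh; exact paths are environment-specific):
+
+```bash
+#!/bin/bash
+# v1 runs: LLaVA-1.5 data
+V1_PRETRAIN_DATA=/root/autodl-tmp/data/llava/blip_laion_cc_sbu_558k.json
+V1_FINETUNE_DATA=/root/autodl-tmp/data/text_files/llava_v1_5_mix665k.json
+
+# v1.1 runs: ShareGPT4V data (same scripts, only DATA_PATH / FINETUNE_DATA_PATH change)
+V11_PRETRAIN_DATA=/root/autodl-tmp/data/text_files/really_cleaned_share-captioner_coco_lcs_sam_1246k_1107.json
+V11_FINETUNE_DATA=/root/autodl-tmp/data/text_files/cleaned_sharegpt4v_mix665k_cap23k_coco-ap9k_lcs3k_sam9k_div2k.json
+# These values are passed as the DATA_PATH argument of scripts/tiny_llava/pretrain.sh
+# and finetune.sh respectively.
+```
+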
+# 2. Effect of the intermediate connector on model performance
+We tried two connectors, MLP and Resampler; this part mainly compares the two implementations and their results.
+
+# 3. Effect of finetuning different parts of the model
+This part mainly investigates: whether to unfreeze CLIP for training, which layers LoRA should finetune, and whether the LLM is trained fully or only in specific layers (e.g. the MLP); it describes the corresponding ablation studies.
+
+# 4. To explore if time permits
+If time allows, explore MoE and dynamically adjusting the number of visual tokens.
diff --git a/scripts/tiny_llava/LOGS/organized_log.md b/scripts/tiny_llava/LOGS/organized_log.md
new file mode 100644
index 0000000..494328f
--- /dev/null
+++ b/scripts/tiny_llava/LOGS/organized_log.md
@@ -0,0 +1,215 @@
+# V1
+# Experiment 1: tinyllama-standard-data
+## Time: 2024-02-01 13:37
+## Key parameters:
+ * LLM: TinyLlama/TinyLlama-1.1B-Chat-v1.0
+ * VT: openai/clip-vit-large-patch14-336
+ * CM: MLP
+ * LoRA: No
+ * Unlock ViT From: No
+ * pretrain lr&batch size: 1e-3 256
+ * finetune lr&batch size: 2e-5 128
+ * data: standard llava-1.5
+ * data type: fp16
+## Training strategy:
+Pretraining: same as LLaVA.
+Finetuning: same as LLaVA.
+## Results:
+ * GQA: 58.05
+ * SQA: 60.24
+ * TextVQA: 45.83
+ * VQAv2:
+ * VizWiz:
+## Analysis:
+This is only a reproduction of the baseline after the transformers upgrade, so nothing should change. However, the tokenizers upgrade introduces a mismatch; after updating train.py (use_fast=True, or the LLaVA-1.6 patch) the upgrade is compatible.
+The TextVQA score dropped slightly.
+
+# Experiment 2: stablelm-standard-data
+## Time: 2024-02-03 22:29
+## Key parameters:
+ * LLM: stabilityai/stablelm-2-zephyr-1_6b
+ * VT: openai/clip-vit-large-patch14-336
+ * CM: MLP
+ * LoRA: No
+ * Unlock ViT From: No
+ * pretrain lr&batch size: 1e-3 256
+ * finetune lr&batch size: 2e-5 128
+ * data: standard llava-1.5
+ * data type: bf16
+## Training strategy:
+Pretraining: same as LLaVA-1.5.
+Finetuning: same as LLaVA-1.5.
+## Results:
+ * GQA: 58.86
+ * SQA: 62.82
+ * TextVQA: 49.52
+ * VQAv2: 74.9
+ * VizWiz:
+## Analysis:
+
+# Experiment 3: tinyllama-standard-data-siglip
+## Time: 2024-02-09 14:45
+## Key parameters:
+ * LLM: TinyLlama/TinyLlama-1.1B-Chat-v1.0
+ * VT: google/siglip-so400m-patch14-384
+ * CM: MLP
+ * LoRA: No
+ * Unlock ViT From: None
+ * pretrain lr&batch size: 1e-3 256
+ * finetune lr&batch size: 2e-5 128
+ * data: LLaVA-1.5
+ * data type: fp16
+## Training strategy:
+Pretraining: LLaVA-1.5.
+Finetuning: LLaVA-1.5.
+## Results:
+ * GQA: 58.63
+ * SQA: 60.24
+ * TextVQA: 49.06
+ * VQAv2:
+ * VizWiz:
+## Analysis:
+This experiment replaces CLIP with SigLIP, which has 729 visual tokens (384 resolution). Results seem to improve? The other benchmarks still need checking. We now have to decide which language and vision models are worth further experiments;
+I think they should be TinyLlama, StableLM, and Phi, but Phi has never trained successfully. For efficiency, the four TinyLlama and StableLM versions should be trained first, which is expected to take about 24 hours.
+
+# Experiment 4: stablelm-standard-data-siglip
+## Time: 2024-02-13 10:02
+## Key parameters:
+ * LLM: stabilityai/stablelm-2-zephyr-1_6b
+ * VT: google/siglip-so400m-patch14-384
+ * CM: MLP
+ * LoRA: No
+ * Unlock ViT From: None
+ * pretrain lr&batch size: 1e-3 256
+ * finetune lr&batch size: 2e-5 128
+ * data: LLaVA-1.5
+ * data type: bf16
+## Training strategy:
+Pretraining: LLaVA-1.5.
+Finetuning: LLaVA-1.5.
+## Results:
+ * GQA: 61.13
+ * SQA: 62.77
+ * TextVQA: 54.09
+ * VQAv2:
+ * VizWiz:
+## Analysis:
+
+# Experiment 5: phi-standard-data-siglip
+## Time: 2024-02-09 14:45
+## Key parameters:
+ * LLM: microsoft/phi-2
+ * VT: google/siglip-so400m-patch14-384
+ * CM: MLP
+ * LoRA: Yes
+ * Unlock ViT From: None
+ * pretrain lr&batch size: 1e-3 256
+ * finetune lr&batch size: 2e-4 128 LoRA & 2e-5 mlp
+ * data: LLaVA-1.5
+ * data type: fp16
+## Training strategy:
+Pretraining: LLaVA-1.5.
+Finetuning: LLaVA-1.5.
+## Results:
+ * GQA: 58.64
+ * SQA: 67.13
+ * TextVQA: 49.96
+ * VQAv2:
+ * VizWiz:
+## Analysis:
+
+# V1.1
+# Experiment 6: tinyllama-sharegpt4v-unlock-vit-from-12
+## Time: 2024-02-02 20:00
+## Key parameters:
+ * LLM: TinyLlama/TinyLlama-1.1B-Chat-v1.0
+ * VT: openai/clip-vit-large-patch14-336
+ * CM: MLP
+ * LoRA: No
+ * Unlock ViT From: 12
+ * mm_mlp_pretrain: standard-llava-transformers-4.36.1's pretrain mlp
+ * pretrain lr&batch size: 2e-5 256 (2 gradient accumulation steps)
+ * finetune lr&batch size: 2e-5 128
+ * data: sharegpt4v
+ * data type: fp16
+## Training strategy:
+Pretraining: the MLP is initialized from standard-llava-transformers-4.36.1 and aligned on the sharegpt4v pretrain data.
+Finetuning: same as ShareGPT4V.
+## Results:
+ * GQA: 59.43
+ * SQA: 58.80
+ * TextVQA: 48.05
+ * VQAv2: 75.24
+ * VizWiz: 34.74
+## Analysis:
+The ShareGPT4V paper claims that unlocking the ViT from layer 12 gives the best results; this experiment verifies that claim.
+
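+For reference, the benchmark numbers in these tables come from the eval scripts added under scripts/tiny_llava/eval/; for a fully finetuned (non-LoRA) checkpoint the *_v1.sh variants are the relevant ones. A typical sequence (single GPU shown; gqa_v1.sh shards over whatever CUDA_VISIBLE_DEVICES lists, and each script hard-codes the MODEL_PATH it evaluates):
+
+```bash
+#!/bin/bash
+# Run the main benchmarks for a finetuned checkpoint.
+export CUDA_VISIBLE_DEVICES=0
+bash scripts/tiny_llava/eval/gqa_v1.sh
+bash scripts/tiny_llava/eval/sqa_v1.sh
+bash scripts/tiny_llava/eval/textvqa_v1.sh
+bash scripts/tiny_llava/eval/pope_v1.sh
+bash scripts/tiny_llava/eval/vizwiz_v1.sh
+```
+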
+# Experiment 7: stablelm-sharegpt4v-unlock-vit-from-12
+## Time: 2024-02-04 14:02
+## Key parameters:
+ * LLM: stabilityai/stablelm-2-zephyr-1_6b
+ * VT: openai/clip-vit-large-patch14-336
+ * CM: MLP
+ * LoRA: No
+ * Unlock ViT From: 12
+ * pretrain lr&batch size: 2e-5 256
+ * finetune lr&batch size: 2e-5 128
+ * data: sharegpt4v
+ * data type: bf16
+## Training strategy:
+Pretraining: same as ShareGPT4V.
+Finetuning: same as ShareGPT4V.
+## Results:
+ * GQA: 60.26
+ * SQA: 63.06
+ * TextVQA: 51.6
+ * VQAv2: 76.34
+ * VizWiz: 36.34
+## Analysis:
+
+# Experiment 8: tinyllama-sharegpt4v-unlock-vit-from-12-siglip
+## Time: 2024-02-09 14:45
+## Key parameters:
+ * LLM: TinyLlama/TinyLlama-1.1B-Chat-v1.0
+ * VT: google/siglip-so400m-patch14-384
+ * CM: MLP
+ * LoRA: No
+ * Unlock ViT From: 12
+ * pretrain lr&batch size: 1e-3 256
+ * finetune lr&batch size: 2e-5 128
+ * data: ShareGPT4V
+ * data type: fp16
+## Training strategy:
+Pretraining: ShareGPT4V.
+Finetuning: ShareGPT4V.
+## Results:
+ * GQA: 60.25
+ * SQA: 60.14
+ * TextVQA: 51.68
+ * VQAv2:
+ * VizWiz:
+## Analysis:
+
+# Experiment 9: stablelm-sharegpt4v-unlock-vit-from-12-siglip
+## Time: 2024-02-13 10:02
+## Key parameters:
+ * LLM: stabilityai/stablelm-2-zephyr-1_6b
+ * VT: google/siglip-so400m-patch14-384
+ * CM: MLP
+ * LoRA: No
+ * Unlock ViT From: 12
+ * pretrain lr&batch size: 1e-3 256
+ * finetune lr&batch size: 2e-5 128
+ * data: ShareGPT4V
+ * data type: bf16
+## Training strategy:
+Pretraining: ShareGPT4V.
+Finetuning: ShareGPT4V.
+## Results:
+ * GQA: 61.93
+ * SQA: 64.70
+ * TextVQA: 56.39
+ * VQAv2:
+ * VizWiz:
+## Analysis:
diff --git a/scripts/tiny_llava/docs/LOG.md b/scripts/tiny_llava/docs/LOG.md
new file mode 100644
index 0000000..e69de29
diff --git a/scripts/tiny_llava/eval/.ipynb_checkpoints/gqa-checkpoint.sh b/scripts/tiny_llava/eval/.ipynb_checkpoints/gqa-checkpoint.sh
new file mode 100644
index 0000000..9777ba8
--- /dev/null
+++ b/scripts/tiny_llava/eval/.ipynb_checkpoints/gqa-checkpoint.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+
+gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
+IFS=',' read -ra GPULIST <<< "$gpu_list"
+
+CHUNKS=${#GPULIST[@]}
+
+SPLIT="llava_gqa_testdev_balanced"
+GQADIR="/root/autodl-tmp/data/eval/gqa/"
+
+LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0
+LLM_VARIANT="${LLM_VERSION#*/}"
+
+VT_VERSION=openai/clip-vit-base-patch32
+VT_VARIANT="${VT_VERSION#*/}"
+
+MODEL_PATH="./checkpoints/tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}-finetune-lora"
+MODEL_BASE="./checkpoints/tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}-pretrain"
+MODEL_NAME="tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}"
+EVAL_DIR="/root/autodl-tmp/data/eval"
+
+for IDX in $(seq 0 $((CHUNKS-1))); do
+    CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \
+        --model-path $MODEL_PATH \
+        --question-file $EVAL_DIR/gqa/$SPLIT.jsonl \
+        --image-folder $EVAL_DIR/gqa/images \
+        --answers-file $EVAL_DIR/gqa/answers/$SPLIT/$MODEL_NAME/${CHUNKS}_${IDX}.jsonl \
+        --num-chunks $CHUNKS \
+        --chunk-idx $IDX \
+        --temperature 0 \
+        --model-base $MODEL_BASE \
+        --conv-mode tiny_llama &
+done
+
+wait
+
+output_file=$EVAL_DIR/gqa/answers/$SPLIT/$MODEL_NAME/merge.jsonl
+
+# Clear out the output file if it exists.
+> "$output_file"
+
+# Loop through the indices and concatenate each file.
+for IDX in $(seq 0 $((CHUNKS-1))); do + cat $EVAL_DIR/gqa/answers/$SPLIT/$MODEL_NAME/${CHUNKS}_${IDX}.jsonl >> "$output_file" +done + +python scripts/convert_gqa_for_eval.py --src $output_file --dst $GQADIR/testdev_balanced_predictions.json + +cd $GQADIR +python eval/eval.py --tier testdev_balanced diff --git a/scripts/tiny_llava/eval/.ipynb_checkpoints/gqa_v1-checkpoint.sh b/scripts/tiny_llava/eval/.ipynb_checkpoints/gqa_v1-checkpoint.sh new file mode 100644 index 0000000..3425efc --- /dev/null +++ b/scripts/tiny_llava/eval/.ipynb_checkpoints/gqa_v1-checkpoint.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +gpu_list="${CUDA_VISIBLE_DEVICES:-0}" +IFS=',' read -ra GPULIST <<< "$gpu_list" + +CHUNKS=${#GPULIST[@]} + +SPLIT="llava_gqa_testdev_balanced" +GQADIR="/root/autodl-tmp/data/eval/gqa/" + + +VT_VERSION=openai/clip-vit-large-patch14-336 +VT_VARIANT="${VT_VERSION#*/}" + +MODEL_PATH="./checkpoints/tiny-llava-v1-1.1B-${VT_VARIANT}-finetune" +MODEL_NAME="tiny-llava-v1-1.1B-${VT_VARIANT}" +EVAL_DIR="/root/autodl-tmp/data/eval" + +for IDX in $(seq 0 $((CHUNKS-1))); do + CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ + --model-path $MODEL_PATH \ + --question-file $EVAL_DIR/gqa/$SPLIT.jsonl \ + --image-folder $EVAL_DIR/gqa/images \ + --answers-file $EVAL_DIR/gqa/answers/$SPLIT/$MODEL_NAME/${CHUNKS}_${IDX}.jsonl \ + --num-chunks $CHUNKS \ + --chunk-idx $IDX \ + --temperature 0 \ + --conv-mode v1 & +done + +wait + +output_file=$EVAL_DIR/gqa/answers/$SPLIT/$MODEL_NAME/merge.jsonl + +# Clear out the output file if it exists. +> "$output_file" + +# Loop through the indices and concatenate each file. +for IDX in $(seq 0 $((CHUNKS-1))); do + cat $EVAL_DIR/gqa/answers/$SPLIT/$MODEL_NAME/${CHUNKS}_${IDX}.jsonl >> "$output_file" +done + +python scripts/convert_gqa_for_eval.py --src $output_file --dst $GQADIR/testdev_balanced_predictions.json + +cd $GQADIR +python eval/eval.py --tier testdev_balanced diff --git a/scripts/tiny_llava/eval/.ipynb_checkpoints/mmbench_cn-checkpoint.sh b/scripts/tiny_llava/eval/.ipynb_checkpoints/mmbench_cn-checkpoint.sh new file mode 100644 index 0000000..43e96eb --- /dev/null +++ b/scripts/tiny_llava/eval/.ipynb_checkpoints/mmbench_cn-checkpoint.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +SPLIT="mmbench_dev_cn_20231003" + +MODEL_PATH="./checkpoints/tiny-llava-v1-1.1B-clip-vit-large-patch14-336-finetune" +MODEL_NAME="tiny-llava-v1-1.1b" +EVAL_DIR="/root/autodl-tmp/data/eval" + +python -m llava.eval.model_vqa_mmbench \ + --model-path $MODEL_PATH \ + --question-file $EVAL_DIR/mmbench_cn/$SPLIT.tsv \ + --answers-file $EVAL_DIR/mmbench_cn/answers/$SPLIT/$MODEL_NAME.jsonl \ + --lang cn \ + --single-pred-prompt \ + --temperature 0 \ + --conv-mode vicuna_v1 + +mkdir -p $EVAL_DIR/mmbench/answers_upload/$SPLIT + +python scripts/convert_mmbench_for_submission.py \ + --annotation-file $EVAL_DIR/mmbench_cn/$SPLIT.tsv \ + --result-dir $EVAL_DIR/mmbench_cn/answers/$SPLIT \ + --upload-dir $EVAL_DIR/mmbench_cn/answers_upload/$SPLIT \ + --experiment $MODEL_NAME diff --git a/scripts/tiny_llava/eval/.ipynb_checkpoints/pope-checkpoint.sh b/scripts/tiny_llava/eval/.ipynb_checkpoints/pope-checkpoint.sh new file mode 100644 index 0000000..c012cde --- /dev/null +++ b/scripts/tiny_llava/eval/.ipynb_checkpoints/pope-checkpoint.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0 +LLM_VARIANT="${LLM_VERSION#*/}" + +VT_VERSION=openai/clip-vit-large-patch14-336 +VT_VARIANT="${VT_VERSION#*/}" + + 
+MODEL_PATH="./checkpoints/tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}-finetune-lora" +MODEL_NAME="tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}" +MODEL_BASE="./checkpoints/tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}-pretrain" +EVAL_DIR="/root/autodl-tmp/data/eval" + +python -m llava.eval.model_vqa_loader \ + --model-path $MODEL_PATH \ + --question-file $EVAL_DIR/pope/llava_pope_test.jsonl \ + --image-folder $EVAL_DIR/pope/val2014 \ + --answers-file $EVAL_DIR/pope/answers/$MODEL_NAME.jsonl \ + --temperature 0 \ + --model-base $MODEL_BASE \ + --conv-mode tiny_llama + +python llava/eval/eval_pope.py \ + --annotation-dir $EVAL_DIR/pope/coco \ + --question-file $EVAL_DIR/pope/llava_pope_test.jsonl \ + --result-file $EVAL_DIR/pope/answers/$MODEL_NAME.jsonl diff --git a/scripts/tiny_llava/eval/.ipynb_checkpoints/pope_v1-checkpoint.sh b/scripts/tiny_llava/eval/.ipynb_checkpoints/pope_v1-checkpoint.sh new file mode 100644 index 0000000..f45a3e6 --- /dev/null +++ b/scripts/tiny_llava/eval/.ipynb_checkpoints/pope_v1-checkpoint.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0 +LLM_VARIANT="${LLM_VERSION#*/}" + +VT_VERSION=openai/clip-vit-base-patch16 +VT_VARIANT="${VT_VERSION#*/}" + + +MODEL_PATH="./checkpoints/tiny-llava-v1-1.1B-${VT_VARIANT}-finetune" +MODEL_NAME="tiny-llava-v1-1.1B-${VT_VARIANT}" +EVAL_DIR="/root/autodl-tmp/data/eval" + +python -m llava.eval.model_vqa_loader \ + --model-path $MODEL_PATH \ + --question-file $EVAL_DIR/pope/llava_pope_test.jsonl \ + --image-folder $EVAL_DIR/pope/val2014 \ + --answers-file $EVAL_DIR/pope/answers/$MODEL_NAME.jsonl \ + --temperature 0 \ + --conv-mode v1 + +python llava/eval/eval_pope.py \ + --annotation-dir $EVAL_DIR/pope/coco \ + --question-file $EVAL_DIR/pope/llava_pope_test.jsonl \ + --result-file $EVAL_DIR/pope/answers/$MODEL_NAME.jsonl diff --git a/scripts/tiny_llava/eval/.ipynb_checkpoints/sqa-checkpoint.sh b/scripts/tiny_llava/eval/.ipynb_checkpoints/sqa-checkpoint.sh new file mode 100644 index 0000000..fb49171 --- /dev/null +++ b/scripts/tiny_llava/eval/.ipynb_checkpoints/sqa-checkpoint.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0 +LLM_VARIANT="${LLM_VERSION#*/}" + +VT_VERSION=openai/clip-vit-large-patch14-336 +VT_VARIANT="${VT_VERSION#*/}" + +MODEL_PATH="./checkpoints/tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}-finetune-lora" +MODEL_NAME="tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}" +MODEL_BASE="./checkpoints/tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}-pretrain" +EVAL_DIR="/root/autodl-tmp/data/eval" + +python -m llava.eval.model_vqa_science \ + --model-path $MODEL_PATH \ + --question-file $EVAL_DIR/scienceqa/llava_test_CQM-A.json \ + --image-folder $EVAL_DIR/scienceqa/images/test \ + --answers-file $EVAL_DIR/scienceqa/answers/$MODEL_NAME.jsonl \ + --single-pred-prompt \ + --temperature 0 \ + --model-base $MODEL_BASE \ + --conv-mode tiny_llama + +python llava/eval/eval_science_qa.py \ + --base-dir $EVAL_DIR/scienceqa \ + --result-file $EVAL_DIR/scienceqa/answers/$MODEL_NAME.jsonl \ + --output-file $EVAL_DIR/scienceqa/answers/"$MODEL_NAME"_output.jsonl \ + --output-result $EVAL_DIR/scienceqa/answers/"$MODEL_NAME"_result.json + diff --git a/scripts/tiny_llava/eval/.ipynb_checkpoints/sqa_v1-checkpoint.sh b/scripts/tiny_llava/eval/.ipynb_checkpoints/sqa_v1-checkpoint.sh new file mode 100644 index 0000000..ca329b9 --- /dev/null +++ b/scripts/tiny_llava/eval/.ipynb_checkpoints/sqa_v1-checkpoint.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0 
+LLM_VARIANT="${LLM_VERSION#*/}" + +VT_VERSION=openai/clip-vit-base-patch16 +VT_VARIANT="${VT_VERSION#*/}" + + +MODEL_PATH="./checkpoints/tiny-llava-v1-1.1B-${VT_VARIANT}-finetune" +MODEL_NAME="tiny-llava-v1-1.1B-${VT_VARIANT}" +EVAL_DIR="/root/autodl-tmp/data/eval" + +python -m llava.eval.model_vqa_science \ + --model-path $MODEL_PATH \ + --question-file $EVAL_DIR/scienceqa/llava_test_CQM-A.json \ + --image-folder $EVAL_DIR/scienceqa/images/test \ + --answers-file $EVAL_DIR/scienceqa/answers/$MODEL_NAME.jsonl \ + --single-pred-prompt \ + --temperature 0 \ + --conv-mode v1 + +python llava/eval/eval_science_qa.py \ + --base-dir $EVAL_DIR/scienceqa \ + --result-file $EVAL_DIR/scienceqa/answers/$MODEL_NAME.jsonl \ + --output-file $EVAL_DIR/scienceqa/answers/"$MODEL_NAME"_output.jsonl \ + --output-result $EVAL_DIR/scienceqa/answers/"$MODEL_NAME"_result.json + diff --git a/scripts/tiny_llava/eval/.ipynb_checkpoints/textvqa-checkpoint.sh b/scripts/tiny_llava/eval/.ipynb_checkpoints/textvqa-checkpoint.sh new file mode 100644 index 0000000..5460c46 --- /dev/null +++ b/scripts/tiny_llava/eval/.ipynb_checkpoints/textvqa-checkpoint.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0 +LLM_VARIANT="${LLM_VERSION#*/}" + +VT_VERSION=openai/clip-vit-base-patch16 +VT_VARIANT="${VT_VERSION#*/}" + +MODEL_PATH="./checkpoints/tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}-finetune-lora" +MODEL_NAME="tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}" +MODEL_BASE="./checkpoints/tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}-pretrain" +EVAL_DIR="/root/autodl-tmp/data/eval" + +python -m llava.eval.model_vqa_loader \ + --model-path $MODEL_PATH \ + --question-file $EVAL_DIR/textvqa/llava_textvqa_val_v051_ocr.jsonl \ + --image-folder $EVAL_DIR/textvqa/train_images \ + --answers-file $EVAL_DIR/textvqa/answers/$MODEL_NAME.jsonl \ + --temperature 0 \ + --model-base $MODEL_BASE \ + --conv-mode tiny_llama + +python -m llava.eval.eval_textvqa \ + --annotation-file $EVAL_DIR/textvqa/TextVQA_0.5.1_val.json \ + --result-file $EVAL_DIR/textvqa/answers/$MODEL_NAME.jsonl diff --git a/scripts/tiny_llava/eval/.ipynb_checkpoints/textvqa_v1-checkpoint.sh b/scripts/tiny_llava/eval/.ipynb_checkpoints/textvqa_v1-checkpoint.sh new file mode 100644 index 0000000..9aa79c6 --- /dev/null +++ b/scripts/tiny_llava/eval/.ipynb_checkpoints/textvqa_v1-checkpoint.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0 +LLM_VARIANT="${LLM_VERSION#*/}" + +VT_VERSION=openai/clip-vit-large-patch14-336 +VT_VARIANT="${VT_VERSION#*/}" + +MODEL_PATH="./checkpoints/tiny-llava-v1-1.1B-${VT_VARIANT}-finetune" +MODEL_NAME="tiny-llava-v1-1.1B-${VT_VARIANT}" + +EVAL_DIR="/root/autodl-tmp/data/eval" + +python -m llava.eval.model_vqa_loader \ + --model-path $MODEL_PATH \ + --question-file $EVAL_DIR/textvqa/llava_textvqa_val_v051_ocr.jsonl \ + --image-folder $EVAL_DIR/textvqa/train_images \ + --answers-file $EVAL_DIR/textvqa/answers/$MODEL_NAME.jsonl \ + --temperature 0 \ + --conv-mode v1 + +python -m llava.eval.eval_textvqa \ + --annotation-file $EVAL_DIR/textvqa/TextVQA_0.5.1_val.json \ + --result-file $EVAL_DIR/textvqa/answers/$MODEL_NAME.jsonl diff --git a/scripts/tiny_llava/eval/.ipynb_checkpoints/vizwiz-checkpoint.sh b/scripts/tiny_llava/eval/.ipynb_checkpoints/vizwiz-checkpoint.sh new file mode 100644 index 0000000..dbe3282 --- /dev/null +++ b/scripts/tiny_llava/eval/.ipynb_checkpoints/vizwiz-checkpoint.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0 
+LLM_VARIANT="${LLM_VERSION#*/}" + +VT_VERSION=openai/clip-vit-large-patch14-336 +VT_VARIANT="${VT_VERSION#*/}" + +# MODEL_PATH="./checkpoints/tiny-llava-v1-${LLM_VARIANT}-${VT_VARIANT}-finetune" +# MODEL_NAME="tiny-llava-v1-1.1b-sharegpt4v" +# MODEL_BASE="./checkpoints/tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}-pretrain" +MODEL_PATH="./checkpoints/tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}-finetune-lora" +MODEL_BASE="./checkpoints/tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}-pretrain" +MODEL_NAME="tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}" +EVAL_DIR="/root/autodl-tmp/data/eval" + +python -m llava.eval.model_vqa_loader \ + --model-path $MODEL_PATH \ + --question-file $EVAL_DIR/vizwiz/llava_test.jsonl \ + --image-folder $EVAL_DIR/vizwiz/test \ + --answers-file $EVAL_DIR/vizwiz/answers/$MODEL_NAME.jsonl \ + --temperature 0 \ + --model-base $MODEL_BASE \ + --conv-mode tiny_llama + +python scripts/convert_vizwiz_for_submission.py \ + --annotation-file $EVAL_DIR/vizwiz/llava_test.jsonl \ + --result-file $EVAL_DIR/vizwiz/answers/$MODEL_NAME.jsonl \ + --result-upload-file $EVAL_DIR/vizwiz/answers_upload/$MODEL_NAME.json diff --git a/scripts/tiny_llava/eval/.ipynb_checkpoints/vizwiz_v1-checkpoint.sh b/scripts/tiny_llava/eval/.ipynb_checkpoints/vizwiz_v1-checkpoint.sh new file mode 100644 index 0000000..f7b075a --- /dev/null +++ b/scripts/tiny_llava/eval/.ipynb_checkpoints/vizwiz_v1-checkpoint.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0 +LLM_VARIANT="${LLM_VERSION#*/}" + +VT_VERSION=openai/clip-vit-large-patch14-336 +VT_VARIANT="${VT_VERSION#*/}" + +# MODEL_PATH="./checkpoints/tiny-llava-v1-${LLM_VARIANT}-${VT_VARIANT}-finetune" +# MODEL_NAME="tiny-llava-v1-1.1b-sharegpt4v" +# MODEL_BASE="./checkpoints/tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}-pretrain" +MODEL_PATH="./checkpoints/tiny-llava-v1-1.1B-${VT_VARIANT}-finetune" +MODEL_NAME="tiny-llava-v1-1.1B-${VT_VARIANT}" + +EVAL_DIR="/root/autodl-tmp/data/eval" + +python -m llava.eval.model_vqa_loader \ + --model-path $MODEL_PATH \ + --question-file $EVAL_DIR/vizwiz/llava_test.jsonl \ + --image-folder $EVAL_DIR/vizwiz/test \ + --answers-file $EVAL_DIR/vizwiz/answers/$MODEL_NAME.jsonl \ + --temperature 0 \ + --conv-mode v1 + +python scripts/convert_vizwiz_for_submission.py \ + --annotation-file $EVAL_DIR/vizwiz/llava_test.jsonl \ + --result-file $EVAL_DIR/vizwiz/answers/$MODEL_NAME.jsonl \ + --result-upload-file $EVAL_DIR/vizwiz/answers_upload/$MODEL_NAME.json diff --git a/scripts/tiny_llava/eval/.ipynb_checkpoints/vqav2-checkpoint.sh b/scripts/tiny_llava/eval/.ipynb_checkpoints/vqav2-checkpoint.sh new file mode 100644 index 0000000..d22101a --- /dev/null +++ b/scripts/tiny_llava/eval/.ipynb_checkpoints/vqav2-checkpoint.sh @@ -0,0 +1,56 @@ +#!/bin/bash + +gpu_list="${CUDA_VISIBLE_DEVICES:-0}" +IFS=',' read -ra GPULIST <<< "$gpu_list" + +CHUNKS=${#GPULIST[@]} + +SPLIT="llava_vqav2_mscoco_test-dev2015" + +#LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0 +#LLM_VARIANT="${LLM_VERSION#*/}" +# +#VT_VERSION=openai/clip-vit-large-patch14-336 +#VT_VARIANT="${VT_VERSION#*/}" +# +#MODEL_PATH="./checkpoints/tiny-llava-v1-${LLM_VARIANT}-${VT_VARIANT}-finetune" +#MODEL_NAME="tiny-llava-v1-1.1b-sharegpt4v" +#EVAL_DIR="/root/autodl-tmp/data/eval" +LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0 +LLM_VARIANT="${LLM_VERSION#*/}" + +VT_VERSION=openai/clip-vit-large-patch14-336 +VT_VARIANT="${VT_VERSION#*/}" + +MODEL_PATH="./checkpoints/tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}-finetune-lora" 
+MODEL_BASE="./checkpoints/tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}-pretrain" +MODEL_NAME="tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}" +EVAL_DIR="/root/autodl-tmp/data/eval" + +for IDX in $(seq 0 $((CHUNKS-1))); do + CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ + --model-path $MODEL_PATH \ + --question-file $EVAL_DIR/vqav2/$SPLIT.jsonl \ + --image-folder $EVAL_DIR/vqav2/test2015 \ + --answers-file $EVAL_DIR/vqav2/answers/$SPLIT/$MODEL_NAME/${CHUNKS}_${IDX}.jsonl \ + --num-chunks $CHUNKS \ + --chunk-idx $IDX \ + --temperature 0 \ + --model-base $MODEL_BASE + --conv-mode tiny_llama & +done + +wait + +output_file=$EVAL_DIR/vqav2/answers/$SPLIT/$MODEL_NAME/merge.jsonl + +# Clear out the output file if it exists. +> "$output_file" + +# Loop through the indices and concatenate each file. +for IDX in $(seq 0 $((CHUNKS-1))); do + cat $EVAL_DIR/vqav2/answers/$SPLIT/$MODEL_NAME/${CHUNKS}_${IDX}.jsonl >> "$output_file" +done + +python scripts/convert_vqav2_for_submission.py --split $SPLIT --ckpt $MODEL_NAME --dir $EVAL_DIR/vqav2 + diff --git a/scripts/tiny_llava/eval/gqa.sh b/scripts/tiny_llava/eval/gqa.sh new file mode 100644 index 0000000..2828f01 --- /dev/null +++ b/scripts/tiny_llava/eval/gqa.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +gpu_list="${CUDA_VISIBLE_DEVICES:-0}" +IFS=',' read -ra GPULIST <<< "$gpu_list" + +CHUNKS=${#GPULIST[@]} + +SPLIT="llava_gqa_testdev_balanced" +GQADIR="/root/autodl-tmp/data/eval/gqa/" + +LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0 +LLM_VARIANT="${LLM_VERSION#*/}" + +VT_VERSION=openai/clip-vit-large-patch14-336 +VT_VARIANT="${VT_VERSION#*/}" + +VERSION=type-3 + +MODEL_PATH="./checkpoints/tiny-llava-${VERSION}-${LLM_VARIANT}-${VT_VARIANT}-finetune-lora" +MODEL_BASE="./checkpoints/tiny-llava-${VERSION}-${LLM_VARIANT}-${VT_VARIANT}-pretrain" +#MODEL_BASE=$LLM_VERSION +MODEL_NAME="tiny-llava-${VERSION}-${LLM_VARIANT}-${VT_VARIANT}" +EVAL_DIR="/root/autodl-tmp/data/eval" + +for IDX in $(seq 0 $((CHUNKS-1))); do + CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m tinyllava.eval.model_vqa_loader \ + --model-path $MODEL_PATH \ + --question-file $EVAL_DIR/gqa/$SPLIT.jsonl \ + --image-folder $EVAL_DIR/gqa/images \ + --answers-file $EVAL_DIR/gqa/answers/$SPLIT/$MODEL_NAME/${CHUNKS}_${IDX}.jsonl \ + --num-chunks $CHUNKS \ + --chunk-idx $IDX \ + --temperature 0 \ + --model-base $MODEL_BASE\ + --conv-mode v1 & +done + +wait + +output_file=$EVAL_DIR/gqa/answers/$SPLIT/$MODEL_NAME/merge.jsonl + +# Clear out the output file if it exists. +> "$output_file" + +# Loop through the indices and concatenate each file. 
+for IDX in $(seq 0 $((CHUNKS-1))); do + cat $EVAL_DIR/gqa/answers/$SPLIT/$MODEL_NAME/${CHUNKS}_${IDX}.jsonl >> "$output_file" +done + +python scripts/convert_gqa_for_eval.py --src $output_file --dst $GQADIR/testdev_balanced_predictions.json + +cd $GQADIR +python eval/eval.py --tier testdev_balanced diff --git a/scripts/tiny_llava/eval/gqa_v1.sh b/scripts/tiny_llava/eval/gqa_v1.sh new file mode 100644 index 0000000..3425efc --- /dev/null +++ b/scripts/tiny_llava/eval/gqa_v1.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +gpu_list="${CUDA_VISIBLE_DEVICES:-0}" +IFS=',' read -ra GPULIST <<< "$gpu_list" + +CHUNKS=${#GPULIST[@]} + +SPLIT="llava_gqa_testdev_balanced" +GQADIR="/root/autodl-tmp/data/eval/gqa/" + + +VT_VERSION=openai/clip-vit-large-patch14-336 +VT_VARIANT="${VT_VERSION#*/}" + +MODEL_PATH="./checkpoints/tiny-llava-v1-1.1B-${VT_VARIANT}-finetune" +MODEL_NAME="tiny-llava-v1-1.1B-${VT_VARIANT}" +EVAL_DIR="/root/autodl-tmp/data/eval" + +for IDX in $(seq 0 $((CHUNKS-1))); do + CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ + --model-path $MODEL_PATH \ + --question-file $EVAL_DIR/gqa/$SPLIT.jsonl \ + --image-folder $EVAL_DIR/gqa/images \ + --answers-file $EVAL_DIR/gqa/answers/$SPLIT/$MODEL_NAME/${CHUNKS}_${IDX}.jsonl \ + --num-chunks $CHUNKS \ + --chunk-idx $IDX \ + --temperature 0 \ + --conv-mode v1 & +done + +wait + +output_file=$EVAL_DIR/gqa/answers/$SPLIT/$MODEL_NAME/merge.jsonl + +# Clear out the output file if it exists. +> "$output_file" + +# Loop through the indices and concatenate each file. +for IDX in $(seq 0 $((CHUNKS-1))); do + cat $EVAL_DIR/gqa/answers/$SPLIT/$MODEL_NAME/${CHUNKS}_${IDX}.jsonl >> "$output_file" +done + +python scripts/convert_gqa_for_eval.py --src $output_file --dst $GQADIR/testdev_balanced_predictions.json + +cd $GQADIR +python eval/eval.py --tier testdev_balanced diff --git a/scripts/tiny_llava/eval/imagenet.sh b/scripts/tiny_llava/eval/imagenet.sh new file mode 100644 index 0000000..bc99781 --- /dev/null +++ b/scripts/tiny_llava/eval/imagenet.sh @@ -0,0 +1,25 @@ +torchrun --nnodes=1 \ +--standalone \ +--nproc-per-node=8 \ +tinyllava/eval/eval_clip_imagenet.py \ +--linear_probe True \ +--pretrained_path ./checkpoints/tiny-llava-sharegpt4v-unlock-vit-from-12-tune-entire-model-TinyLlama-1.1B-Chat-v1.0-clip-vit-large-patch14-336-pretrain/vision_tower \ +--train_data_path /mnt/data/sata/winci/datasets/ImageNet/train \ +--eval_data_path /mnt/data/sata/winci/datasets/ImageNet/val \ +--learning_rate 1e-2 \ +--num_train_epochs 5 \ +--per_device_train_batch_size 64 \ +--per_device_eval_batch_size 64 \ +--logging_strategy steps \ +--logging_steps 1 \ +--evaluation_strategy epoch \ +--save_strategy epoch \ +--load_best_model_at_end True \ +--save_total_limit 1 \ +--seed 42 \ +--do_train \ +--do_eval \ +--optim sgd \ +--output_dir ./checkpoints/eval_imagenet/ \ +--bf16 True \ +--logging_steps 1 \ diff --git a/scripts/tiny_llava/eval/mmbench.sh b/scripts/tiny_llava/eval/mmbench.sh new file mode 100644 index 0000000..3193a1b --- /dev/null +++ b/scripts/tiny_llava/eval/mmbench.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +SPLIT="mmbench_dev_20230712" + +MODEL_PATH="./checkpoints/tiny-llava-v1-1.1B-clip-vit-large-patch14-336-finetune" +MODEL_NAME="tiny-llava-v1-1.1b" +EVAL_DIR="/root/autodl-tmp/data/eval" + +python -m tinyllava.eval.model_vqa_mmbench \ + --model-path $MODEL_PATH \ + --question-file $EVAL_DIR/mmbench/$SPLIT.tsv \ + --answers-file $EVAL_DIR/mmbench/answers/$SPLIT/$MODEL_NAME.jsonl \ + --single-pred-prompt \ + --temperature 0 \ + --conv-mode 
vicuna_v1
+
+mkdir -p $EVAL_DIR/mmbench/answers_upload/$SPLIT
+
+python scripts/convert_mmbench_for_submission.py \
+    --annotation-file $EVAL_DIR/mmbench/$SPLIT.tsv \
+    --result-dir $EVAL_DIR/mmbench/answers/$SPLIT \
+    --upload-dir $EVAL_DIR/mmbench/answers_upload/$SPLIT \
+    --experiment $MODEL_NAME
diff --git a/scripts/tiny_llava/eval/mmbench_cn.sh b/scripts/tiny_llava/eval/mmbench_cn.sh
new file mode 100644
index 0000000..acc2fe4
--- /dev/null
+++ b/scripts/tiny_llava/eval/mmbench_cn.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+
+SPLIT="mmbench_dev_cn_20231003"
+
+MODEL_PATH="./checkpoints/tiny-llava-v1-1.1B-clip-vit-large-patch14-336-finetune"
+MODEL_NAME="tiny-llava-v1-1.1b"
+EVAL_DIR="/root/autodl-tmp/data/eval"
+
+python -m tinyllava.eval.model_vqa_mmbench \
+    --model-path $MODEL_PATH \
+    --question-file $EVAL_DIR/mmbench_cn/$SPLIT.tsv \
+    --answers-file $EVAL_DIR/mmbench_cn/answers/$SPLIT/$MODEL_NAME.jsonl \
+    --lang cn \
+    --single-pred-prompt \
+    --temperature 0 \
+    --conv-mode vicuna_v1
+
+mkdir -p $EVAL_DIR/mmbench_cn/answers_upload/$SPLIT
+
+python scripts/convert_mmbench_for_submission.py \
+    --annotation-file $EVAL_DIR/mmbench_cn/$SPLIT.tsv \
+    --result-dir $EVAL_DIR/mmbench_cn/answers/$SPLIT \
+    --upload-dir $EVAL_DIR/mmbench_cn/answers_upload/$SPLIT \
+    --experiment $MODEL_NAME
diff --git a/scripts/tiny_llava/eval/mme.sh b/scripts/tiny_llava/eval/mme.sh
new file mode 100644
index 0000000..2f160e3
--- /dev/null
+++ b/scripts/tiny_llava/eval/mme.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+VERSION="$1"
+LLM_VERSION="$2"
+LLM_VARIANT="${LLM_VERSION#*/}"
+VT_VERSION=google/siglip-so400m-patch14-384
+VT_VARIANT="${VT_VERSION#*/}"
+
+MODEL_PATH="./checkpoints/tiny-llava-${VERSION}-${LLM_VARIANT}-${VT_VARIANT}-finetune"
+MODEL_NAME="tiny-llava-${VERSION}-${LLM_VARIANT}-${VT_VARIANT}"
+EVAL_DIR="/mnt/data/sata/ssd/dataset/eval"
+
+# Assumes the standard MME layout (llava_mme.jsonl, MME_Benchmark_release_version, eval_tool) under $EVAL_DIR/MME.
+python -m tinyllava.eval.model_vqa_loader \
+    --model-path $MODEL_PATH \
+    --question-file $EVAL_DIR/MME/llava_mme.jsonl \
+    --image-folder $EVAL_DIR/MME/MME_Benchmark_release_version \
+    --answers-file $EVAL_DIR/MME/answers/$MODEL_NAME.jsonl \
+    --temperature 0 \
+    --conv-mode vicuna_v1
+
+cd $EVAL_DIR/MME
+
+python convert_answer_to_mme.py --experiment $MODEL_NAME
+
+cd eval_tool
+
+python calculation.py --results_dir answers/$MODEL_NAME
diff --git a/scripts/tiny_llava/eval/mmvet.sh b/scripts/tiny_llava/eval/mmvet.sh
new file mode 100644
index 0000000..6b90127
--- /dev/null
+++ b/scripts/tiny_llava/eval/mmvet.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+MODEL_PATH="./checkpoints/tiny-llava-v1-1.1B-clip-vit-large-patch14-336-finetune"
+MODEL_NAME="tiny-llava-v1-1.1b"
+EVAL_DIR="/root/autodl-tmp/data/eval"
+
+python -m tinyllava.eval.model_vqa \
+    --model-path $MODEL_PATH \
+    --question-file $EVAL_DIR/mm-vet/tinyllava-mm-vet.jsonl \
+    --image-folder $EVAL_DIR/mm-vet/images \
+    --answers-file $EVAL_DIR/mm-vet/answers/$MODEL_NAME.jsonl \
+    --temperature 0 \
+    --conv-mode vicuna_v1
+
+mkdir -p $EVAL_DIR/mm-vet/results
+
+python scripts/convert_mmvet_for_eval.py \
+    --src $EVAL_DIR/mm-vet/answers/$MODEL_NAME.jsonl \
+    --dst $EVAL_DIR/mm-vet/results/$MODEL_NAME.json
+
diff --git a/scripts/tiny_llava/eval/pope.sh b/scripts/tiny_llava/eval/pope.sh
new file mode 100644
index 0000000..5e57d01
--- /dev/null
+++ b/scripts/tiny_llava/eval/pope.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0
+LLM_VARIANT="${LLM_VERSION#*/}"
+
+VT_VERSION=openai/clip-vit-large-patch14-336
+VT_VARIANT="${VT_VERSION#*/}"
+
+MODEL_PATH="./checkpoints/tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}-finetune-lora"
+MODEL_NAME="tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}"
+EVAL_DIR="/root/autodl-tmp/data/eval"
+
+python -m
tinyllava.eval.model_vqa_loader \ + --model-path $MODEL_PATH \ + --question-file $EVAL_DIR/pope/llava_pope_test.jsonl \ + --image-folder $EVAL_DIR/pope/val2014 \ + --answers-file $EVAL_DIR/pope/answers/$MODEL_NAME.jsonl \ + --temperature 0 \ + --conv-mode tiny_llama + +python tinyllava/eval/eval_pope.py \ + --annotation-dir $EVAL_DIR/pope/coco \ + --question-file $EVAL_DIR/pope/llava_pope_test.jsonl \ + --result-file $EVAL_DIR/pope/answers/$MODEL_NAME.jsonl diff --git a/scripts/tiny_llava/eval/pope_v1.sh b/scripts/tiny_llava/eval/pope_v1.sh new file mode 100644 index 0000000..f45a3e6 --- /dev/null +++ b/scripts/tiny_llava/eval/pope_v1.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0 +LLM_VARIANT="${LLM_VERSION#*/}" + +VT_VERSION=openai/clip-vit-base-patch16 +VT_VARIANT="${VT_VERSION#*/}" + + +MODEL_PATH="./checkpoints/tiny-llava-v1-1.1B-${VT_VARIANT}-finetune" +MODEL_NAME="tiny-llava-v1-1.1B-${VT_VARIANT}" +EVAL_DIR="/root/autodl-tmp/data/eval" + +python -m llava.eval.model_vqa_loader \ + --model-path $MODEL_PATH \ + --question-file $EVAL_DIR/pope/llava_pope_test.jsonl \ + --image-folder $EVAL_DIR/pope/val2014 \ + --answers-file $EVAL_DIR/pope/answers/$MODEL_NAME.jsonl \ + --temperature 0 \ + --conv-mode v1 + +python llava/eval/eval_pope.py \ + --annotation-dir $EVAL_DIR/pope/coco \ + --question-file $EVAL_DIR/pope/llava_pope_test.jsonl \ + --result-file $EVAL_DIR/pope/answers/$MODEL_NAME.jsonl diff --git a/scripts/tiny_llava/eval/sqa.sh b/scripts/tiny_llava/eval/sqa.sh new file mode 100644 index 0000000..c8fbea4 --- /dev/null +++ b/scripts/tiny_llava/eval/sqa.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0 +LLM_VARIANT="${LLM_VERSION#*/}" + +VT_VERSION=openai/clip-vit-large-patch14-336 +VT_VARIANT="${VT_VERSION#*/}" + +VERSION=type-3 + +MODEL_PATH="./checkpoints/tiny-llava-${VERSION}-${LLM_VARIANT}-${VT_VARIANT}-finetune-lora" +MODEL_BASE="./checkpoints/tiny-llava-${VERSION}-${LLM_VARIANT}-${VT_VARIANT}-pretrain" +#MODEL_BASE=$LLM_VERSION +MODEL_NAME="tiny-llava-${VERSION}-${LLM_VARIANT}-${VT_VARIANT}" +EVAL_DIR="/root/autodl-tmp/data/eval" + +python -m tinyllava.eval.model_vqa_science \ + --model-path $MODEL_PATH \ + --question-file $EVAL_DIR/scienceqa/llava_test_CQM-A.json \ + --image-folder $EVAL_DIR/scienceqa/images/test \ + --answers-file $EVAL_DIR/scienceqa/answers/$MODEL_NAME.jsonl \ + --single-pred-prompt \ + --temperature 0 \ + --model-base $MODEL_BASE \ + --conv-mode v1 + +python tinyllava/eval/eval_science_qa.py \ + --base-dir $EVAL_DIR/scienceqa \ + --result-file $EVAL_DIR/scienceqa/answers/$MODEL_NAME.jsonl \ + --output-file $EVAL_DIR/scienceqa/answers/"$MODEL_NAME"_output.jsonl \ + --output-result $EVAL_DIR/scienceqa/answers/"$MODEL_NAME"_result.json diff --git a/scripts/tiny_llava/eval/sqa_v1.sh b/scripts/tiny_llava/eval/sqa_v1.sh new file mode 100644 index 0000000..ca329b9 --- /dev/null +++ b/scripts/tiny_llava/eval/sqa_v1.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0 +LLM_VARIANT="${LLM_VERSION#*/}" + +VT_VERSION=openai/clip-vit-base-patch16 +VT_VARIANT="${VT_VERSION#*/}" + + +MODEL_PATH="./checkpoints/tiny-llava-v1-1.1B-${VT_VARIANT}-finetune" +MODEL_NAME="tiny-llava-v1-1.1B-${VT_VARIANT}" +EVAL_DIR="/root/autodl-tmp/data/eval" + +python -m llava.eval.model_vqa_science \ + --model-path $MODEL_PATH \ + --question-file $EVAL_DIR/scienceqa/llava_test_CQM-A.json \ + --image-folder $EVAL_DIR/scienceqa/images/test \ + --answers-file 
$EVAL_DIR/scienceqa/answers/$MODEL_NAME.jsonl \ + --single-pred-prompt \ + --temperature 0 \ + --conv-mode v1 + +python llava/eval/eval_science_qa.py \ + --base-dir $EVAL_DIR/scienceqa \ + --result-file $EVAL_DIR/scienceqa/answers/$MODEL_NAME.jsonl \ + --output-file $EVAL_DIR/scienceqa/answers/"$MODEL_NAME"_output.jsonl \ + --output-result $EVAL_DIR/scienceqa/answers/"$MODEL_NAME"_result.json + diff --git a/scripts/tiny_llava/eval/textvqa.sh b/scripts/tiny_llava/eval/textvqa.sh new file mode 100644 index 0000000..df50b72 --- /dev/null +++ b/scripts/tiny_llava/eval/textvqa.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0 +LLM_VARIANT="${LLM_VERSION#*/}" + +VT_VERSION=openai/clip-vit-large-patch14-336 +VT_VARIANT="${VT_VERSION#*/}" + +#MODEL_PATH="./checkpoints/tiny-tinyllava-v1.5-${LLM_VARIANT}-${VT_VARIANT}-finetune-lora" +#MODEL_NAME="tiny-tinyllava-v1.5-${LLM_VARIANT}-${VT_VARIANT}" +#EVAL_DIR="/root/autodl-tmp/data/eval" +VERSION=type-3 +MODEL_PATH="./checkpoints/tiny-llava-${VERSION}-${LLM_VARIANT}-${VT_VARIANT}-finetune-lora" +MODEL_BASE="./checkpoints/tiny-llava-${VERSION}-${LLM_VARIANT}-${VT_VARIANT}-pretrain" +#MODEL_BASE=$LLM_VERSION +MODEL_NAME="tiny-llava-${VERSION}-${LLM_VARIANT}-${VT_VARIANT}" +EVAL_DIR="/root/autodl-tmp/data/eval" + +python -m tinyllava.eval.model_vqa_loader \ + --model-path $MODEL_PATH \ + --question-file $EVAL_DIR/textvqa/llava_textvqa_val_v051_ocr.jsonl \ + --image-folder $EVAL_DIR/textvqa/train_images \ + --answers-file $EVAL_DIR/textvqa/answers/$MODEL_NAME.jsonl \ + --temperature 0 \ + --model-base $MODEL_BASE \ + --conv-mode v1 + +python -m tinyllava.eval.eval_textvqa \ + --annotation-file $EVAL_DIR/textvqa/TextVQA_0.5.1_val.json \ + --result-file $EVAL_DIR/textvqa/answers/$MODEL_NAME.jsonl diff --git a/scripts/tiny_llava/eval/textvqa_v1.sh b/scripts/tiny_llava/eval/textvqa_v1.sh new file mode 100644 index 0000000..9aa79c6 --- /dev/null +++ b/scripts/tiny_llava/eval/textvqa_v1.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0 +LLM_VARIANT="${LLM_VERSION#*/}" + +VT_VERSION=openai/clip-vit-large-patch14-336 +VT_VARIANT="${VT_VERSION#*/}" + +MODEL_PATH="./checkpoints/tiny-llava-v1-1.1B-${VT_VARIANT}-finetune" +MODEL_NAME="tiny-llava-v1-1.1B-${VT_VARIANT}" + +EVAL_DIR="/root/autodl-tmp/data/eval" + +python -m llava.eval.model_vqa_loader \ + --model-path $MODEL_PATH \ + --question-file $EVAL_DIR/textvqa/llava_textvqa_val_v051_ocr.jsonl \ + --image-folder $EVAL_DIR/textvqa/train_images \ + --answers-file $EVAL_DIR/textvqa/answers/$MODEL_NAME.jsonl \ + --temperature 0 \ + --conv-mode v1 + +python -m llava.eval.eval_textvqa \ + --annotation-file $EVAL_DIR/textvqa/TextVQA_0.5.1_val.json \ + --result-file $EVAL_DIR/textvqa/answers/$MODEL_NAME.jsonl diff --git a/scripts/tiny_llava/eval/vizwiz.sh b/scripts/tiny_llava/eval/vizwiz.sh new file mode 100644 index 0000000..46bc176 --- /dev/null +++ b/scripts/tiny_llava/eval/vizwiz.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0 +LLM_VARIANT="${LLM_VERSION#*/}" + +VT_VERSION=openai/clip-vit-large-patch14-336 +VT_VARIANT="${VT_VERSION#*/}" + +MODEL_PATH="./checkpoints/tiny-llava-v1-${LLM_VARIANT}-${VT_VARIANT}-finetune" +MODEL_NAME="tiny-llava-v1-1.1b-sharegpt4v" +EVAL_DIR="/root/autodl-tmp/data/eval" + +python -m tinyllava.eval.model_vqa_loader \ + --model-path $MODEL_PATH \ + --question-file $EVAL_DIR/vizwiz/llava_test.jsonl \ + --image-folder $EVAL_DIR/vizwiz/test \ + --answers-file 
$EVAL_DIR/vizwiz/answers/$MODEL_NAME.jsonl \ + --temperature 0 \ + --conv-mode vicuna_v1 + +python scripts/convert_vizwiz_for_submission.py \ + --annotation-file $EVAL_DIR/vizwiz/llava_test.jsonl \ + --result-file $EVAL_DIR/vizwiz/answers/$MODEL_NAME.jsonl \ + --result-upload-file $EVAL_DIR/vizwiz/answers_upload/$MODEL_NAME.json diff --git a/scripts/tiny_llava/eval/vizwiz_v1.sh b/scripts/tiny_llava/eval/vizwiz_v1.sh new file mode 100644 index 0000000..f7b075a --- /dev/null +++ b/scripts/tiny_llava/eval/vizwiz_v1.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0 +LLM_VARIANT="${LLM_VERSION#*/}" + +VT_VERSION=openai/clip-vit-large-patch14-336 +VT_VARIANT="${VT_VERSION#*/}" + +# MODEL_PATH="./checkpoints/tiny-llava-v1-${LLM_VARIANT}-${VT_VARIANT}-finetune" +# MODEL_NAME="tiny-llava-v1-1.1b-sharegpt4v" +# MODEL_BASE="./checkpoints/tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}-pretrain" +MODEL_PATH="./checkpoints/tiny-llava-v1-1.1B-${VT_VARIANT}-finetune" +MODEL_NAME="tiny-llava-v1-1.1B-${VT_VARIANT}" + +EVAL_DIR="/root/autodl-tmp/data/eval" + +python -m llava.eval.model_vqa_loader \ + --model-path $MODEL_PATH \ + --question-file $EVAL_DIR/vizwiz/llava_test.jsonl \ + --image-folder $EVAL_DIR/vizwiz/test \ + --answers-file $EVAL_DIR/vizwiz/answers/$MODEL_NAME.jsonl \ + --temperature 0 \ + --conv-mode v1 + +python scripts/convert_vizwiz_for_submission.py \ + --annotation-file $EVAL_DIR/vizwiz/llava_test.jsonl \ + --result-file $EVAL_DIR/vizwiz/answers/$MODEL_NAME.jsonl \ + --result-upload-file $EVAL_DIR/vizwiz/answers_upload/$MODEL_NAME.json diff --git a/scripts/tiny_llava/eval/vqav2.sh b/scripts/tiny_llava/eval/vqav2.sh new file mode 100644 index 0000000..4d076c4 --- /dev/null +++ b/scripts/tiny_llava/eval/vqav2.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +gpu_list="${CUDA_VISIBLE_DEVICES:-0}" +IFS=',' read -ra GPULIST <<< "$gpu_list" + +CHUNKS=${#GPULIST[@]} + +SPLIT="llava_vqav2_mscoco_test-dev2015" + +#LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0 +#LLM_VARIANT="${LLM_VERSION#*/}" +# +#VT_VERSION=openai/clip-vit-large-patch14-336 +#VT_VARIANT="${VT_VERSION#*/}" +# +#MODEL_PATH="./checkpoints/tiny-tinyllava-v1-${LLM_VARIANT}-${VT_VARIANT}-finetune" +#MODEL_NAME="tiny-tinyllava-v1-1.1b-sharegpt4v" +#EVAL_DIR="/root/autodl-tmp/data/eval" +LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0 +LLM_VARIANT="${LLM_VERSION#*/}" + +VT_VERSION=openai/clip-vit-large-patch14-336 +VT_VARIANT="${VT_VERSION#*/}" + +MODEL_PATH="./checkpoints/tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}-finetune-lora" +MODEL_BASE="./checkpoints/tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}-pretrain" +MODEL_NAME="tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}" +EVAL_DIR="/root/autodl-tmp/data/eval" + +for IDX in $(seq 0 $((CHUNKS-1))); do + CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m tinyllava.eval.model_vqa_loader \ + --model-path $MODEL_PATH \ + --question-file $EVAL_DIR/vqav2/$SPLIT.jsonl \ + --image-folder $EVAL_DIR/vqav2/test2015 \ + --answers-file $EVAL_DIR/vqav2/answers/$SPLIT/$MODEL_NAME/${CHUNKS}_${IDX}.jsonl \ + --num-chunks $CHUNKS \ + --chunk-idx $IDX \ + --temperature 0 \ + --conv-mode vicuna_v1 & +done + +wait + +output_file=$EVAL_DIR/vqav2/answers/$SPLIT/$MODEL_NAME/merge.jsonl + +# Clear out the output file if it exists. +> "$output_file" + +# Loop through the indices and concatenate each file. 
+for IDX in $(seq 0 $((CHUNKS-1))); do + cat $EVAL_DIR/vqav2/answers/$SPLIT/$MODEL_NAME/${CHUNKS}_${IDX}.jsonl >> "$output_file" +done + +python scripts/convert_vqav2_for_submission.py --split $SPLIT --ckpt $MODEL_NAME --dir $EVAL_DIR/vqav2 + diff --git a/scripts/tiny_llava/finetune.sh b/scripts/tiny_llava/finetune.sh new file mode 100644 index 0000000..ceca30a --- /dev/null +++ b/scripts/tiny_llava/finetune.sh @@ -0,0 +1,56 @@ +#!/bin/bash +if [ $# -ne 5 ]; then + echo "Usage: $0 " + exit 1 +fi + +# Assign the arguments to variables +LLM_VERSION="$1" +VT_VERSION="$2" +DATA_PATH="$3" +IMAGE_PATH="$4" +VERSION="$5" + +# LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0 +# VT_VERSION=openai/clip-vit-base-patch16 +# DATA_PATH=/root/autodl-tmp/data/pretraining_data/LLaVA-Data/text_files/llava_v1_5_mix665k.json +# IMAGE_PATH=/root/autodl-tmp/data/pretraining_data/LLaVA-Data +VT_VARIANT="${VT_VERSION#*/}" +LLM_VARIANT="${LLM_VERSION#*/}" + +deepspeed llava/train/train_mem.py \ + --deepspeed ./scripts/zero3.json \ + --model_name_or_path $LLM_VERSION \ + --version v1 \ + --data_path $DATA_PATH \ + --image_folder $IMAGE_PATH\ + --vision_tower $VT_VERSION \ + --pretrain_mm_mlp_adapter ./checkpoints/tiny-llava-${VERSION}-${LLM_VARIANT}-${VT_VARIANT}-pretrain/mm_projector.bin \ + --mm_projector_type mlp2x_gelu \ + --mm_vision_select_layer -2 \ + --mm_use_im_start_end False \ + --mm_use_im_patch_token False \ + --image_aspect_ratio pad \ + --group_by_modality_length True \ + --fp16 True \ + --output_dir ./checkpoints/tiny-llava-${VERSION}-${LLM_VARIANT}-${VT_VARIANT}-finetune \ + --num_train_epochs 1 \ + --per_device_train_batch_size 16 \ + --per_device_eval_batch_size 4 \ + --gradient_accumulation_steps 1 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 5000 \ + --save_total_limit 1 \ + --learning_rate 2e-5 \ + --weight_decay 0. 
\ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --tf32 False \ + --model_max_length 2048 \ + --gradient_checkpointing True \ + --dataloader_num_workers 30 \ + --lazy_preprocess True \ + --report_to wandb \ + --run_name llava-${VERSION}-finetune-${LLM_VARIANT}-${VT_VARIANT} diff --git a/scripts/tiny_llava/finetune_lora.sh b/scripts/tiny_llava/finetune_lora.sh new file mode 100644 index 0000000..9b3f9e6 --- /dev/null +++ b/scripts/tiny_llava/finetune_lora.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +if [ $# -ne 5 ]; then + echo "Usage: $0 " + exit 1 +fi + +# Assign the arguments to variables +LLM_VERSION="$1" +VT_VERSION="$2" +DATA_PATH="$3" +IMAGE_PATH="$4" +VERSION="$5" + +VT_VARIANT="${VT_VERSION#*/}" +LLM_VARIANT="${LLM_VERSION#*/}" + +# --model_name_or_path ./checkpoints/tiny-tinyllava-type-2-${LLM_VARIANT}-${VT_VARIANT}-pretrain +deepspeed llava/train/train_mem.py \ + --lora_enable True --lora_r 128 --lora_alpha 256 \ + --deepspeed ./scripts/zero3.json \ + --model_name_or_path $LLM_VERSION \ + --pretrain_mm_mlp_adapter ./checkpoints/tiny-llava-${VERSION}-${LLM_VARIANT}-${VT_VARIANT}-pretrain/mm_projector.bin \ + --version v1 \ + --data_path $DATA_PATH \ + --image_folder $IMAGE_PATH \ + --vision_tower $VT_VERSION \ + --mm_projector_type mlp2x_gelu \ + --mm_vision_select_layer -2 \ + --mm_use_im_start_end False \ + --mm_use_im_patch_token False \ + --image_aspect_ratio pad \ + --group_by_modality_length True \ + --fp16 True \ + --output_dir ./checkpoints/tiny-llava-${VERSION}-${LLM_VARIANT}-${VT_VARIANT}-finetune-lora \ + --num_train_epochs 1 \ + --per_device_train_batch_size 16 \ + --per_device_eval_batch_size 4 \ + --gradient_accumulation_steps 1 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 5000 \ + --save_total_limit 1 \ + --learning_rate 2e-5 \ + --weight_decay 0. 
\ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --tf32 False \ + --model_max_length 2048 \ + --gradient_checkpointing True \ + --dataloader_num_workers 30 \ + --lazy_preprocess True \ + --report_to wandb \ + --run_name tiny-llava-${VERSION}-finetune-lora-${LLM_VARIANT}-${VT_VARIANT} diff --git a/scripts/tiny_llava/finetune_lora_llm_open.sh b/scripts/tiny_llava/finetune_lora_llm_open.sh new file mode 100644 index 0000000..e69de29 diff --git a/scripts/tiny_llava/finetune_lora_type3.sh b/scripts/tiny_llava/finetune_lora_type3.sh new file mode 100644 index 0000000..410f5a1 --- /dev/null +++ b/scripts/tiny_llava/finetune_lora_type3.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +if [ $# -ne 5 ]; then + echo "Usage: $0 " + exit 1 +fi + +# Assign the arguments to variables +LLM_VERSION="$1" +VT_VERSION="$2" +DATA_PATH="$3" +IMAGE_PATH="$4" +VERSION="$5" + +VT_VARIANT="${VT_VERSION#*/}" +LLM_VARIANT="${LLM_VERSION#*/}" + +# --model_name_or_path ./checkpoints/tiny-tinyllava-type-2-${LLM_VARIANT}-${VT_VARIANT}-pretrain +deepspeed llava/train/train_mem.py \ + --lora_enable True --lora_r 128 --lora_alpha 256 \ + --deepspeed ./scripts/zero3.json \ + --model_name_or_path ./checkpoints/tiny-llava-${VERSION}-${LLM_VARIANT}-${VT_VARIANT}-pretrain \ + --version v1 \ + --data_path $DATA_PATH \ + --image_folder $IMAGE_PATH \ + --vision_tower $VT_VERSION \ + --mm_projector_type mlp2x_gelu \ + --mm_vision_select_layer -2 \ + --mm_use_im_start_end False \ + --mm_use_im_patch_token False \ + --image_aspect_ratio pad \ + --group_by_modality_length True \ + --fp16 True \ + --output_dir ./checkpoints/tiny-llava-${VERSION}-${LLM_VARIANT}-${VT_VARIANT}-finetune-lora \ + --num_train_epochs 1 \ + --per_device_train_batch_size 16 \ + --per_device_eval_batch_size 4 \ + --gradient_accumulation_steps 1 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 5000 \ + --save_total_limit 1 \ + --learning_rate 2e-5 \ + --weight_decay 0. 
\ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --tf32 False \ + --model_max_length 2048 \ + --gradient_checkpointing True \ + --dataloader_num_workers 30 \ + --lazy_preprocess True \ + --report_to wandb \ + --run_name tiny-llava-${VERSION}-finetune-lora-${LLM_VARIANT}-${VT_VARIANT} diff --git a/scripts/tiny_llava/finetune_resamplers.sh b/scripts/tiny_llava/finetune_resamplers.sh new file mode 100644 index 0000000..e69de29 diff --git a/scripts/tiny_llava/finetune_type4.sh b/scripts/tiny_llava/finetune_type4.sh new file mode 100644 index 0000000..83be89a --- /dev/null +++ b/scripts/tiny_llava/finetune_type4.sh @@ -0,0 +1,55 @@ +#!/bin/bash +if [ $# -ne 5 ]; then + echo "Usage: $0 " + exit 1 +fi + +# Assign the arguments to variables +LLM_VERSION="$1" +VT_VERSION="$2" +DATA_PATH="$3" +IMAGE_PATH="$4" +VERSION="$5" + +# LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0 +# VT_VERSION=openai/clip-vit-base-patch16 +# DATA_PATH=/root/autodl-tmp/data/pretraining_data/LLaVA-Data/text_files/llava_v1_5_mix665k.json +# IMAGE_PATH=/root/autodl-tmp/data/pretraining_data/LLaVA-Data +VT_VARIANT="${VT_VERSION#*/}" +LLM_VARIANT="${LLM_VERSION#*/}" + +deepspeed llava/train/train_mem.py \ + --deepspeed ./scripts/zero3.json \ + --model_name_or_path ./checkpoints/tiny-llava-${VERSION}-${LLM_VARIANT}-${VT_VARIANT}-pretrain \ + --version v1 \ + --data_path $DATA_PATH \ + --image_folder $IMAGE_PATH\ + --vision_tower $VT_VERSION \ + --mm_projector_type mlp2x_gelu \ + --mm_vision_select_layer -2 \ + --mm_use_im_start_end False \ + --mm_use_im_patch_token False \ + --image_aspect_ratio pad \ + --group_by_modality_length True \ + --fp16 True \ + --output_dir ./checkpoints/tiny-llava-${VERSION}-${LLM_VARIANT}-${VT_VARIANT}-finetune \ + --num_train_epochs 1 \ + --per_device_train_batch_size 16 \ + --per_device_eval_batch_size 4 \ + --gradient_accumulation_steps 1 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 5000 \ + --save_total_limit 1 \ + --learning_rate 2e-5 \ + --weight_decay 0. 
diff --git a/scripts/tiny_llava/pretrain.sh b/scripts/tiny_llava/pretrain.sh
new file mode 100755
index 0000000..23be997
--- /dev/null
+++ b/scripts/tiny_llava/pretrain.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+
+if [ $# -ne 5 ]; then
+    echo "Usage: $0 <LLM_VERSION> <VT_VERSION> <DATA_PATH> <IMAGE_PATH> <VERSION>"
+    exit 1
+fi
+
+# Assign the arguments to variables
+LLM_VERSION="$1"
+VT_VERSION="$2"
+DATA_PATH="$3"
+IMAGE_PATH="$4"
+VERSION="$5"
+
+# LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0
+# VT_VERSION=openai/clip-vit-base-patch16
+# DATA_PATH=/root/autodl-tmp/data/tinyllava/blip_laion_cc_sbu_558k.json
+# IMAGE_PATH=/root/autodl-tmp/data/tinyllava/llava_pretrain/images
+VT_VARIANT="${VT_VERSION#*/}"
+LLM_VARIANT="${LLM_VERSION#*/}"
+
+deepspeed llava/train/train_mem.py \
+    --deepspeed ./scripts/zero3.json \
+    --model_name_or_path $LLM_VERSION \
+    --version plain \
+    --data_path $DATA_PATH \
+    --image_folder $IMAGE_PATH \
+    --vision_tower $VT_VERSION \
+    --mm_projector_type mlp2x_gelu \
+    --mm_vision_select_layer -2 \
+    --mm_use_im_start_end False \
+    --mm_use_im_patch_token False \
+    --tune_mm_mlp_adapter True \
+    --fp16 True \
+    --output_dir ./checkpoints/tiny-llava-"${VERSION}"-"${LLM_VARIANT}"-"${VT_VARIANT}"-pretrain \
+    --num_train_epochs 1 \
+    --per_device_train_batch_size 32 \
+    --per_device_eval_batch_size 4 \
+    --gradient_accumulation_steps 1 \
+    --evaluation_strategy "no" \
+    --save_strategy "steps" \
+    --save_steps 2400 \
+    --save_total_limit 1 \
+    --learning_rate 1e-3 \
+    --weight_decay 0. \
+    --warmup_ratio 0.03 \
+    --lr_scheduler_type "cosine" \
+    --logging_steps 1 \
+    --tf32 False \
+    --model_max_length 2048 \
+    --gradient_checkpointing True \
+    --dataloader_num_workers 30 \
+    --lazy_preprocess True \
+    --report_to wandb \
+    --run_name tiny-llava-"${VERSION}"-pretrain-"${LLM_VARIANT}"-"${VT_VARIANT}"
diff --git a/scripts/tiny_llava/pretrain_baselines.sh b/scripts/tiny_llava/pretrain_baselines.sh
new file mode 100644
index 0000000..1bacde2
--- /dev/null
+++ b/scripts/tiny_llava/pretrain_baselines.sh
@@ -0,0 +1,9 @@
+LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0
+VT_VERSIONS=(openai/clip-vit-large-patch14-336 openai/clip-vit-large-patch14 openai/clip-vit-base-patch16 openai/clip-vit-large-patch32)
+DATA_PATH=/root/autodl-tmp/data/llava/blip_laion_cc_sbu_558k.json
+IMAGE_PATH=/root/autodl-tmp/data/llava/llava_pretrain/images
+
+
+for VT_VERSION in "${VT_VERSIONS[@]}"; do
+    bash scripts/tiny_llava/pretrain.sh "$LLM_VERSION" "$VT_VERSION" "$DATA_PATH" "$IMAGE_PATH"
+done
diff --git a/scripts/tiny_llava/pretrain_llm_open.sh b/scripts/tiny_llava/pretrain_llm_open.sh
new file mode 100644
index 0000000..e69de29
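Note that pretrain.sh exits at its usage check unless exactly five arguments are supplied, while the loop in pretrain_baselines.sh passes only four. A minimal sketch of a fix, assuming an arbitrary run tag (the tag "base" is hypothetical, chosen to mirror the "type-2"/"type-3"/"type-4" tags used elsewhere in this patch):

for VT_VERSION in "${VT_VERSIONS[@]}"; do
    # pass a fifth argument so pretrain.sh receives the VERSION it expects
    bash scripts/tiny_llava/pretrain.sh "$LLM_VERSION" "$VT_VERSION" "$DATA_PATH" "$IMAGE_PATH" base
done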
diff --git a/scripts/tiny_llava/pretrain_type3.sh b/scripts/tiny_llava/pretrain_type3.sh
new file mode 100644
index 0000000..df53ca8
--- /dev/null
+++ b/scripts/tiny_llava/pretrain_type3.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+
+if [ $# -ne 5 ]; then
+    echo "Usage: $0 <LLM_VERSION> <VT_VERSION> <DATA_PATH> <IMAGE_PATH> <VERSION>"
+    exit 1
+fi
+
+# Assign the arguments to variables
+LLM_VERSION="$1"
+VT_VERSION="$2"
+DATA_PATH="$3"
+IMAGE_PATH="$4"
+VERSION="$5"
+
+# LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0
+# VT_VERSION=openai/clip-vit-base-patch16
+# DATA_PATH=/root/autodl-tmp/data/tinyllava/blip_laion_cc_sbu_558k.json
+# IMAGE_PATH=/root/autodl-tmp/data/tinyllava/llava_pretrain/images
+VT_VARIANT="${VT_VERSION#*/}"
+LLM_VARIANT="${LLM_VERSION#*/}"
+
+deepspeed llava/train/train_mem.py \
+    --deepspeed ./scripts/zero3.json \
+    --model_name_or_path $LLM_VERSION \
+    --version plain \
+    --data_path $DATA_PATH \
+    --image_folder $IMAGE_PATH \
+    --vision_tower $VT_VERSION \
+    --mm_projector_type mlp2x_gelu \
+    --mm_vision_select_layer -2 \
+    --mm_use_im_start_end False \
+    --mm_use_im_patch_token False \
+    --fp16 True \
+    --output_dir ./checkpoints/tiny-llava-"${VERSION}"-"${LLM_VARIANT}"-"${VT_VARIANT}"-pretrain \
+    --num_train_epochs 1 \
+    --per_device_train_batch_size 32 \
+    --per_device_eval_batch_size 4 \
+    --gradient_accumulation_steps 1 \
+    --evaluation_strategy "no" \
+    --save_strategy "steps" \
+    --save_steps 2400 \
+    --save_total_limit 1 \
+    --learning_rate 2e-5 \
+    --weight_decay 0. \
+    --warmup_ratio 0.03 \
+    --lr_scheduler_type "cosine" \
+    --logging_steps 1 \
+    --tf32 False \
+    --model_max_length 2048 \
+    --gradient_checkpointing True \
+    --dataloader_num_workers 30 \
+    --lazy_preprocess True \
+    --report_to wandb \
+    --run_name tiny-llava-"${VERSION}"-pretrain-"${LLM_VARIANT}"-"${VT_VARIANT}"
diff --git a/scripts/tiny_llava/pretrain_type4.sh b/scripts/tiny_llava/pretrain_type4.sh
new file mode 100644
index 0000000..df53ca8
--- /dev/null
+++ b/scripts/tiny_llava/pretrain_type4.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+
+if [ $# -ne 5 ]; then
+    echo "Usage: $0 <LLM_VERSION> <VT_VERSION> <DATA_PATH> <IMAGE_PATH> <VERSION>"
+    exit 1
+fi
+
+# Assign the arguments to variables
+LLM_VERSION="$1"
+VT_VERSION="$2"
+DATA_PATH="$3"
+IMAGE_PATH="$4"
+VERSION="$5"
+
+# LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0
+# VT_VERSION=openai/clip-vit-base-patch16
+# DATA_PATH=/root/autodl-tmp/data/tinyllava/blip_laion_cc_sbu_558k.json
+# IMAGE_PATH=/root/autodl-tmp/data/tinyllava/llava_pretrain/images
+VT_VARIANT="${VT_VERSION#*/}"
+LLM_VARIANT="${LLM_VERSION#*/}"
+
+deepspeed llava/train/train_mem.py \
+    --deepspeed ./scripts/zero3.json \
+    --model_name_or_path $LLM_VERSION \
+    --version plain \
+    --data_path $DATA_PATH \
+    --image_folder $IMAGE_PATH \
+    --vision_tower $VT_VERSION \
+    --mm_projector_type mlp2x_gelu \
+    --mm_vision_select_layer -2 \
+    --mm_use_im_start_end False \
+    --mm_use_im_patch_token False \
+    --fp16 True \
+    --output_dir ./checkpoints/tiny-llava-"${VERSION}"-"${LLM_VARIANT}"-"${VT_VARIANT}"-pretrain \
+    --num_train_epochs 1 \
+    --per_device_train_batch_size 32 \
+    --per_device_eval_batch_size 4 \
+    --gradient_accumulation_steps 1 \
+    --evaluation_strategy "no" \
+    --save_strategy "steps" \
+    --save_steps 2400 \
+    --save_total_limit 1 \
+    --learning_rate 2e-5 \
+    --weight_decay 0. \
+    --warmup_ratio 0.03 \
+    --lr_scheduler_type "cosine" \
+    --logging_steps 1 \
+    --tf32 False \
+    --model_max_length 2048 \
+    --gradient_checkpointing True \
+    --dataloader_num_workers 30 \
+    --lazy_preprocess True \
+    --report_to wandb \
+    --run_name tiny-llava-"${VERSION}"-pretrain-"${LLM_VARIANT}"-"${VT_VARIANT}"
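pretrain_type3.sh and pretrain_type4.sh are added with the same blob hash (df53ca8), i.e. identical contents; both differ from pretrain.sh only in dropping --tune_mm_mlp_adapter and lowering the learning rate from 1e-3 to 2e-5. Once the patch is applied this can be confirmed locally:

# verify the two pretrain variants are byte-identical
diff scripts/tiny_llava/pretrain_type3.sh scripts/tiny_llava/pretrain_type4.sh && echo "identical"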
diff --git a/scripts/tiny_llava/train_baselines.sh b/scripts/tiny_llava/train_baselines.sh
new file mode 100644
index 0000000..494730b
--- /dev/null
+++ b/scripts/tiny_llava/train_baselines.sh
@@ -0,0 +1,31 @@
+#LLM_VERSION=susnato/phi-1_5_dev
+LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0
+#VT_VERSIONS=(openai/clip-vit-large-patch14-336 openai/clip-vit-large-patch14 openai/clip-vit-base-patch16 openai/clip-vit-base-patch32)
+VT_VERSIONS=(openai/clip-vit-large-patch14-336)
+#DATA_PATH=/root/autodl-tmp/data/text_files/really_cleaned_share-captioner_coco_lcs_sam_1246k_1107.json
+DATA_PATH=/root/autodl-tmp/data/llava/blip_laion_cc_sbu_558k.json
+#FINETUNE_DATA_PATH=/root/autodl-tmp/data/text_files/cleaned_sharegpt4v_mix665k_cap23k_coco-ap9k_lcs3k_sam9k_div2k.json
+FINETUNE_DATA_PATH=/root/autodl-tmp/data/text_files/llava_v1_5_mix665k.json
+#IMAGE_PATH=/root/autodl-tmp/data/
+#FINETUNE_IMAGE_PATH=/root/autodl-tmp/data
+IMAGE_PATH=/root/autodl-tmp/data/llava/llava_pretrain/images
+FINETUNE_IMAGE_PATH=/root/autodl-tmp/data
+
+## type 2 training
+#for VT_VERSION in "${VT_VERSIONS[@]}"; do
+## bash scripts/tiny_llava/pretrain.sh "$LLM_VERSION" "$VT_VERSION" "$DATA_PATH" "$IMAGE_PATH" "type-2"
+# bash scripts/tiny_llava/finetune_lora.sh "$LLM_VERSION" "$VT_VERSION" "$FINETUNE_DATA_PATH" "$FINETUNE_IMAGE_PATH" type-2
+#done
+#
+#
+## type 3 training
+#for VT_VERSION in "${VT_VERSIONS[@]}"; do
+# bash scripts/tiny_llava/pretrain_type3.sh "$LLM_VERSION" "$VT_VERSION" "$DATA_PATH" "$IMAGE_PATH" type-3
+# bash scripts/tiny_llava/finetune_lora_type3.sh "$LLM_VERSION" "$VT_VERSION" "$FINETUNE_DATA_PATH" "$FINETUNE_IMAGE_PATH" type-3
+#done
+
+# type-4 training
+for VT_VERSION in "${VT_VERSIONS[@]}"; do
+    bash scripts/tiny_llava/pretrain_type4.sh "$LLM_VERSION" "$VT_VERSION" "$DATA_PATH" "$IMAGE_PATH" type-4
+    bash scripts/tiny_llava/finetune_type4.sh "$LLM_VERSION" "$VT_VERSION" "$FINETUNE_DATA_PATH" "$FINETUNE_IMAGE_PATH" type-4
+done
diff --git a/scripts/tiny_llava/train_baselines_phi_jia.sh b/scripts/tiny_llava/train_baselines_phi_jia.sh
new file mode 100644
index 0000000..ee1973a
--- /dev/null
+++ b/scripts/tiny_llava/train_baselines_phi_jia.sh
@@ -0,0 +1,18 @@
+LLM_VERSION=susnato/phi-1_5_dev
+# LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0
+VT_VERSIONS=(openai/clip-vit-large-patch14-336)
+#DATA_PATH=/root/autodl-tmp/data/text_files/really_cleaned_share-captioner_coco_lcs_sam_1246k_1107.json
+DATA_PATH=/root/autodl-tmp/data/llava/blip_laion_cc_sbu_558k.json
+#FINETUNE_DATA_PATH=/root/autodl-tmp/data/text_files/cleaned_sharegpt4v_mix665k_cap23k_coco-ap9k_lcs3k_sam9k_div2k.json
+FINETUNE_DATA_PATH=/root/autodl-tmp/data/text_files/llava_v1_5_mix665k.json
+#IMAGE_PATH=/root/autodl-tmp/data/
+#FINETUNE_IMAGE_PATH=/root/autodl-tmp/data
+IMAGE_PATH=/root/autodl-tmp/data/llava/llava_pretrain/images
+FINETUNE_IMAGE_PATH=/root/autodl-tmp/data
+
+
+for VT_VERSION in "${VT_VERSIONS[@]}"; do
+    # bash scripts/tiny_llava/pretrain.sh "$LLM_VERSION" "$VT_VERSION" "$DATA_PATH" "$IMAGE_PATH"
+    bash scripts/tiny_llava/finetune_lora.sh "$LLM_VERSION" "$VT_VERSION" "$FINETUNE_DATA_PATH" "$FINETUNE_IMAGE_PATH"
+done
+
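One caveat: the active loop in train_baselines_phi_jia.sh calls finetune_lora.sh with four arguments, yet the tail of finetune_lora.sh shown earlier in this patch interpolates ${VERSION} into its run_name. If that script uses the same five-argument check as the other scripts here, the call will stop at the usage message; a sketch of a fix with a hypothetical run tag:

for VT_VERSION in "${VT_VERSIONS[@]}"; do
    bash scripts/tiny_llava/finetune_lora.sh "$LLM_VERSION" "$VT_VERSION" "$FINETUNE_DATA_PATH" "$FINETUNE_IMAGE_PATH" phi-1_5
done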
diff --git a/scripts/zero2.json b/scripts/zero2.json
new file mode 100644
index 0000000..7a01fda
--- /dev/null
+++ b/scripts/zero2.json
@@ -0,0 +1,23 @@
+{
+    "fp16": {
+        "enabled": "auto",
+        "loss_scale": 0,
+        "loss_scale_window": 1000,
+        "initial_scale_power": 16,
+        "hysteresis": 2,
+        "min_loss_scale": 1
+    },
+    "bf16": {
+        "enabled": "auto"
+    },
+    "train_micro_batch_size_per_gpu": "auto",
+    "train_batch_size": "auto",
+    "gradient_accumulation_steps": "auto",
+    "zero_optimization": {
+        "stage": 2,
+        "overlap_comm": true,
+        "contiguous_gradients": true,
+        "sub_group_size": 1e9,
+        "reduce_bucket_size": "auto"
+    }
+}
\ No newline at end of file
diff --git a/scripts/zero3.json b/scripts/zero3.json
new file mode 100644
index 0000000..8ff461f
--- /dev/null
+++ b/scripts/zero3.json
@@ -0,0 +1,28 @@
+{
+    "fp16": {
+        "enabled": "auto",
+        "loss_scale": 0,
+        "loss_scale_window": 1000,
+        "initial_scale_power": 16,
+        "hysteresis": 2,
+        "min_loss_scale": 1
+    },
+    "bf16": {
+        "enabled": "auto"
+    },
+    "train_micro_batch_size_per_gpu": "auto",
+    "train_batch_size": "auto",
+    "gradient_accumulation_steps": "auto",
+    "zero_optimization": {
+        "stage": 3,
+        "overlap_comm": true,
+        "contiguous_gradients": true,
+        "sub_group_size": 1e9,
+        "reduce_bucket_size": "auto",
+        "stage3_prefetch_bucket_size": "auto",
+        "stage3_param_persistence_threshold": "auto",
+        "stage3_max_live_parameters": 1e9,
+        "stage3_max_reuse_distance": 1e9,
+        "stage3_gather_16bit_weights_on_model_save": true
+    }
+}
\ No newline at end of file
diff --git a/scripts/zero3_offload.json b/scripts/zero3_offload.json
new file mode 100644
index 0000000..2dcde84
--- /dev/null
+++ b/scripts/zero3_offload.json
@@ -0,0 +1,56 @@
+{
+    "fp16": {
+        "enabled": "auto",
+        "loss_scale": 0,
+        "loss_scale_window": 1000,
+        "initial_scale_power": 16,
+        "hysteresis": 2,
+        "min_loss_scale": 1
+    },
+    "bf16": {
+        "enabled": "auto"
+    },
+    "optimizer": {
+        "type": "AdamW",
+        "params": {
+            "lr": "auto",
+            "betas": "auto",
+            "eps": "auto",
+            "weight_decay": "auto"
+        }
+    },
+    "scheduler": {
+        "type": "WarmupLR",
+        "params": {
+            "warmup_min_lr": "auto",
+            "warmup_max_lr": "auto",
+            "warmup_num_steps": "auto"
+        }
+    },
+    "zero_optimization": {
+        "stage": 3,
+        "offload_optimizer": {
+            "device": "cpu",
+            "pin_memory": true
+        },
+        "offload_param": {
+            "device": "cpu",
+            "pin_memory": true
+        },
+        "overlap_comm": true,
+        "contiguous_gradients": true,
+        "sub_group_size": 1e9,
+        "reduce_bucket_size": "auto",
+        "stage3_prefetch_bucket_size": "auto",
+        "stage3_param_persistence_threshold": "auto",
+        "stage3_max_live_parameters": 1e9,
+        "stage3_max_reuse_distance": 1e9,
+        "gather_16bit_weights_on_model_save": true
+    },
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": "auto",
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "steps_per_print": 1e5,
+    "wall_clock_breakdown": false
+}
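The three DeepSpeed configs keep fp16/bf16 on "auto", so precision follows the --fp16/--bf16 flags passed by the launch scripts; only zero3_offload.json additionally pins the optimizer and scheduler (AdamW + WarmupLR) and offloads optimizer state and parameters to CPU. A quick, standard-library-only sanity check that the files parse (note that zero2.json and zero3.json are committed without a trailing newline):

# parse each config to catch JSON typos before launching a run
for f in scripts/zero2.json scripts/zero3.json scripts/zero3_offload.json; do
    python -m json.tool "$f" > /dev/null && echo "$f: valid JSON"
done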