From d19cdd86260c41dfe73a1283dbce9dfa8893c8f3 Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 23 Feb 2024 14:32:40 +0800 Subject: [PATCH] add scripts --- .../.ipynb_checkpoints/zero3-checkpoint.json | 28 + scripts/convert_gqa_for_eval.py | 18 + scripts/convert_mmbench_for_submission.py | 27 + scripts/convert_mmvet_for_eval.py | 18 + scripts/convert_seed_for_submission.py | 74 +++ scripts/convert_sqa_to_llava.py | 88 +++ scripts/convert_sqa_to_llava_base_prompt.py | 334 +++++++++++ scripts/convert_vizwiz_for_submission.py | 47 ++ scripts/convert_vqav2_for_submission.py | 56 ++ scripts/extract_mm_projector.py | 47 ++ scripts/finetune.sh | 48 ++ scripts/finetune_full_schedule.sh | 48 ++ scripts/finetune_lora.sh | 49 ++ scripts/finetune_qlora.sh | 50 ++ scripts/finetune_sqa.sh | 36 ++ scripts/merge_lora_weights.py | 22 + scripts/pretrain.sh | 46 ++ scripts/pretrain_xformers.sh | 44 ++ scripts/sqa_eval_batch.sh | 13 + scripts/sqa_eval_gather.sh | 18 + .../.ipynb_checkpoints/finetune-checkpoint.sh | 55 ++ .../finetune_lora-checkpoint.sh | 53 ++ .../.ipynb_checkpoints/pretrain-checkpoint.sh | 54 ++ .../pretrain_baselines-checkpoint.sh | 9 + .../train_baselines-checkpoint.sh | 15 + .../train_baselines_phi_jia-checkpoint.sh | 18 + scripts/tiny_llava/LOGS/LOG.md | 561 ++++++++++++++++++ scripts/tiny_llava/LOGS/REPORT.md | 12 + scripts/tiny_llava/LOGS/organized_log.md | 215 +++++++ scripts/tiny_llava/docs/LOG.md | 0 .../eval/.ipynb_checkpoints/gqa-checkpoint.sh | 50 ++ .../.ipynb_checkpoints/gqa_v1-checkpoint.sh | 46 ++ .../mmbench_cn-checkpoint.sh | 24 + .../.ipynb_checkpoints/pope-checkpoint.sh | 27 + .../.ipynb_checkpoints/pope_v1-checkpoint.sh | 25 + .../eval/.ipynb_checkpoints/sqa-checkpoint.sh | 29 + .../.ipynb_checkpoints/sqa_v1-checkpoint.sh | 28 + .../.ipynb_checkpoints/textvqa-checkpoint.sh | 25 + .../textvqa_v1-checkpoint.sh | 24 + .../.ipynb_checkpoints/vizwiz-checkpoint.sh | 29 + .../vizwiz_v1-checkpoint.sh | 28 + .../.ipynb_checkpoints/vqav2-checkpoint.sh | 56 ++ scripts/tiny_llava/eval/gqa.sh | 53 ++ scripts/tiny_llava/eval/gqa_v1.sh | 46 ++ scripts/tiny_llava/eval/imagenet.sh | 25 + scripts/tiny_llava/eval/mmbench.sh | 23 + scripts/tiny_llava/eval/mmbench_cn.sh | 24 + scripts/tiny_llava/eval/mme.sh | 27 + scripts/tiny_llava/eval/mmvet.sh | 20 + scripts/tiny_llava/eval/pope.sh | 18 + scripts/tiny_llava/eval/pope_v1.sh | 25 + scripts/tiny_llava/eval/sqa.sh | 31 + scripts/tiny_llava/eval/sqa_v1.sh | 28 + scripts/tiny_llava/eval/textvqa.sh | 30 + scripts/tiny_llava/eval/textvqa_v1.sh | 24 + scripts/tiny_llava/eval/vizwiz.sh | 24 + scripts/tiny_llava/eval/vizwiz_v1.sh | 28 + scripts/tiny_llava/eval/vqav2.sh | 55 ++ scripts/tiny_llava/finetune.sh | 56 ++ scripts/tiny_llava/finetune_lora.sh | 55 ++ scripts/tiny_llava/finetune_lora_llm_open.sh | 0 scripts/tiny_llava/finetune_lora_type3.sh | 54 ++ scripts/tiny_llava/finetune_resamplers.sh | 0 scripts/tiny_llava/finetune_type4.sh | 55 ++ scripts/tiny_llava/pretrain.sh | 55 ++ scripts/tiny_llava/pretrain_baselines.sh | 9 + scripts/tiny_llava/pretrain_llm_open.sh | 0 scripts/tiny_llava/pretrain_type3.sh | 54 ++ scripts/tiny_llava/pretrain_type4.sh | 54 ++ scripts/tiny_llava/train_baselines.sh | 31 + scripts/tiny_llava/train_baselines_phi_jia.sh | 18 + scripts/zero2.json | 23 + scripts/zero3.json | 28 + scripts/zero3_offload.json | 56 ++ 74 files changed, 3523 insertions(+) create mode 100644 scripts/.ipynb_checkpoints/zero3-checkpoint.json create mode 100644 scripts/convert_gqa_for_eval.py create mode 100644 
scripts/convert_mmbench_for_submission.py create mode 100644 scripts/convert_mmvet_for_eval.py create mode 100644 scripts/convert_seed_for_submission.py create mode 100644 scripts/convert_sqa_to_llava.py create mode 100644 scripts/convert_sqa_to_llava_base_prompt.py create mode 100644 scripts/convert_vizwiz_for_submission.py create mode 100644 scripts/convert_vqav2_for_submission.py create mode 100644 scripts/extract_mm_projector.py create mode 100644 scripts/finetune.sh create mode 100644 scripts/finetune_full_schedule.sh create mode 100644 scripts/finetune_lora.sh create mode 100644 scripts/finetune_qlora.sh create mode 100644 scripts/finetune_sqa.sh create mode 100644 scripts/merge_lora_weights.py create mode 100644 scripts/pretrain.sh create mode 100644 scripts/pretrain_xformers.sh create mode 100644 scripts/sqa_eval_batch.sh create mode 100644 scripts/sqa_eval_gather.sh create mode 100644 scripts/tiny_llava/.ipynb_checkpoints/finetune-checkpoint.sh create mode 100644 scripts/tiny_llava/.ipynb_checkpoints/finetune_lora-checkpoint.sh create mode 100755 scripts/tiny_llava/.ipynb_checkpoints/pretrain-checkpoint.sh create mode 100644 scripts/tiny_llava/.ipynb_checkpoints/pretrain_baselines-checkpoint.sh create mode 100644 scripts/tiny_llava/.ipynb_checkpoints/train_baselines-checkpoint.sh create mode 100644 scripts/tiny_llava/.ipynb_checkpoints/train_baselines_phi_jia-checkpoint.sh create mode 100644 scripts/tiny_llava/LOGS/LOG.md create mode 100644 scripts/tiny_llava/LOGS/REPORT.md create mode 100644 scripts/tiny_llava/LOGS/organized_log.md create mode 100644 scripts/tiny_llava/docs/LOG.md create mode 100644 scripts/tiny_llava/eval/.ipynb_checkpoints/gqa-checkpoint.sh create mode 100644 scripts/tiny_llava/eval/.ipynb_checkpoints/gqa_v1-checkpoint.sh create mode 100644 scripts/tiny_llava/eval/.ipynb_checkpoints/mmbench_cn-checkpoint.sh create mode 100644 scripts/tiny_llava/eval/.ipynb_checkpoints/pope-checkpoint.sh create mode 100644 scripts/tiny_llava/eval/.ipynb_checkpoints/pope_v1-checkpoint.sh create mode 100644 scripts/tiny_llava/eval/.ipynb_checkpoints/sqa-checkpoint.sh create mode 100644 scripts/tiny_llava/eval/.ipynb_checkpoints/sqa_v1-checkpoint.sh create mode 100644 scripts/tiny_llava/eval/.ipynb_checkpoints/textvqa-checkpoint.sh create mode 100644 scripts/tiny_llava/eval/.ipynb_checkpoints/textvqa_v1-checkpoint.sh create mode 100644 scripts/tiny_llava/eval/.ipynb_checkpoints/vizwiz-checkpoint.sh create mode 100644 scripts/tiny_llava/eval/.ipynb_checkpoints/vizwiz_v1-checkpoint.sh create mode 100644 scripts/tiny_llava/eval/.ipynb_checkpoints/vqav2-checkpoint.sh create mode 100644 scripts/tiny_llava/eval/gqa.sh create mode 100644 scripts/tiny_llava/eval/gqa_v1.sh create mode 100644 scripts/tiny_llava/eval/imagenet.sh create mode 100644 scripts/tiny_llava/eval/mmbench.sh create mode 100644 scripts/tiny_llava/eval/mmbench_cn.sh create mode 100644 scripts/tiny_llava/eval/mme.sh create mode 100644 scripts/tiny_llava/eval/mmvet.sh create mode 100644 scripts/tiny_llava/eval/pope.sh create mode 100644 scripts/tiny_llava/eval/pope_v1.sh create mode 100644 scripts/tiny_llava/eval/sqa.sh create mode 100644 scripts/tiny_llava/eval/sqa_v1.sh create mode 100644 scripts/tiny_llava/eval/textvqa.sh create mode 100644 scripts/tiny_llava/eval/textvqa_v1.sh create mode 100644 scripts/tiny_llava/eval/vizwiz.sh create mode 100644 scripts/tiny_llava/eval/vizwiz_v1.sh create mode 100644 scripts/tiny_llava/eval/vqav2.sh create mode 100644 scripts/tiny_llava/finetune.sh create mode 100644 
scripts/tiny_llava/finetune_lora.sh create mode 100644 scripts/tiny_llava/finetune_lora_llm_open.sh create mode 100644 scripts/tiny_llava/finetune_lora_type3.sh create mode 100644 scripts/tiny_llava/finetune_resamplers.sh create mode 100644 scripts/tiny_llava/finetune_type4.sh create mode 100755 scripts/tiny_llava/pretrain.sh create mode 100644 scripts/tiny_llava/pretrain_baselines.sh create mode 100644 scripts/tiny_llava/pretrain_llm_open.sh create mode 100644 scripts/tiny_llava/pretrain_type3.sh create mode 100644 scripts/tiny_llava/pretrain_type4.sh create mode 100644 scripts/tiny_llava/train_baselines.sh create mode 100644 scripts/tiny_llava/train_baselines_phi_jia.sh create mode 100644 scripts/zero2.json create mode 100644 scripts/zero3.json create mode 100644 scripts/zero3_offload.json diff --git a/scripts/.ipynb_checkpoints/zero3-checkpoint.json b/scripts/.ipynb_checkpoints/zero3-checkpoint.json new file mode 100644 index 0000000..8ff461f --- /dev/null +++ b/scripts/.ipynb_checkpoints/zero3-checkpoint.json @@ -0,0 +1,28 @@ +{ + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "train_micro_batch_size_per_gpu": "auto", + "train_batch_size": "auto", + "gradient_accumulation_steps": "auto", + "zero_optimization": { + "stage": 3, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1e9, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_gather_16bit_weights_on_model_save": true + } +} \ No newline at end of file diff --git a/scripts/convert_gqa_for_eval.py b/scripts/convert_gqa_for_eval.py new file mode 100644 index 0000000..18f2a8e --- /dev/null +++ b/scripts/convert_gqa_for_eval.py @@ -0,0 +1,18 @@ +import os +import json +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument("--src", type=str) +parser.add_argument("--dst", type=str) +args = parser.parse_args() + +all_answers = [] +for line_idx, line in enumerate(open(args.src)): + res = json.loads(line) + question_id = res['question_id'] + text = res['text'].rstrip('.').lower() + all_answers.append({"questionId": question_id, "prediction": text}) + +with open(args.dst, 'w') as f: + json.dump(all_answers, f) diff --git a/scripts/convert_mmbench_for_submission.py b/scripts/convert_mmbench_for_submission.py new file mode 100644 index 0000000..fd4b673 --- /dev/null +++ b/scripts/convert_mmbench_for_submission.py @@ -0,0 +1,27 @@ +import os +import json +import argparse +import pandas as pd + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--annotation-file", type=str, required=True) + parser.add_argument("--result-dir", type=str, required=True) + parser.add_argument("--upload-dir", type=str, required=True) + parser.add_argument("--experiment", type=str, required=True) + + return parser.parse_args() + +if __name__ == "__main__": + args = get_args() + + df = pd.read_table(args.annotation_file) + + cur_df = df.copy() + cur_df = cur_df.drop(columns=['hint', 'category', 'source', 'image', 'comment', 'l2-category']) + cur_df.insert(6, 'prediction', None) + for pred in open(os.path.join(args.result_dir, f"{args.experiment}.jsonl")): + pred = json.loads(pred) + cur_df.loc[df['index'] == pred['question_id'], 'prediction'] = pred['text'] + + 
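+    # Write the trimmed table, now carrying the filled-in 'prediction' column,
+    # to an .xlsx named after the experiment for upload to the MMBench server.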
cur_df.to_excel(os.path.join(args.upload_dir, f"{args.experiment}.xlsx"), index=False, engine='openpyxl') diff --git a/scripts/convert_mmvet_for_eval.py b/scripts/convert_mmvet_for_eval.py new file mode 100644 index 0000000..9afaa39 --- /dev/null +++ b/scripts/convert_mmvet_for_eval.py @@ -0,0 +1,18 @@ +import os +import json +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument("--src", type=str) +parser.add_argument("--dst", type=str) +args = parser.parse_args() + +cur_result = {} + +for line in open(args.src): + data = json.loads(line) + qid = data['question_id'] + cur_result[f'v1_{qid}'] = data['text'] + +with open(args.dst, 'w') as f: + json.dump(cur_result, f, indent=2) diff --git a/scripts/convert_seed_for_submission.py b/scripts/convert_seed_for_submission.py new file mode 100644 index 0000000..1a87f43 --- /dev/null +++ b/scripts/convert_seed_for_submission.py @@ -0,0 +1,74 @@ +import os +import json +import argparse + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--annotation-file", type=str) + parser.add_argument("--result-file", type=str) + parser.add_argument("--result-upload-file", type=str) + return parser.parse_args() + + +def eval_single(result_file, eval_only_type=None): + results = {} + for line in open(result_file): + row = json.loads(line) + results[row['question_id']] = row + + type_counts = {} + correct_counts = {} + for question_data in data['questions']: + if eval_only_type is not None and question_data['data_type'] != eval_only_type: continue + data_type = question_data['question_type_id'] + type_counts[data_type] = type_counts.get(data_type, 0) + 1 + try: + question_id = int(question_data['question_id']) + except: + question_id = question_data['question_id'] + if question_id not in results: + correct_counts[data_type] = correct_counts.get(data_type, 0) + continue + row = results[question_id] + if row['text'] == question_data['answer']: + correct_counts[data_type] = correct_counts.get(data_type, 0) + 1 + + total_count = 0 + total_correct = 0 + for data_type in sorted(type_counts.keys()): + accuracy = correct_counts[data_type] / type_counts[data_type] * 100 + if eval_only_type is None: + print(f"{ques_type_id_to_name[data_type]}: {accuracy:.2f}%") + + total_count += type_counts[data_type] + total_correct += correct_counts[data_type] + + total_accuracy = total_correct / total_count * 100 + if eval_only_type is None: + print(f"Total accuracy: {total_accuracy:.2f}%") + else: + print(f"{eval_only_type} accuracy: {total_accuracy:.2f}%") + + return results + +if __name__ == "__main__": + args = get_args() + data = json.load(open(args.annotation_file)) + ques_type_id_to_name = {id:n for n,id in data['question_type'].items()} + + results = eval_single(args.result_file) + eval_single(args.result_file, eval_only_type='image') + eval_single(args.result_file, eval_only_type='video') + + with open(args.result_upload_file, 'w') as fp: + for question in data['questions']: + qid = question['question_id'] + if qid in results: + result = results[qid] + else: + result = results[int(qid)] + fp.write(json.dumps({ + 'question_id': qid, + 'prediction': result['text'] + }) + '\n') diff --git a/scripts/convert_sqa_to_llava.py b/scripts/convert_sqa_to_llava.py new file mode 100644 index 0000000..4c9a756 --- /dev/null +++ b/scripts/convert_sqa_to_llava.py @@ -0,0 +1,88 @@ +import json +import os +import fire +import re +from convert_sqa_to_llava_base_prompt import build_prompt_chatbot + + +def convert_to_llava(base_dir, split, 
prompt_format="QCM-LEA"): + split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[split] + problems = json.load(open(os.path.join(base_dir, "problems.json"))) + + split_problems = build_prompt_chatbot( + problems, split_indices, prompt_format, + use_caption=False, is_test=False) + + target_format = [] + for prob_id, (input, output) in split_problems.items(): + if input.startswith('Question: '): + input = input.replace('Question: ', '') + if output.startswith('Answer: '): + output = output.replace('Answer: ', '') + + raw_prob_data = problems[prob_id] + if raw_prob_data['image'] is None: + target_format.append({ + "id": prob_id, + "conversations": [ + {'from': 'human', 'value': f"{input}"}, + {'from': 'gpt', 'value': f"{output}"}, + ], + }) + + else: + target_format.append({ + "id": prob_id, + "image": os.path.join(prob_id, raw_prob_data['image']), + "conversations": [ + {'from': 'human', 'value': f"{input}\n"}, + {'from': 'gpt', 'value': f"{output}"}, + ], + }) + + print(f'Number of samples: {len(target_format)}') + + with open(os.path.join(base_dir, f"llava_{split}_{prompt_format}.json"), "w") as f: + json.dump(target_format, f, indent=2) + + +def convert_to_jsonl(base_dir, split, prompt_format="QCM-LEPA"): + split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[split] + problems = json.load(open(os.path.join(base_dir, "problems.json"))) + + split_problems = build_prompt_chatbot( + problems, split_indices, prompt_format, + use_caption=False, is_test=False) + + writer = open(os.path.join(base_dir, f"scienceqa_{split}_{prompt_format}.jsonl"), "w") + for prob_id, (input, output) in split_problems.items(): + if input.startswith('Question: '): + input = input.replace('Question: ', '') + if output.startswith('Answer: '): + output = output.replace('Answer: ', '') + + raw_prob_data = problems[prob_id] + if raw_prob_data['image'] is None: + data = { + "id": prob_id, + "instruction": f"{input}", + "output": f"{output}", + } + + else: + data = { + "id": prob_id, + "image": os.path.join(prob_id, raw_prob_data['image']), + "instruction": f"{input}\n", + "output": f"{output}", + } + writer.write(json.dumps(data) + '\n') + writer.close() + + +def main(task, **kwargs): + globals()[task](**kwargs) + + +if __name__ == "__main__": + fire.Fire(main) diff --git a/scripts/convert_sqa_to_llava_base_prompt.py b/scripts/convert_sqa_to_llava_base_prompt.py new file mode 100644 index 0000000..cd7d0ee --- /dev/null +++ b/scripts/convert_sqa_to_llava_base_prompt.py @@ -0,0 +1,334 @@ +def get_question_text(problem): + question = problem['question'] + return question + + +def get_context_text(problem, use_caption): + txt_context = problem['hint'] + img_context = problem['caption'] if use_caption else "" + context = " ".join([txt_context, img_context]).strip() + if context == "": + context = "N/A" + return context + + +def get_choice_text(probelm, options): + choices = probelm['choices'] + choice_list = [] + for i, c in enumerate(choices): + choice_list.append("({}) {}".format(options[i], c)) + choice_txt = " ".join(choice_list) + #print(choice_txt) + return choice_txt + + +def get_answer(problem, options): + return options[problem['answer']] + + +def get_lecture_text(problem): + # \\n: GPT-3 can generate the lecture with more tokens. 
+ lecture = problem['lecture'].replace("\n", "\\n") + return lecture + + +def get_solution_text(problem): + # \\n: GPT-3 can generate the solution with more tokens + solution = problem['solution'].replace("\n", "\\n") + return solution + + +def create_one_example_chatbot(format, question, context, choice, answer, lecture, solution, test_example=True): + + input_format, output_format = format.split("-") + + ## Inputs + if input_format == "CQM": + input = f"Context: {context}\nQuestion: {question}\nOptions: {choice}\n" + elif input_format == "QCM": + input = f"Question: {question}\nContext: {context}\nOptions: {choice}\n" + # upper bound experiment + elif input_format == "QCML": + input = f"Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {lecture}\n" + elif input_format == "QCME": + input = f"Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {solution}\n" + elif input_format == "QCMLE": + input = f"Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {lecture} {solution}\n" + + elif input_format == "QCLM": + input = f"Question: {question}\nContext: {context}\nBECAUSE: {lecture}\nOptions: {choice}\n" + elif input_format == "QCEM": + input = f"Question: {question}\nContext: {context}\nBECAUSE: {solution}\nOptions: {choice}\n" + elif input_format == "QCLEM": + input = f"Question: {question}\nContext: {context}\nBECAUSE: {lecture} {solution}\nOptions: {choice}\n" + + # Outputs + if test_example: + output = "Answer:" + elif output_format == 'A': + output = f"Answer: The answer is {answer}." + + elif output_format == 'AL': + output = f"Answer: The answer is {answer}. BECAUSE: {solution}" + elif output_format == 'AE': + output = f"Answer: The answer is {answer}. BECAUSE: {lecture}" + elif output_format == 'ALE': + output = f"Answer: The answer is {answer}. BECAUSE: {lecture} {solution}" + elif output_format == 'AEL': + output = f"Answer: The answer is {answer}. BECAUSE: {solution} {lecture}" + + elif output_format == 'LA': + output = f"Answer: {lecture} The answer is {answer}." + elif output_format == 'EA': + output = f"Answer: {solution} The answer is {answer}." + elif output_format == 'LEA': + output = f"Answer: {lecture} {solution} The answer is {answer}." + elif output_format == 'ELA': + output = f"Answer: {solution} {lecture} The answer is {answer}." + elif output_format == 'LEPA': + output = '' + if len(lecture.strip()) > 0: + output += f"LECTURE: {lecture}\n" + if len(solution.strip()) > 0: + output += f"SOLUTION: {solution}\n" + output += '###\n' + output += f"ANSWER: {answer}." 
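+    # Example shape (hypothetical values) for prompt_format="QCM-LEA":
+    #   input  -> "Question: ...\nContext: ...\nOptions: (A) ... (B) ...\n"
+    #   output -> "Answer: <lecture> <solution> The answer is A."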
+ + input = input.replace(" ", " ").strip() + output = output.replace(" ", " ").strip() + if input.endswith("BECAUSE:"): + input = input.replace("BECAUSE:", "").strip() + if output.endswith("BECAUSE:"): + output = output.replace("BECAUSE:", "").strip() + return input, output + + +def create_one_example(format, question, context, choice, answer, lecture, solution, test_example=True): + + input_format, output_format = format.split("-") + + ## Inputs + if input_format == "CQM": + input = f"Context: {context}\nQuestion: {question}\nOptions: {choice}\n" + elif input_format == "QCM": + input = f"Question: {question}\nContext: {context}\nOptions: {choice}\n" + # upper bound experiment + elif input_format == "QCML": + input = f"Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {lecture}\n" + elif input_format == "QCME": + input = f"Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {solution}\n" + elif input_format == "QCMLE": + input = f"Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {lecture} {solution}\n" + + elif input_format == "QCLM": + input = f"Question: {question}\nContext: {context}\nBECAUSE: {lecture}\nOptions: {choice}\n" + elif input_format == "QCEM": + input = f"Question: {question}\nContext: {context}\nBECAUSE: {solution}\nOptions: {choice}\n" + elif input_format == "QCLEM": + input = f"Question: {question}\nContext: {context}\nBECAUSE: {lecture} {solution}\nOptions: {choice}\n" + + # Outputs + if test_example: + output = "Answer:" + elif output_format == 'A': + output = f"Answer: The answer is {answer}." + + elif output_format == 'AL': + output = f"Answer: The answer is {answer}. BECAUSE: {solution}" + elif output_format == 'AE': + output = f"Answer: The answer is {answer}. BECAUSE: {lecture}" + elif output_format == 'ALE': + output = f"Answer: The answer is {answer}. BECAUSE: {lecture} {solution}" + elif output_format == 'AEL': + output = f"Answer: The answer is {answer}. BECAUSE: {solution} {lecture}" + + elif output_format == 'LA': + output = f"Answer: {lecture} The answer is {answer}." + elif output_format == 'EA': + output = f"Answer: {solution} The answer is {answer}." + elif output_format == 'LEA': + output = f"Answer: {lecture} {solution} The answer is {answer}." + elif output_format == 'ELA': + output = f"Answer: {solution} {lecture} The answer is {answer}." 
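+    # Unlike the chatbot variant above, this function returns a single string:
+    # the formatted question block immediately followed by the answer text.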
+ + text = input + output + text = text.replace(" ", " ").strip() + if text.endswith("BECAUSE:"): + text = text.replace("BECAUSE:", "").strip() + return text + + + +def create_one_example_gpt4(format, question, context, choice, answer, lecture, solution, test_example=True): + + input_format, output_format = format.split("-") + + ## Inputs + if input_format == "CQM": + input = f"Context: {context}\nQuestion: {question}\nOptions: {choice}\n" + elif input_format == "QCM": + input = f"Question: {question}\nContext: {context}\nOptions: {choice}\n" + # upper bound experiment + elif input_format == "QCML": + input = f"Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {lecture}\n" + elif input_format == "QCME": + input = f"Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {solution}\n" + elif input_format == "QCMLE": + input = f"Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {lecture} {solution}\n" + + elif input_format == "QCLM": + input = f"Question: {question}\nContext: {context}\nBECAUSE: {lecture}\nOptions: {choice}\n" + elif input_format == "QCEM": + input = f"Question: {question}\nContext: {context}\nBECAUSE: {solution}\nOptions: {choice}\n" + elif input_format == "QCLEM": + input = f"Question: {question}\nContext: {context}\nBECAUSE: {lecture} {solution}\nOptions: {choice}\n" + + # Outputs + if test_example: + output = "Answer:" + elif output_format == 'A': + output = f"Answer: The answer is {answer}." + + elif output_format == 'AL': + output = f"Answer: The answer is {answer}. BECAUSE: {solution}" + elif output_format == 'AE': + output = f"Answer: The answer is {answer}. BECAUSE: {lecture}" + elif output_format == 'ALE': + output = f"Answer: The answer is {answer}. BECAUSE: {lecture} {solution}" + elif output_format == 'AEL': + output = f"Answer: The answer is {answer}. BECAUSE: {solution} {lecture}" + + elif output_format == 'LA': + output = f"Answer: {lecture} The answer is {answer}." + elif output_format == 'EA': + output = f"Answer: {solution} The answer is {answer}." + elif output_format == 'LEA': + output = f"Answer: {lecture} {solution} The answer is {answer}." + elif output_format == 'ELA': + output = f"Answer: {solution} {lecture} The answer is {answer}." 
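+    # Here the same fields are returned as a (user, assistant) chat message
+    # pair rather than one concatenated string.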
+ + input = input.replace(" ", " ").strip() + output = output.replace(" ", " ").strip() + if output.endswith("BECAUSE:"): + output = output.replace("BECAUSE:", "").strip() + + user_prompt = {"role": "user", "content": f"Can you explain {input}?"} + assistant_prompt = {"role": "assistant", "content": f"{output}"} + + return user_prompt, assistant_prompt + + +def build_prompt_chatbot(problems, shot_qids, prompt_format, use_caption=False, options=["A", "B", "C", "D", "E"], is_test=False): + examples = {} + + for qid in shot_qids: + question = get_question_text(problems[qid]) + context = get_context_text(problems[qid], use_caption) + choice = get_choice_text(problems[qid], options) + answer = get_answer(problems[qid], options) + lecture = get_lecture_text(problems[qid]).replace('\\n', '\n') + solution = get_solution_text(problems[qid]).replace('\\n', '\n') + + train_example = create_one_example_chatbot(prompt_format, + question, + context, + choice, + answer, + lecture, + solution, + test_example=is_test) + examples[qid] = train_example + return examples + + +def build_prompt(problems, shot_qids, test_qid, args): + + examples = [] + + # n-shot training examples + for qid in shot_qids: + question = get_question_text(problems[qid]) + context = get_context_text(problems[qid], args.use_caption) + choice = get_choice_text(problems[qid], args.options) + answer = get_answer(problems[qid], args.options) + lecture = get_lecture_text(problems[qid]) + solution = get_solution_text(problems[qid]) + + train_example = create_one_example(args.prompt_format, + question, + context, + choice, + answer, + lecture, + solution, + test_example=False) + examples.append(train_example) + + # test example + question = get_question_text(problems[test_qid]) + context = get_context_text(problems[test_qid], args.use_caption) + choice = get_choice_text(problems[test_qid], args.options) + answer = get_answer(problems[test_qid], args.options) + lecture = get_lecture_text(problems[test_qid]) + solution = get_solution_text(problems[test_qid]) + + test_example = create_one_example(args.prompt_format, + question, + context, + choice, + answer, + lecture, + solution, + test_example=True) + examples.append(test_example) + + # create the prompt input + prompt_input = '\n\n'.join(examples) + + return prompt_input + + +def build_prompt_gpt4(problems, shot_qids, test_qid, args): + + prompt_array = [{"role": "system", "content": "You are a helpful assistant."}] + + # n-shot training examples + for qid in shot_qids: + question = get_question_text(problems[qid]) + context = get_context_text(problems[qid], args.use_caption) + choice = get_choice_text(problems[qid], args.options) + answer = get_answer(problems[qid], args.options) + lecture = get_lecture_text(problems[qid]) + solution = get_solution_text(problems[qid]) + + user_prompt, assistant_prompt = create_one_example_gpt4(args.prompt_format, + question, + context, + choice, + answer, + lecture, + solution, + test_example=False) + prompt_array.append(user_prompt) + prompt_array.append(assistant_prompt) + + # test example + question = get_question_text(problems[test_qid]) + context = get_context_text(problems[test_qid], args.use_caption) + choice = get_choice_text(problems[test_qid], args.options) + answer = get_answer(problems[test_qid], args.options) + lecture = get_lecture_text(problems[test_qid]) + solution = get_solution_text(problems[test_qid]) + + user_prompt, assistant_prompt = create_one_example_gpt4(args.prompt_format, + question, + context, + choice, + answer, + lecture, + 
solution, + test_example=True) + prompt_array.append(user_prompt) + prompt_array.append(assistant_prompt) + + return prompt_array \ No newline at end of file diff --git a/scripts/convert_vizwiz_for_submission.py b/scripts/convert_vizwiz_for_submission.py new file mode 100644 index 0000000..e43cd45 --- /dev/null +++ b/scripts/convert_vizwiz_for_submission.py @@ -0,0 +1,47 @@ +import os +import argparse +import json + +from tinyllava.eval.m4c_evaluator import EvalAIAnswerProcessor + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('--annotation-file', type=str, required=True) + parser.add_argument('--result-file', type=str, required=True) + parser.add_argument('--result-upload-file', type=str, required=True) + return parser.parse_args() + + +if __name__ == '__main__': + + args = parse_args() + + os.makedirs(os.path.dirname(args.result_upload_file), exist_ok=True) + + results = [] + error_line = 0 + for line_idx, line in enumerate(open(args.result_file)): + try: + results.append(json.loads(line)) + except: + error_line += 1 + results = {x['question_id']: x['text'] for x in results} + test_split = [json.loads(line) for line in open(args.annotation_file)] + split_ids = set([x['question_id'] for x in test_split]) + + print(f'total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}') + + all_answers = [] + + answer_processor = EvalAIAnswerProcessor() + + for x in test_split: + assert x['question_id'] in results + all_answers.append({ + 'image': x['image'], + 'answer': answer_processor(results[x['question_id']]) + }) + + with open(args.result_upload_file, 'w') as f: + json.dump(all_answers, f) diff --git a/scripts/convert_vqav2_for_submission.py b/scripts/convert_vqav2_for_submission.py new file mode 100644 index 0000000..cedd291 --- /dev/null +++ b/scripts/convert_vqav2_for_submission.py @@ -0,0 +1,56 @@ +import os +import argparse +import json + +from tinyllava.eval.m4c_evaluator import EvalAIAnswerProcessor + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('--dir', type=str, default="./playground/data/eval/vqav2") + parser.add_argument('--ckpt', type=str, required=True) + parser.add_argument('--split', type=str, required=True) + return parser.parse_args() + + +if __name__ == '__main__': + + args = parse_args() + + src = os.path.join(args.dir, 'answers', args.split, args.ckpt, 'merge.jsonl') + test_split = os.path.join(args.dir, 'llava_vqav2_mscoco_test2015.jsonl') + dst = os.path.join(args.dir, 'answers_upload', args.split, f'{args.ckpt}.json') + os.makedirs(os.path.dirname(dst), exist_ok=True) + + results = [] + error_line = 0 + for line_idx, line in enumerate(open(src)): + try: + results.append(json.loads(line)) + except: + error_line += 1 + + results = {x['question_id']: x['text'] for x in results} + test_split = [json.loads(line) for line in open(test_split)] + split_ids = set([x['question_id'] for x in test_split]) + + print(f'total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}') + + all_answers = [] + + answer_processor = EvalAIAnswerProcessor() + + for x in test_split: + if x['question_id'] not in results: + all_answers.append({ + 'question_id': x['question_id'], + 'answer': '' + }) + else: + all_answers.append({ + 'question_id': x['question_id'], + 'answer': answer_processor(results[x['question_id']]) + }) + + with open(dst, 'w') as f: + json.dump(all_answers, open(dst, 'w')) diff --git a/scripts/extract_mm_projector.py b/scripts/extract_mm_projector.py new 
file mode 100644 index 0000000..af7e6a6 --- /dev/null +++ b/scripts/extract_mm_projector.py @@ -0,0 +1,47 @@ +""" +This is just a utility that I use to extract the projector for quantized models. +It is NOT necessary at all to train, or run inference/serve demos. +Use this script ONLY if you fully understand its implications. +""" + + +import os +import argparse +import torch +import json +from collections import defaultdict + + +def parse_args(): + parser = argparse.ArgumentParser(description='Extract MMProjector weights') + parser.add_argument('--model-path', type=str, help='model folder') + parser.add_argument('--output', type=str, help='output file') + args = parser.parse_args() + return args + + +if __name__ == '__main__': + args = parse_args() + + keys_to_match = ['mm_projector'] + ckpt_to_key = defaultdict(list) + try: + model_indices = json.load(open(os.path.join(args.model_path, 'pytorch_model.bin.index.json'))) + for k, v in model_indices['weight_map'].items(): + if any(key_match in k for key_match in keys_to_match): + ckpt_to_key[v].append(k) + except FileNotFoundError: + # Smaller models or model checkpoints saved by DeepSpeed. + v = 'pytorch_model.bin' + for k in torch.load(os.path.join(args.model_path, v), map_location='cpu').keys(): + if any(key_match in k for key_match in keys_to_match): + ckpt_to_key[v].append(k) + + loaded_weights = {} + + for ckpt_name, weight_keys in ckpt_to_key.items(): + ckpt = torch.load(os.path.join(args.model_path, ckpt_name), map_location='cpu') + for k in weight_keys: + loaded_weights[k] = ckpt[k] + + torch.save(loaded_weights, args.output) diff --git a/scripts/finetune.sh b/scripts/finetune.sh new file mode 100644 index 0000000..c36c3ea --- /dev/null +++ b/scripts/finetune.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +# IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5! + +# Uncomment and set the following variables correspondingly to run this script: + +################## VICUNA ################## +# PROMPT_VERSION=v1 +# MODEL_VERSION="vicuna-v1-3-7b" +################## VICUNA ################## + +################## LLaMA-2 ################## +# PROMPT_VERSION="llava_llama_2" +# MODEL_VERSION="llama-2-7b-chat" +################## LLaMA-2 ################## + +deepspeed llava/train/train_mem.py \ + --deepspeed ./scripts/zero2.json \ + --model_name_or_path ./checkpoints/$MODEL_VERSION \ + --version $PROMPT_VERSION \ + --data_path ./playground/data/llava_instruct_80k.json \ + --image_folder /path/to/coco/train2017 \ + --vision_tower openai/clip-vit-large-patch14 \ + --pretrain_mm_mlp_adapter ./checkpoints/llava-$MODEL_VERSION-pretrain/mm_projector.bin \ + --mm_vision_select_layer -2 \ + --mm_use_im_start_end False \ + --mm_use_im_patch_token False \ + --bf16 True \ + --output_dir ./checkpoints/llava-$MODEL_VERSION-finetune \ + --num_train_epochs 1 \ + --per_device_train_batch_size 16 \ + --per_device_eval_batch_size 4 \ + --gradient_accumulation_steps 1 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 50000 \ + --save_total_limit 1 \ + --learning_rate 2e-5 \ + --weight_decay 0. 
\ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --tf32 True \ + --model_max_length 2048 \ + --gradient_checkpointing True \ + --dataloader_num_workers 4 \ + --lazy_preprocess True \ + --report_to wandb diff --git a/scripts/finetune_full_schedule.sh b/scripts/finetune_full_schedule.sh new file mode 100644 index 0000000..2ae157c --- /dev/null +++ b/scripts/finetune_full_schedule.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +# IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5! + +# Uncomment and set the following variables correspondingly to run this script: + +################## VICUNA ################## +# PROMPT_VERSION=v1 +# MODEL_VERSION="vicuna-v1-3-7b" +################## VICUNA ################## + +################## LLaMA-2 ################## +# PROMPT_VERSION="llava_llama_2" +# MODEL_VERSION="llama-2-7b-chat" +################## LLaMA-2 ################## + +deepspeed llava/train/train_mem.py \ + --deepspeed ./scripts/zero2.json \ + --model_name_or_path ./checkpoints/$MODEL_VERSION \ + --version $PROMPT_VERSION \ + --data_path ./playground/data/llava_instruct_158k.json \ + --image_folder /path/to/coco/train2017 \ + --vision_tower openai/clip-vit-large-patch14 \ + --pretrain_mm_mlp_adapter ./checkpoints/llava-$MODEL_VERSION-pretrain/mm_projector.bin \ + --mm_vision_select_layer -2 \ + --mm_use_im_start_end False \ + --mm_use_im_patch_token False \ + --bf16 True \ + --output_dir ./checkpoints/llava-$MODEL_VERSION-finetune \ + --num_train_epochs 3 \ + --per_device_train_batch_size 16 \ + --per_device_eval_batch_size 4 \ + --gradient_accumulation_steps 1 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 50000 \ + --save_total_limit 1 \ + --learning_rate 2e-5 \ + --weight_decay 0. \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --tf32 True \ + --model_max_length 2048 \ + --gradient_checkpointing True \ + --dataloader_num_workers 4 \ + --lazy_preprocess True \ + --report_to wandb diff --git a/scripts/finetune_lora.sh b/scripts/finetune_lora.sh new file mode 100644 index 0000000..0456106 --- /dev/null +++ b/scripts/finetune_lora.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +# IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5! 
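+# Note: after LoRA fine-tuning, the adapter can be merged back into the base
+# model with scripts/merge_lora_weights.py (added in this patch), e.g. with
+# hypothetical output paths:
+#   python scripts/merge_lora_weights.py --model-path ./checkpoints/llava-$MODEL_VERSION-finetune_lora \
+#     --model-base ./checkpoints/$MODEL_VERSION --save-model-path ./checkpoints/llava-$MODEL_VERSION-merged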
+ +# Uncomment and set the following variables correspondingly to run this script: + +################## VICUNA ################## +# PROMPT_VERSION=v1 +# MODEL_VERSION="vicuna-v1-3-7b" +################## VICUNA ################## + +################## LLaMA-2 ################## +# PROMPT_VERSION="llava_llama_2" +# MODEL_VERSION="llama-2-7b-chat" +################## LLaMA-2 ################## + +deepspeed llava/train/train_mem.py \ + --deepspeed ./scripts/zero2.json \ + --lora_enable True \ + --model_name_or_path ./checkpoints/$MODEL_VERSION \ + --version $PROMPT_VERSION \ + --data_path ./playground/data/llava_instruct_80k.json \ + --image_folder /path/to/coco/train2017 \ + --vision_tower openai/clip-vit-large-patch14 \ + --pretrain_mm_mlp_adapter ./checkpoints/llava-$MODEL_VERSION-pretrain/mm_projector.bin \ + --mm_vision_select_layer -2 \ + --mm_use_im_start_end False \ + --mm_use_im_patch_token False \ + --bf16 True \ + --output_dir ./checkpoints/llava-$MODEL_VERSION-finetune_lora \ + --num_train_epochs 1 \ + --per_device_train_batch_size 16 \ + --per_device_eval_batch_size 4 \ + --gradient_accumulation_steps 1 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 50000 \ + --save_total_limit 1 \ + --learning_rate 2e-5 \ + --weight_decay 0. \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --tf32 True \ + --model_max_length 2048 \ + --gradient_checkpointing True \ + --lazy_preprocess True \ + --dataloader_num_workers 4 \ + --report_to wandb diff --git a/scripts/finetune_qlora.sh b/scripts/finetune_qlora.sh new file mode 100644 index 0000000..05744d9 --- /dev/null +++ b/scripts/finetune_qlora.sh @@ -0,0 +1,50 @@ +#!/bin/bash + +# IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5! + +# Uncomment and set the following variables correspondingly to run this script: + +################## VICUNA ################## +# PROMPT_VERSION=v1 +# MODEL_VERSION="vicuna-v1-3-7b" +################## VICUNA ################## + +################## LLaMA-2 ################## +# PROMPT_VERSION="llava_llama_2" +# MODEL_VERSION="llama-2-7b-chat" +################## LLaMA-2 ################## + +deepspeed llava/train/train_mem.py \ + --deepspeed ./scripts/zero2.json \ + --lora_enable True \ + --bits 4 \ + --model_name_or_path ./checkpoints/$MODEL_VERSION \ + --version $PROMPT_VERSION \ + --data_path ./playground/data/llava_instruct_80k.json \ + --image_folder /path/to/coco/train2017 \ + --vision_tower openai/clip-vit-large-patch14 \ + --pretrain_mm_mlp_adapter ./checkpoints/llava-$MODEL_VERSION-pretrain/mm_projector.bin \ + --mm_vision_select_layer -2 \ + --mm_use_im_start_end False \ + --mm_use_im_patch_token False \ + --bf16 True \ + --output_dir ./checkpoints/llava-$MODEL_VERSION-finetune_lora \ + --num_train_epochs 1 \ + --per_device_train_batch_size 16 \ + --per_device_eval_batch_size 4 \ + --gradient_accumulation_steps 1 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 50000 \ + --save_total_limit 1 \ + --learning_rate 2e-5 \ + --weight_decay 0. 
\ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --tf32 True \ + --model_max_length 2048 \ + --gradient_checkpointing True \ + --lazy_preprocess True \ + --dataloader_num_workers 4 \ + --report_to wandb diff --git a/scripts/finetune_sqa.sh b/scripts/finetune_sqa.sh new file mode 100644 index 0000000..146a8cb --- /dev/null +++ b/scripts/finetune_sqa.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +# IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5! + +deepspeed llava/train/train_mem.py \ + --deepspeed ./scripts/zero2.json \ + --model_name_or_path lmsys/vicuna-13b-v1.3 \ + --version $PROMPT_VERSION \ + --data_path /Data/ScienceQA/data/scienceqa/llava_train_QCM-LEA.json \ + --image_folder /Data/ScienceQA/data/scienceqa/images/train \ + --vision_tower openai/clip-vit-large-patch14 \ + --pretrain_mm_mlp_adapter ./checkpoints/huggingface/liuhaotian/llava-pretrain-vicuna-13b-v1.3/mm_projector.bin \ + --mm_vision_select_layer -2 \ + --mm_use_im_start_end False \ + --mm_use_im_patch_token False \ + --bf16 True \ + --output_dir ./checkpoints/llava-vicuna-13b-v1.3-pretrain_lcs558k_plain-ScienceQA_QCM_LEA-12e \ + --num_train_epochs 12 \ + --per_device_train_batch_size 16 \ + --per_device_eval_batch_size 4 \ + --gradient_accumulation_steps 1 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 50000 \ + --save_total_limit 1 \ + --learning_rate 2e-5 \ + --weight_decay 0. \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --tf32 True \ + --model_max_length 2048 \ + --gradient_checkpointing True \ + --dataloader_num_workers 4 \ + --lazy_preprocess True \ + --report_to wandb diff --git a/scripts/merge_lora_weights.py b/scripts/merge_lora_weights.py new file mode 100644 index 0000000..b97d8aa --- /dev/null +++ b/scripts/merge_lora_weights.py @@ -0,0 +1,22 @@ +import argparse +from tinyllava.model.builder import load_pretrained_model +from tinyllava.mm_utils import get_model_name_from_path + + +def merge_lora(args): + model_name = get_model_name_from_path(args.model_path) + tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, device_map='cpu') + + model.save_pretrained(args.save_model_path) + tokenizer.save_pretrained(args.save_model_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--model-path", type=str, required=True) + parser.add_argument("--model-base", type=str, required=True) + parser.add_argument("--save-model-path", type=str, required=True) + + args = parser.parse_args() + + merge_lora(args) diff --git a/scripts/pretrain.sh b/scripts/pretrain.sh new file mode 100644 index 0000000..cb70599 --- /dev/null +++ b/scripts/pretrain.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +# IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5! 
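+# Note: this stage tunes only the multimodal projector (--tune_mm_mlp_adapter True);
+# the resulting mm_projector.bin under the output_dir is what the finetune
+# scripts load via --pretrain_mm_mlp_adapter.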
+ +# Uncomment and set the following variables correspondingly to run this script: + +# MODEL_VERSION=vicuna-v1-3-7b +# MODEL_VERSION=llama-2-7b-chat + +########### DO NOT CHANGE ########### +########### USE THIS FOR BOTH ########### +PROMPT_VERSION=plain +########### DO NOT CHANGE ########### + +deepspeed llava/train/train_mem.py \ + --deepspeed ./scripts/zero2.json \ + --model_name_or_path ./checkpoints/$MODEL_VERSION \ + --version $PROMPT_VERSION \ + --data_path /path/to/pretrain_data.json \ + --image_folder /path/to/images \ + --vision_tower openai/clip-vit-large-patch14 \ + --tune_mm_mlp_adapter True \ + --mm_vision_select_layer -2 \ + --mm_use_im_start_end False \ + --mm_use_im_patch_token False \ + --bf16 True \ + --output_dir ./checkpoints/llava-$MODEL_VERSION-pretrain \ + --num_train_epochs 1 \ + --per_device_train_batch_size 16 \ + --per_device_eval_batch_size 4 \ + --gradient_accumulation_steps 1 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 24000 \ + --save_total_limit 1 \ + --learning_rate 2e-3 \ + --weight_decay 0. \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --tf32 True \ + --model_max_length 2048 \ + --gradient_checkpointing True \ + --dataloader_num_workers 4 \ + --lazy_preprocess True \ + --report_to wandb diff --git a/scripts/pretrain_xformers.sh b/scripts/pretrain_xformers.sh new file mode 100644 index 0000000..17c6fa4 --- /dev/null +++ b/scripts/pretrain_xformers.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +# Uncomment and set the following variables correspondingly to run this script: + +# MODEL_VERSION=vicuna-v1-3-7b +# MODEL_VERSION=llama-2-7b-chat + +########### DO NOT CHANGE ########### +########### USE THIS FOR BOTH ########### +PROMPT_VERSION=plain +########### DO NOT CHANGE ########### + +deepspeed llava/train/train_xformers.py \ + --deepspeed ./scripts/zero2.json \ + --model_name_or_path ./checkpoints/$MODEL_VERSION \ + --version $PROMPT_VERSION \ + --data_path /path/to/pretrain_data.json \ + --image_folder /path/to/images \ + --vision_tower openai/clip-vit-large-patch14 \ + --tune_mm_mlp_adapter True \ + --mm_vision_select_layer -2 \ + --mm_use_im_start_end False \ + --mm_use_im_patch_token False \ + --bf16 False \ + --output_dir ./checkpoints/llava-$MODEL_VERSION-pretrain \ + --num_train_epochs 1 \ + --per_device_train_batch_size 4 \ + --per_device_eval_batch_size 4 \ + --gradient_accumulation_steps 4 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 24000 \ + --save_total_limit 1 \ + --learning_rate 2e-3 \ + --weight_decay 0. 
\ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --tf32 False \ + --model_max_length 2048 \ + --gradient_checkpointing True \ + --dataloader_num_workers 4 \ + --lazy_preprocess True \ + --report_to wandb diff --git a/scripts/sqa_eval_batch.sh b/scripts/sqa_eval_batch.sh new file mode 100644 index 0000000..ad857ae --- /dev/null +++ b/scripts/sqa_eval_batch.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +CHUNKS=8 +for IDX in {0..7}; do + CUDA_VISIBLE_DEVICES=$IDX python -m llava.eval.model_vqa_science \ + --model-path liuhaotian/llava-lcs558k-scienceqa-vicuna-13b-v1.3 \ + --question-file ~/haotian/datasets/ScienceQA/data/scienceqa/llava_test_QCM-LEA.json \ + --image-folder ~/haotian/datasets/ScienceQA/data/scienceqa/images/test \ + --answers-file ./test_llava-13b-chunk$CHUNKS_$IDX.jsonl \ + --num-chunks $CHUNKS \ + --chunk-idx $IDX \ + --conv-mode llava_v1 & +done diff --git a/scripts/sqa_eval_gather.sh b/scripts/sqa_eval_gather.sh new file mode 100644 index 0000000..d44904d --- /dev/null +++ b/scripts/sqa_eval_gather.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +CHUNKS=8 +output_file="test_llava-13b.jsonl" + +# Clear out the output file if it exists. +> "$output_file" + +# Loop through the indices and concatenate each file. +for idx in $(seq 0 $((CHUNKS-1))); do + cat "./test_llava-13b-chunk${idx}.jsonl" >> "$output_file" +done + +python llava/eval/eval_science_qa.py \ + --base-dir ~/haotian/datasets/ScienceQA/data/scienceqa \ + --result-file ./test_llava-13b.jsonl \ + --output-file ./test_llava-13b_output.json \ + --output-result ./test_llava-13b_result.json diff --git a/scripts/tiny_llava/.ipynb_checkpoints/finetune-checkpoint.sh b/scripts/tiny_llava/.ipynb_checkpoints/finetune-checkpoint.sh new file mode 100644 index 0000000..9d68242 --- /dev/null +++ b/scripts/tiny_llava/.ipynb_checkpoints/finetune-checkpoint.sh @@ -0,0 +1,55 @@ +#!/bin/bash +if [ $# -ne 4 ]; then + echo "Usage: $0 " + exit 1 +fi + +# Assign the arguments to variables +LLM_VERSION="$1" +VT_VERSION="$2" +DATA_PATH="$3" +IMAGE_PATH="$4" + +# LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0 +# VT_VERSION=openai/clip-vit-base-patch16 +# DATA_PATH=/root/autodl-tmp/data/pretraining_data/LLaVA-Data/text_files/llava_v1_5_mix665k.json +# IMAGE_PATH=/root/autodl-tmp/data/pretraining_data/LLaVA-Data +VT_VARIANT="${VT_VERSION#*/}" + + +deepspeed llava/train/train_mem.py \ + --deepspeed ./scripts/zero3.json \ + --model_name_or_path $LLM_VERSION \ + --version v1 \ + --data_path $DATA_PATH \ + --image_folder $IMAGE_PATH\ + --vision_tower $VT_VERSION \ + --pretrain_mm_mlp_adapter ./checkpoints/tiny-llava-v1-1.1B-${VT_VARIANT}-pretrain/mm_projector.bin \ + --mm_projector_type mlp2x_gelu \ + --mm_vision_select_layer -2 \ + --mm_use_im_start_end False \ + --mm_use_im_patch_token False \ + --image_aspect_ratio pad \ + --group_by_modality_length True \ + --fp16 True \ + --output_dir ./checkpoints/tiny-llava-v1-1.1B-${VT_VARIANT}-finetune \ + --num_train_epochs 1 \ + --per_device_train_batch_size 8 \ + --per_device_eval_batch_size 4 \ + --gradient_accumulation_steps 1 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 5000 \ + --save_total_limit 1 \ + --learning_rate 1e-5 \ + --weight_decay 0. 
\ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --tf32 False \ + --model_max_length 2048 \ + --gradient_checkpointing True \ + --dataloader_num_workers 30 \ + --lazy_preprocess True \ + --report_to wandb \ + --run_name llava-finetune-tinyllama1.1B-${VT_VARIANT} diff --git a/scripts/tiny_llava/.ipynb_checkpoints/finetune_lora-checkpoint.sh b/scripts/tiny_llava/.ipynb_checkpoints/finetune_lora-checkpoint.sh new file mode 100644 index 0000000..ad669f0 --- /dev/null +++ b/scripts/tiny_llava/.ipynb_checkpoints/finetune_lora-checkpoint.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +if [ $# -ne 4 ]; then + echo "Usage: $0 " + exit 1 +fi + +# Assign the arguments to variables +LLM_VERSION="$1" +VT_VERSION="$2" +DATA_PATH="$3" +IMAGE_PATH="$4" + + +VT_VARIANT="${VT_VERSION#*/}" +LLM_VARIANT="${LLM_VERSION#*/}" + +deepspeed llava/train/train_mem.py \ + --lora_enable True --lora_r 128 --lora_alpha 256 \ + --deepspeed ./scripts/zero3.json \ + --model_name_or_path ./checkpoints/tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}-pretrain \ + --version tiny_llama \ + --data_path $DATA_PATH \ + --image_folder $IMAGE_PATH \ + --vision_tower $VT_VERSION \ + --mm_projector_type mlp2x_gelu \ + --mm_vision_select_layer -2 \ + --mm_use_im_start_end False \ + --mm_use_im_patch_token False \ + --image_aspect_ratio pad \ + --group_by_modality_length True \ + --fp16 True \ + --output_dir ./checkpoints/tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}-finetune-lora \ + --num_train_epochs 1 \ + --per_device_train_batch_size 8 \ + --per_device_eval_batch_size 4 \ + --gradient_accumulation_steps 1 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 5000 \ + --save_total_limit 1 \ + --learning_rate 1e-5 \ + --weight_decay 0. \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --tf32 False \ + --model_max_length 2048 \ + --gradient_checkpointing True \ + --dataloader_num_workers 30 \ + --lazy_preprocess True \ + --report_to wandb \ + --run_name tiny-llava-v1.5-finetune-lora-${LLM_VARIANT}-${VT_VARIANT} diff --git a/scripts/tiny_llava/.ipynb_checkpoints/pretrain-checkpoint.sh b/scripts/tiny_llava/.ipynb_checkpoints/pretrain-checkpoint.sh new file mode 100755 index 0000000..9a6dc60 --- /dev/null +++ b/scripts/tiny_llava/.ipynb_checkpoints/pretrain-checkpoint.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +if [ $# -ne 4 ]; then + echo "Usage: $0 " + exit 1 +fi + +# Assign the arguments to variables +LLM_VERSION="$1" +VT_VERSION="$2" +DATA_PATH="$3" +IMAGE_PATH="$4" + +echo "$VT_VERSION" +# LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0 +# VT_VERSION=openai/clip-vit-base-patch16 +# DATA_PATH=/root/autodl-tmp/data/llava/blip_laion_cc_sbu_558k.json +# IMAGE_PATH=/root/autodl-tmp/data/llava/llava_pretrain/images +VT_VARIANT="${VT_VERSION#*/}" + +deepspeed llava/train/train_mem.py \ + --deepspeed ./scripts/zero2.json \ + --model_name_or_path $LLM_VERSION \ + --version plain \ + --data_path $DATA_PATH\ + --image_folder $IMAGE_PATH \ + --vision_tower $VT_VERSION \ + --mm_projector_type mlp2x_gelu \ + --tune_mm_mlp_adapter True \ + --mm_vision_select_layer -2 \ + --mm_use_im_start_end False \ + --mm_use_im_patch_token False \ + --fp16 True \ + --output_dir ./checkpoints/tiny-llava-v1-1.1B-${VT_VARIANT}-pretrain \ + --num_train_epochs 1 \ + --per_device_train_batch_size 32 \ + --per_device_eval_batch_size 4 \ + --gradient_accumulation_steps 1 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 2400 \ + --save_total_limit 1 \ + --learning_rate 1e-3 \ + 
--weight_decay 0. \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --tf32 False \ + --model_max_length 2048 \ + --gradient_checkpointing True \ + --dataloader_num_workers 30 \ + --lazy_preprocess True \ + --report_to wandb \ + --run_name llava-pretrain-tinyllama1.1B-${VT_VARIANT} diff --git a/scripts/tiny_llava/.ipynb_checkpoints/pretrain_baselines-checkpoint.sh b/scripts/tiny_llava/.ipynb_checkpoints/pretrain_baselines-checkpoint.sh new file mode 100644 index 0000000..9179c8f --- /dev/null +++ b/scripts/tiny_llava/.ipynb_checkpoints/pretrain_baselines-checkpoint.sh @@ -0,0 +1,9 @@ +LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0 +VT_VERSIONS=(openai/clip-vit-large-patch14-336 openai/clip-vit-large-patch14 openai/clip-vit-base-patch16 openai/clip-vit-large-patch32) +DATA_PATH=/root/autodl-tmp/data/llava/blip_laion_cc_sbu_558k.json +IMAGE_PATH=/root/autodl-tmp/data/llava/llava_pretrain/images + + +for VT_VERSION in "${VT_VERSIONS[@]}"; do + bash scripts/tiny_llava/pretrain.sh "$LLM_VERSION" "$VT_VERSION" "$DATA_PATH" "$IMAGE_PATH" +done \ No newline at end of file diff --git a/scripts/tiny_llava/.ipynb_checkpoints/train_baselines-checkpoint.sh b/scripts/tiny_llava/.ipynb_checkpoints/train_baselines-checkpoint.sh new file mode 100644 index 0000000..c01c2a5 --- /dev/null +++ b/scripts/tiny_llava/.ipynb_checkpoints/train_baselines-checkpoint.sh @@ -0,0 +1,15 @@ +LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0 +VT_VERSIONS=(openai/clip-vit-large-patch14 openai/clip-vit-base-patch16) +DATA_PATH=/root/autodl-tmp/data/llava/blip_laion_cc_sbu_558k.json +FINETUNE_DATA_PATH=/root/autodl-tmp/data/text_files/llava_v1_5_mix665k.json +IMAGE_PATH=/root/autodl-tmp/data/llava/llava_pretrain/images +FINETUNE_IMAGE_PATH=/root/autodl-tmp/data + +# bash scripts/tiny_llava/finetune.sh "$LLM_VERSION" openai/clip-vit-large-patch14-336 "$FINETUNE_DATA_PATH" "$FINETUNE_IMAGE_PATH" +for VT_VERSION in "${VT_VERSIONS[@]}"; do + # bash scripts/tiny_llava/pretrain.sh "$LLM_VERSION" "$VT_VERSION" "$DATA_PATH" "$IMAGE_PATH" + bash scripts/tiny_llava/finetune.sh "$LLM_VERSION" "$VT_VERSION" "$FINETUNE_DATA_PATH" "$FINETUNE_IMAGE_PATH" +done + +bash scripts/tiny_llava/pretrain.sh "$LLM_VERSION" openai/clip-vit-base-patch32 "$DATA_PATH" "$IMAGE_PATH" +bash scripts/tiny_llava/finetune.sh "$LLM_VERSION" openai/clip-vit-base-patch32 "$FINETUNE_DATA_PATH" "$FINETUNE_IMAGE_PATH" \ No newline at end of file diff --git a/scripts/tiny_llava/.ipynb_checkpoints/train_baselines_phi_jia-checkpoint.sh b/scripts/tiny_llava/.ipynb_checkpoints/train_baselines_phi_jia-checkpoint.sh new file mode 100644 index 0000000..d85efef --- /dev/null +++ b/scripts/tiny_llava/.ipynb_checkpoints/train_baselines_phi_jia-checkpoint.sh @@ -0,0 +1,18 @@ +LLM_VERSION=susnato/phi-1_5_dev +# LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0 +VT_VERSIONS=(openai/clip-vit-large-patch14-336) +#DATA_PATH=/root/autodl-tmp/data/text_files/really_cleaned_share-captioner_coco_lcs_sam_1246k_1107.json +DATA_PATH=/root/autodl-tmp/data/llava/blip_laion_cc_sbu_558k.json +#FINETUNE_DATA_PATH=/root/autodl-tmp/data/text_files/cleaned_sharegpt4v_mix665k_cap23k_coco-ap9k_lcs3k_sam9k_div2k.json +FINETUNE_DATA_PATH=/root/autodl-tmp/data/text_files/llava_v1_5_mix665k.json +#IMAGE_PATH=/root/autodl-tmp/data/ +#FINETUNE_IMAGE_PATH=/root/autodl-tmp/data +IMAGE_PATH=/root/autodl-tmp/data/llava/llava_pretrain/images +FINETUNE_IMAGE_PATH=/root/autodl-tmp/data + + +for VT_VERSION in "${VT_VERSIONS[@]}"; do + bash scripts/tiny_llava/pretrain.sh "$LLM_VERSION" 
"$VT_VERSION" "$DATA_PATH" "$IMAGE_PATH" + bash scripts/tiny_llava/finetune_lora.sh "$LLM_VERSION" "$VT_VERSION" "$FINETUNE_DATA_PATH" "$FINETUNE_IMAGE_PATH" +done + diff --git a/scripts/tiny_llava/LOGS/LOG.md b/scripts/tiny_llava/LOGS/LOG.md new file mode 100644 index 0000000..2a81851 --- /dev/null +++ b/scripts/tiny_llava/LOGS/LOG.md @@ -0,0 +1,561 @@ +# 实验1:unlock-vit-from-12-tune-entire-model +## 实验时间:2024年1月30日23点10分 +## 实验重要参数: + * LLM: TinyLlama/TinyLlama-1.1B-Chat-v1.0 + * VT: openai/clip-vit-large-patch14-336 + * CM: MLP + * LoRA: No + * Unlock ViT From: 12 + * pretrain lr&batch size: 2e-5 256 + * finetune lr&batch size: 2e-5 128 + * data: standard llava-1.5 + * data type: fp16 +## 训练策略: +预训练:将ViT, MLP和LLM同时打开,跟随ShareGPT4V的论文,ViT从第12层开始打开 +微调:与LLaVA一致 +## 实验结果: + * GQA: 58.28 + * SQA: 57.06 + * TextVQA: 43.17 + * VQAv2: 74.02 + * VizWiz: + * MMVet: + * POPE: adversarial: 0.835 random: 0.876 popular: 0.869 +## 实验分析: +本次实验中TextVQA和baseline(46.37)的效果差很多,我认为有可能是因为微调CLIP使CLIP的泛化性受到损伤,而TextVQA这个任务是非常细粒度的任务,导致效果减少最大。如果要提升效果,应当从更好的数据(ShareGPT4V尝试) + +# 实验2:unlock-vit-from-18-tune-entire-model +## 实验时间:2024年1月31日13点37分 +## 实验重要参数: + * LLM: TinyLlama/TinyLlama-1.1B-Chat-v1.0 + * VT: openai/clip-vit-large-patch14-336 + * CM: MLP + * LoRA: No + * Unlock ViT From: 18 + * pretrain lr&batch size: 2e-5 256 + * finetune lr&batch size: 2e-5 128 + * data: standard llava-1.5 + * data type: fp16 +## 训练策略: +预训练:将ViT, MLP和LLM同时打开,ViT从第18层开始打开 +微调:与LLaVA一致 +## 实验结果: + * GQA: 58.32 + * SQA: 54.24 + * TextVQA: 43.44 + * VQAv2: 73.89 + * VizWiz: + * POPE: adversarial: 0.840 random: 0.876 popular: 0.870 +## 实验分析: +本次实验中TextVQA和SQA与baseline(46.37, 59.4)的效果差很多,和上组实验的分析相同,应该是实验数据对CLIP的泛化性损伤了。 + +# 实验3:unlock-vit-from-21-tune-entire-model +## 实验时间:2024年1月30日13点37分 +## 实验重要参数: + * LLM: TinyLlama/TinyLlama-1.1B-Chat-v1.0 + * VT: openai/clip-vit-large-patch14-336 + * CM: MLP + * LoRA: No + * Unlock ViT From: 21 + * pretrain lr&batch size: 2e-5 256 + * finetune lr&batch size: 2e-5 128 + * data: standard llava-1.5 + * data type: fp16 +## 训练策略: +预训练:将ViT, MLP和LLM同时打开,ViT从第21层开始打开 +微调:与LLaVA一致 +## 实验结果: + * GQA: 58.17 + * SQA: 58.25 + * TextVQA: 43.93 + * VQAv2: + * VizWiz: + * POPE: adversarial: 0.838 random: 0.875 popular: 0.867 +## 实验分析: +SQA的表现与从18打开相比略好,但12, 18, 21之间没有观察到可见规律,需要看看第15层打开时什么情况 + +# 实验4:standard-llava-transformers-4.36.1 +## 实验时间:2024年2月1日13点37分 +## 实验重要参数: + * LLM: TinyLlama/TinyLlama-1.1B-Chat-v1.0 + * VT: openai/clip-vit-large-patch14-336 + * CM: MLP + * LoRA: No + * Unlock ViT From: No + * pretrain lr&batch size: 1e-3 256 + * finetune lr&batch size: 2e-5 128 + * data: standard llava-1.5 + * data type: fp16 +## 训练策略: +预训练:与LLaVA一致 +微调:与LLaVA一致 +## 实验结果: + * GQA: 58.05 + * SQA: 60.24 + * TextVQA: 45.83 + * VQAv2: + * VizWiz: +## 实验分析: +这个仅是baseline在transformer版本升级后的复现,不应该有什么变化,但是由于tokenizers版本的升级,会出现mismatch,更新train.py的代码后(use_fast=True或使用LLaVA-1.6的补丁)可以兼容升级。 +TextVQA的成绩略有下降 + +# 实验5:sharegpt4v-unlock-vit-from-18-tune-entire-model +## 实验时间:2024年2月2日10点40分 +## 实验重要参数: + * LLM: TinyLlama/TinyLlama-1.1B-Chat-v1.0 + * VT: openai/clip-vit-large-patch14-336 + * CM: MLP + * LoRA: No + * Unlock ViT From: 18 + * mm_mlp_pretrain: standard-llava-transformers-4.36.1's pretrain mlp + * pretrain lr&batch size: 2e-5 256 (2 gradient accumulation steps) + * finetune lr&batch size: 2e-5 128 + * data: sharegpt4v + * data type: fp16 +## 训练策略: +预训练:MLP使用standard-llava-transformers-4.36.1初始化,在sharegpt4v的pretrain数据上对齐 +微调:与ShareGPT4V一致 +## 实验结果: + * GQA: 59.43 + * SQA: 58.7 + * TextVQA: 48.22 + * VQAv2: + * VizWiz: +## 
+
+# Experiment 6: sharegpt4v-unlock-vit-from-12-tune-entire-model
+## Time: 2024-02-02 20:00
+## Key parameters:
+ * LLM: TinyLlama/TinyLlama-1.1B-Chat-v1.0
+ * VT: openai/clip-vit-large-patch14-336
+ * CM: MLP
+ * LoRA: No
+ * Unlock ViT From: 12
+ * mm_mlp_pretrain: standard-llava-transformers-4.36.1's pretrain mlp
+ * pretrain lr&batch size: 2e-5 256 (2 gradient accumulation steps)
+ * finetune lr&batch size: 2e-5 128
+ * data: sharegpt4v
+ * data type: fp16
+## Training strategy:
+Pretraining: the MLP is initialized from standard-llava-transformers-4.36.1 and aligned on the sharegpt4v pretrain data.
+Finetuning: same as ShareGPT4V.
+## Results:
+ * GQA: 59.43
+ * SQA: 58.80
+ * TextVQA: 48.05
+ * VQAv2: 75.24
+ * VizWiz: 34.74
+ * MMVet: 25.1
+ * POPE: adversarial: 0.839 random: 0.880 popular: 0.858
+## Analysis:
+The ShareGPT4V paper claims that unlocking the ViT from layer 12 gives the best results; this experiment verifies that claim.
+
+# Experiment 7: sharegpt4v-unlock-vit-from-15-tune-entire-model
+## Time: 2024-02-02 20:00
+## Key parameters:
+ * LLM: TinyLlama/TinyLlama-1.1B-Chat-v1.0
+ * VT: openai/clip-vit-large-patch14-336
+ * CM: MLP
+ * LoRA: No
+ * Unlock ViT From: 15
+ * mm_mlp_pretrain: standard-llava-transformers-4.36.1's pretrain mlp
+ * pretrain lr&batch size: 2e-5 256 (2 gradient accumulation steps)
+ * finetune lr&batch size: 2e-5 128
+ * data: sharegpt4v
+ * data type: fp16
+## Training strategy:
+Pretraining: the MLP is initialized from standard-llava-transformers-4.36.1 and aligned on the sharegpt4v pretrain data.
+Finetuning: same as ShareGPT4V.
+## Results:
+ * GQA: 59.43
+ * SQA: 58.95
+ * TextVQA: 48.18
+ * VQAv2: 75.23
+ * VizWiz:
+ * MMVet: 24
+ * POPE: adversarial: 0.840 random: 0.880 popular: 0.860
+## Analysis:
+Ablation: together with unlocking from layers 12 and 18 this forms one ablation group; layer 21 could be added if time allows.
+
+# Experiment 8: moe-mlp-unlock-vit-from-12-tune-entire-model
+## Time: 2024-02-02 20:00
+## Key parameters:
+ * LLM: TinyLlama/TinyLlama-1.1B-Chat-v1.0
+ * VT: openai/clip-vit-large-patch14-336
+ * CM: MLP
+ * LoRA: No
+ * Unlock ViT From: 12
+ * pretrain: initialized from sharegpt4v-unlock-vit-from-12-tune-entire-model
+ * finetune lr&batch size: 2e-5 128
+ * data: sharegpt4v
+ * data type: fp16
+## Training strategy:
+Finetuning: same as ShareGPT4V.
+## Results:
+ * GQA:
+ * SQA:
+ * TextVQA:
+ * VQAv2:
+ * VizWiz:
+## Analysis:
+FAILED
+
+
+# Experiment 9: stablelm-standard-data
+## Time: 2024-02-03 22:29
+## Key parameters:
+ * LLM: stabilityai/stablelm-2-zephyr-1_6b
+ * VT: openai/clip-vit-large-patch14-336
+ * CM: MLP
+ * LoRA: No
+ * Unlock ViT From: No
+ * pretrain lr&batch size: 1e-3 256
+ * finetune lr&batch size: 2e-5 128
+ * data: standard llava-1.5
+ * data type: bf16
+## Training strategy:
+Pretraining: same as LLaVA-1.5.
+Finetuning: same as LLaVA-1.5.
+## Results:
+ * GQA: 58.86
+ * SQA: 62.82
+ * TextVQA: 49.52
+ * VQAv2: 74.9
+ * VizWiz:
+ * MMVet: 25.0
+ * POPE: adversarial: 0.840 random: 0.872 popular: 0.863
+## Analysis:
+
+
+# Experiment 10: stablelm-sharegpt4v-unlock-vit-from-12
+## Time: 2024-02-04 14:02
+## Key parameters:
+ * LLM: stabilityai/stablelm-2-zephyr-1_6b
+ * VT: openai/clip-vit-large-patch14-336
+ * CM: MLP
+ * LoRA: No
+ * Unlock ViT From: 12
+ * pretrain lr&batch size: 2e-5 256
+ * finetune lr&batch size: 2e-5 128
+ * data: sharegpt4v
+ * data type: bf16
+## Training strategy:
+Pretraining: same as ShareGPT4V.
+Finetuning: same as ShareGPT4V.
+## Results:
+ * GQA: 60.26
+ * SQA: 63.06
+ * TextVQA: 51.6
+ * VQAv2: 76.34
+ * VizWiz: 36.34
+ * MMVet: 29.3
+ * POPE: adversarial: 0.844 random: 0.864 popular: 0.855
+## Analysis:
+
+# Experiment 11: stablelm-sharegpt4v
+## Time: 2024-02-06 16:30
+## Key parameters:
+ * LLM: stabilityai/stablelm-2-zephyr-1_6b
+ * VT: openai/clip-vit-large-patch14-336
+ * CM: MLP
+ * LoRA: No
+ * Unlock ViT From: No
+ * pretrain lr&batch size: 1e-3 256
+ * finetune lr&batch size: 2e-5 128
+ * data: sharegpt4v
+ * data type: bf16
+## Training strategy:
+Pretraining: same as LLaVA-1.5.
+Finetuning: same as ShareGPT4V.
+## Results:
+ * GQA: 59.67
+ * SQA: 63.41
+ * TextVQA: 50.38
+ * VQAv2: 75.89
+ * VizWiz:
+ * MMVet: 27.4
+ * POPE: adversarial: 0.847 random: 0.878 popular: 0.869
+## Analysis:
+
+
+# Experiment 12: minicpm-standard-data
+## Time: 2024-02-09 14:45
+## Key parameters:
+ * LLM: openbmb/MiniCPM-2B-dpo-bf16
+ * VT: openai/clip-vit-large-patch14-336
+ * CM: MLP
+ * LoRA: No
+ * Unlock ViT From: No
+ * pretrain lr&batch size: 1e-3 256
+ * finetune lr&batch size: 2e-5 128
+ * data: standard-data
+ * data type: bf16
+## Training strategy:
+Pretraining: same as LLaVA-1.5.
+Finetuning: same as LLaVA-1.5.
+## Results:
+ * GQA:
+ * SQA:
+ * TextVQA:
+ * VQAv2:
+ * VizWiz:
+## Analysis:
+FAILED
+
+# Experiment 13: minicpm-sharegpt4v-unlock-vit-from-12
+## Time: 2024-02-09 14:45
+## Key parameters:
+ * LLM: openbmb/MiniCPM-2B-dpo-bf16
+ * VT: openai/clip-vit-large-patch14-336
+ * CM: MLP
+ * LoRA: No
+ * Unlock ViT From: 12
+ * pretrain lr&batch size: 1e-3 256
+ * finetune lr&batch size: 2e-5 128
+ * data: sharegpt4v
+ * data type: bf16
+## Training strategy:
+Pretraining: same as ShareGPT4V.
+Finetuning: same as ShareGPT4V.
+## Results:
+ * GQA:
+ * SQA:
+ * TextVQA:
+ * VQAv2:
+ * VizWiz:
+## Analysis:
+FAILED
+
+# Experiment 14: tinyllama-standard-data-siglip
+## Time: 2024-02-09 14:45
+## Key parameters:
+ * LLM: TinyLlama/TinyLlama-1.1B-Chat-v1.0
+ * VT: google/siglip-so400m-patch14-384
+ * CM: MLP
+ * LoRA: No
+ * Unlock ViT From: None
+ * pretrain lr&batch size: 1e-3 256
+ * finetune lr&batch size: 2e-5 128
+ * data: LLaVA-1.5
+ * data type: fp16
+## Training strategy:
+Pretraining: LLaVA-1.5.
+Finetuning: LLaVA-1.5.
+## Results:
+ * GQA: 58.63
+ * SQA: 60.24
+ * TextVQA: 49.06
+ * VQAv2: 75.8
+ * VizWiz:
+ * MMVet: 24.1
+ * POPE: adversarial: 0.847 random: 0.875 popular: 0.862
+## Analysis:
+This experiment replaces CLIP with SigLIP, which has 729 visual tokens (384 resolution). Results seem to improve? The other benchmarks still need checking. We now have to decide which language and vision models are worth further experiments;
+I think they should be TinyLlama, StableLM, and Phi, but Phi has never trained successfully. For efficiency, the four TinyLlama and StableLM versions should be trained first, which is expected to take about 24 hours.
+
+# Experiment 15: phi-standard-data-siglip-lora
+## Time: 2024-02-09 14:45
+## Key parameters:
+ * LLM: microsoft/phi-2
+ * VT: google/siglip-so400m-patch14-384
+ * CM: MLP
+ * LoRA: Yes
+ * Unlock ViT From: None
+ * pretrain lr&batch size: 1e-3 256
+ * finetune lr&batch size: 2e-4 128 LoRA & 2e-5 mlp
+ * data: LLaVA-1.5
+ * data type: fp16
+## Training strategy:
+Pretraining: LLaVA-1.5.
+Finetuning: LLaVA-1.5.
+## Results:
+ * GQA: 58.64
+ * SQA: 67.13
+ * TextVQA: 49.96
+ * VQAv2:
+ * VizWiz:
+ * MMVet:
+ * POPE:
+## Analysis:
+
+
+# Experiment 16: stablelm-standard-data-siglip
+## Time: 2024-02-13 10:02
+## Key parameters:
+ * LLM: stabilityai/stablelm-2-zephyr-1_6b
+ * VT: google/siglip-so400m-patch14-384
+ * CM: MLP
+ * LoRA: No
+ * Unlock ViT From: None
+ * pretrain lr&batch size: 1e-3 256
+ * finetune lr&batch size: 2e-5 128
+ * data: LLaVA-1.5
+ * data type: bf16
+## Training strategy:
+Pretraining: LLaVA-1.5.
+Finetuning: LLaVA-1.5.
+## Results:
+ * GQA: 61.13
+ * SQA: 62.77
+ * TextVQA: 54.09
+ * VQAv2: 78.14
+ * VizWiz:
+ * MMVet: 29.5
+ * POPE: adversarial: 0.853 random: 0.880 popular: 0.874
+## Analysis:
+
+
+# Experiment 17: stablelm-sharegpt4v-unlock-vit-from-12-siglip
+## Time: 2024-02-13 10:02
+## Key parameters:
+ * LLM: stabilityai/stablelm-2-zephyr-1_6b
+ * VT: google/siglip-so400m-patch14-384
+ * CM: MLP
+ * LoRA: No
+ * Unlock ViT From: 12
+ * pretrain lr&batch size: 1e-3 256
+ * finetune lr&batch size: 2e-5 128
+ * data: ShareGPT4V
+ * data type: bf16
+## Training strategy:
+Pretraining: ShareGPT4V.
+Finetuning: ShareGPT4V.
+## Results:
+ * GQA: 61.93
+ * SQA: 64.70
+ * TextVQA: 56.39
+ * VQAv2: 78.91
+ * VizWiz:
+ * MMVet: 32.6
+ * POPE: adversarial: 0.851 random: 0.878 popular: 0.867
+## Analysis:
+
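+For reference, the SigLIP runs above differ from the CLIP runs only in the vision-tower argument handed to the shared training scripts. A minimal launch sketch (the VERSION tag "v1-siglip" is an illustrative name, not one used in these runs; the data paths are the ones used by the baseline scripts):
+
+```bash
+#!/bin/bash
+# Illustrative TinyLlama + SigLIP run through the scripts added in this patch.
+LLM=TinyLlama/TinyLlama-1.1B-Chat-v1.0
+VT=google/siglip-so400m-patch14-384        # any CLIP/SigLIP checkpoint can be swapped in here
+PRETRAIN_DATA=/root/autodl-tmp/data/llava/blip_laion_cc_sbu_558k.json
+PRETRAIN_IMAGES=/root/autodl-tmp/data/llava/llava_pretrain/images
+FINETUNE_DATA=/root/autodl-tmp/data/text_files/llava_v1_5_mix665k.json
+FINETUNE_IMAGES=/root/autodl-tmp/data
+TAG=v1-siglip                              # hypothetical checkpoint tag (VERSION argument)
+
+bash scripts/tiny_llava/pretrain.sh "$LLM" "$VT" "$PRETRAIN_DATA" "$PRETRAIN_IMAGES" "$TAG"
+bash scripts/tiny_llava/finetune.sh "$LLM" "$VT" "$FINETUNE_DATA" "$FINETUNE_IMAGES" "$TAG"
+```
+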
+# Experiment 18: tinyllama-sharegpt4v-unlock-vit-from-12-siglip
+## Time: 2024-02-09 14:45
+## Key parameters:
+ * LLM: TinyLlama/TinyLlama-1.1B-Chat-v1.0
+ * VT: google/siglip-so400m-patch14-384
+ * CM: MLP
+ * LoRA: No
+ * Unlock ViT From: 12
+ * pretrain lr&batch size: 1e-3 256
+ * finetune lr&batch size: 2e-5 128
+ * data: ShareGPT4V
+ * data type: fp16
+## Training strategy:
+Pretraining: ShareGPT4V.
+Finetuning: ShareGPT4V.
+## Results:
+ * GQA: 60.25
+ * SQA: 60.14
+ * TextVQA: 51.68
+ * VQAv2: 76.89
+ * VizWiz:
+ * MMVet: 25.8
+ * POPE: adversarial: 0.847 random: 0.875 popular: 0.862
+## Analysis:
+
+# Experiment 19: phi-standard-data-siglip
+## Time: 2024-02-15 14:45
+## Key parameters:
+ * LLM: microsoft/phi-2
+ * VT: google/siglip-so400m-patch14-384
+ * CM: MLP
+ * LoRA: No
+ * Unlock ViT From: None
+ * pretrain lr&batch size: 1e-3 256
+ * finetune lr&batch size: 2e-5 128
+ * data: LLaVA-1.5
+ * data type: fp16
+## Training strategy:
+Pretraining: LLaVA-1.5.
+Finetuning: LLaVA-1.5.
+## Results:
+ * GQA: 61.34
+ * SQA: 69.91
+ * TextVQA: 55.64
+ * VQAv2: 79.2
+ * VizWiz: 38.45
+ * MMVet: 32.1
+ * POPE: adversarial: 0.857 random: 0.885 popular: 0.871
+ * LLaVAW: 67.9
+## Analysis:
+
+# Experiment 20: phi-sharegpt4v-unlock-vit-from-12-siglip
+## Time: 2024-02-16 14:45
+## Key parameters:
+ * LLM: microsoft/phi-2
+ * VT: google/siglip-so400m-patch14-384
+ * CM: MLP
+ * LoRA: No
+ * Unlock ViT From: 12
+ * pretrain lr&batch size: 1e-3 256
+ * finetune lr&batch size: 2e-5 128
+ * data: ShareGPT4V
+ * data type: fp16
+## Training strategy:
+Pretraining: ShareGPT4V.
+Finetuning: ShareGPT4V.
+## Results:
+ * GQA: 61.97
+ * SQA: 69.06
+ * TextVQA: 59.13
+ * VQAv2: 79.93
+ * VizWiz: 34.42? weird
+ * MMVet: 32.0
+ * POPE: adversarial: 0.856 random: 0.873 popular: 0.863
+ * LLaVAW: 75.8
+## Analysis:
+
+# Experiment 21: phi-standard-data
+## Time: 2024-02-17 14:45
+## Key parameters:
+ * LLM: microsoft/phi-2
+ * VT: openai/clip-vit-large-patch14-336
+ * CM: MLP
+ * LoRA: No
+ * Unlock ViT From: No
+ * pretrain lr&batch size: 1e-3 256
+ * finetune lr&batch size: 2e-5 128
+ * data: LLaVA-1.5
+ * data type: fp16
+## Training strategy:
+Pretraining: LLaVA-1.5.
+Finetuning: LLaVA-1.5.
+## Results:
+ * GQA:
+ * SQA:
+ * TextVQA:
+ * VQAv2:
+ * VizWiz:
+## Analysis:
+
+
+# Experiment 22: phi-sharegpt4v-unlock-vit-from-12
+## Time: 2024-02-17 14:45
+## Key parameters:
+ * LLM: microsoft/phi-2
+ * VT: openai/clip-vit-large-patch14-336
+ * CM: MLP
+ * LoRA: No
+ * Unlock ViT From: 12
+ * pretrain lr&batch size: 1e-3 256
+ * finetune lr&batch size: 2e-5 128
+ * data: ShareGPT4V
+ * data type: fp16
+## Training strategy:
+Pretraining: ShareGPT4V.
+Finetuning: ShareGPT4V.
+## Results:
+ * GQA:
+ * SQA:
+ * TextVQA:
+ * VQAv2:
+ * VizWiz:
+## Analysis:
+
+
diff --git a/scripts/tiny_llava/LOGS/REPORT.md b/scripts/tiny_llava/LOGS/REPORT.md
new file mode 100644
index 0000000..b47ae4e
--- /dev/null
+++ b/scripts/tiny_llava/LOGS/REPORT.md
@@ -0,0 +1,12 @@
+# 1. Effect of data on model performance
+We trained v1 and v1.1 versions; the main difference is that v1 uses the data released with LLaVA-1.5, while v1.1 uses the ShareGPT4V data.
+The ShareGPT4V data is of higher quality than the LLaVA-1.5 data, and it therefore yields better results.
+
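+In script terms, switching from v1 to v1.1 only changes which annotation JSONs are passed to the training scripts; the entry points stay the same. A minimal sketch (the ShareGPT4V file names are the ones referenced, commented out, in train_baselines_phi_jia.sh; exact paths are environment-specific):
+
+```bash
+#!/bin/bash
+# v1 runs: LLaVA-1.5 data
+V1_PRETRAIN_DATA=/root/autodl-tmp/data/llava/blip_laion_cc_sbu_558k.json
+V1_FINETUNE_DATA=/root/autodl-tmp/data/text_files/llava_v1_5_mix665k.json
+
+# v1.1 runs: ShareGPT4V data (same scripts, only DATA_PATH / FINETUNE_DATA_PATH change)
+V11_PRETRAIN_DATA=/root/autodl-tmp/data/text_files/really_cleaned_share-captioner_coco_lcs_sam_1246k_1107.json
+V11_FINETUNE_DATA=/root/autodl-tmp/data/text_files/cleaned_sharegpt4v_mix665k_cap23k_coco-ap9k_lcs3k_sam9k_div2k.json
+# These values are passed as the DATA_PATH argument of scripts/tiny_llava/pretrain.sh
+# and finetune.sh respectively.
+```
+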
+# 2. Effect of the intermediate connector on model performance
+We tried two connectors, MLP and Resampler; this part mainly compares the two implementations and their results.
+
+# 3. Effect of finetuning different parts of the model
+This part mainly investigates: whether to unfreeze CLIP for training, which layers LoRA should finetune, and whether the LLM is trained fully or only in specific layers (e.g. the MLP); it describes the corresponding ablation studies.
+
+# 4. To explore if time permits
+If time allows, explore MoE and dynamically adjusting the number of visual tokens.
diff --git a/scripts/tiny_llava/LOGS/organized_log.md b/scripts/tiny_llava/LOGS/organized_log.md
new file mode 100644
index 0000000..494328f
--- /dev/null
+++ b/scripts/tiny_llava/LOGS/organized_log.md
@@ -0,0 +1,215 @@
+# V1
+# Experiment 1: tinyllama-standard-data
+## Time: 2024-02-01 13:37
+## Key parameters:
+ * LLM: TinyLlama/TinyLlama-1.1B-Chat-v1.0
+ * VT: openai/clip-vit-large-patch14-336
+ * CM: MLP
+ * LoRA: No
+ * Unlock ViT From: No
+ * pretrain lr&batch size: 1e-3 256
+ * finetune lr&batch size: 2e-5 128
+ * data: standard llava-1.5
+ * data type: fp16
+## Training strategy:
+Pretraining: same as LLaVA.
+Finetuning: same as LLaVA.
+## Results:
+ * GQA: 58.05
+ * SQA: 60.24
+ * TextVQA: 45.83
+ * VQAv2:
+ * VizWiz:
+## Analysis:
+This is only a reproduction of the baseline after the transformers upgrade, so nothing should change. However, the tokenizers upgrade introduces a mismatch; after updating train.py (use_fast=True, or the LLaVA-1.6 patch) the upgrade is compatible.
+The TextVQA score dropped slightly.
+
+# Experiment 2: stablelm-standard-data
+## Time: 2024-02-03 22:29
+## Key parameters:
+ * LLM: stabilityai/stablelm-2-zephyr-1_6b
+ * VT: openai/clip-vit-large-patch14-336
+ * CM: MLP
+ * LoRA: No
+ * Unlock ViT From: No
+ * pretrain lr&batch size: 1e-3 256
+ * finetune lr&batch size: 2e-5 128
+ * data: standard llava-1.5
+ * data type: bf16
+## Training strategy:
+Pretraining: same as LLaVA-1.5.
+Finetuning: same as LLaVA-1.5.
+## Results:
+ * GQA: 58.86
+ * SQA: 62.82
+ * TextVQA: 49.52
+ * VQAv2: 74.9
+ * VizWiz:
+## Analysis:
+
+# Experiment 3: tinyllama-standard-data-siglip
+## Time: 2024-02-09 14:45
+## Key parameters:
+ * LLM: TinyLlama/TinyLlama-1.1B-Chat-v1.0
+ * VT: google/siglip-so400m-patch14-384
+ * CM: MLP
+ * LoRA: No
+ * Unlock ViT From: None
+ * pretrain lr&batch size: 1e-3 256
+ * finetune lr&batch size: 2e-5 128
+ * data: LLaVA-1.5
+ * data type: fp16
+## Training strategy:
+Pretraining: LLaVA-1.5.
+Finetuning: LLaVA-1.5.
+## Results:
+ * GQA: 58.63
+ * SQA: 60.24
+ * TextVQA: 49.06
+ * VQAv2:
+ * VizWiz:
+## Analysis:
+This experiment replaces CLIP with SigLIP, which has 729 visual tokens (384 resolution). Results seem to improve? The other benchmarks still need checking. We now have to decide which language and vision models are worth further experiments;
+I think they should be TinyLlama, StableLM, and Phi, but Phi has never trained successfully. For efficiency, the four TinyLlama and StableLM versions should be trained first, which is expected to take about 24 hours.
+
+# Experiment 4: stablelm-standard-data-siglip
+## Time: 2024-02-13 10:02
+## Key parameters:
+ * LLM: stabilityai/stablelm-2-zephyr-1_6b
+ * VT: google/siglip-so400m-patch14-384
+ * CM: MLP
+ * LoRA: No
+ * Unlock ViT From: None
+ * pretrain lr&batch size: 1e-3 256
+ * finetune lr&batch size: 2e-5 128
+ * data: LLaVA-1.5
+ * data type: bf16
+## Training strategy:
+Pretraining: LLaVA-1.5.
+Finetuning: LLaVA-1.5.
+## Results:
+ * GQA: 61.13
+ * SQA: 62.77
+ * TextVQA: 54.09
+ * VQAv2:
+ * VizWiz:
+## Analysis:
+
+# Experiment 5: phi-standard-data-siglip
+## Time: 2024-02-09 14:45
+## Key parameters:
+ * LLM: microsoft/phi-2
+ * VT: google/siglip-so400m-patch14-384
+ * CM: MLP
+ * LoRA: Yes
+ * Unlock ViT From: None
+ * pretrain lr&batch size: 1e-3 256
+ * finetune lr&batch size: 2e-4 128 LoRA & 2e-5 mlp
+ * data: LLaVA-1.5
+ * data type: fp16
+## Training strategy:
+Pretraining: LLaVA-1.5.
+Finetuning: LLaVA-1.5.
+## Results:
+ * GQA: 58.64
+ * SQA: 67.13
+ * TextVQA: 49.96
+ * VQAv2:
+ * VizWiz:
+## Analysis:
+
+# V1.1
+# Experiment 6: tinyllama-sharegpt4v-unlock-vit-from-12
+## Time: 2024-02-02 20:00
+## Key parameters:
+ * LLM: TinyLlama/TinyLlama-1.1B-Chat-v1.0
+ * VT: openai/clip-vit-large-patch14-336
+ * CM: MLP
+ * LoRA: No
+ * Unlock ViT From: 12
+ * mm_mlp_pretrain: standard-llava-transformers-4.36.1's pretrain mlp
+ * pretrain lr&batch size: 2e-5 256 (2 gradient accumulation steps)
+ * finetune lr&batch size: 2e-5 128
+ * data: sharegpt4v
+ * data type: fp16
+## Training strategy:
+Pretraining: the MLP is initialized from standard-llava-transformers-4.36.1 and aligned on the sharegpt4v pretrain data.
+Finetuning: same as ShareGPT4V.
+## Results:
+ * GQA: 59.43
+ * SQA: 58.80
+ * TextVQA: 48.05
+ * VQAv2: 75.24
+ * VizWiz: 34.74
+## Analysis:
+The ShareGPT4V paper claims that unlocking the ViT from layer 12 gives the best results; this experiment verifies that claim.
+
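+For reference, the benchmark numbers in these tables come from the eval scripts added under scripts/tiny_llava/eval/; for a fully finetuned (non-LoRA) checkpoint the *_v1.sh variants are the relevant ones. A typical sequence (single GPU shown; gqa_v1.sh shards over whatever CUDA_VISIBLE_DEVICES lists, and each script hard-codes the MODEL_PATH it evaluates):
+
+```bash
+#!/bin/bash
+# Run the main benchmarks for a finetuned checkpoint.
+export CUDA_VISIBLE_DEVICES=0
+bash scripts/tiny_llava/eval/gqa_v1.sh
+bash scripts/tiny_llava/eval/sqa_v1.sh
+bash scripts/tiny_llava/eval/textvqa_v1.sh
+bash scripts/tiny_llava/eval/pope_v1.sh
+bash scripts/tiny_llava/eval/vizwiz_v1.sh
+```
+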
+# Experiment 7: stablelm-sharegpt4v-unlock-vit-from-12
+## Time: 2024-02-04 14:02
+## Key parameters:
+ * LLM: stabilityai/stablelm-2-zephyr-1_6b
+ * VT: openai/clip-vit-large-patch14-336
+ * CM: MLP
+ * LoRA: No
+ * Unlock ViT From: 12
+ * pretrain lr&batch size: 2e-5 256
+ * finetune lr&batch size: 2e-5 128
+ * data: sharegpt4v
+ * data type: bf16
+## Training strategy:
+Pretraining: same as ShareGPT4V.
+Finetuning: same as ShareGPT4V.
+## Results:
+ * GQA: 60.26
+ * SQA: 63.06
+ * TextVQA: 51.6
+ * VQAv2: 76.34
+ * VizWiz: 36.34
+## Analysis:
+
+# Experiment 8: tinyllama-sharegpt4v-unlock-vit-from-12-siglip
+## Time: 2024-02-09 14:45
+## Key parameters:
+ * LLM: TinyLlama/TinyLlama-1.1B-Chat-v1.0
+ * VT: google/siglip-so400m-patch14-384
+ * CM: MLP
+ * LoRA: No
+ * Unlock ViT From: 12
+ * pretrain lr&batch size: 1e-3 256
+ * finetune lr&batch size: 2e-5 128
+ * data: ShareGPT4V
+ * data type: fp16
+## Training strategy:
+Pretraining: ShareGPT4V.
+Finetuning: ShareGPT4V.
+## Results:
+ * GQA: 60.25
+ * SQA: 60.14
+ * TextVQA: 51.68
+ * VQAv2:
+ * VizWiz:
+## Analysis:
+
+# Experiment 9: stablelm-sharegpt4v-unlock-vit-from-12-siglip
+## Time: 2024-02-13 10:02
+## Key parameters:
+ * LLM: stabilityai/stablelm-2-zephyr-1_6b
+ * VT: google/siglip-so400m-patch14-384
+ * CM: MLP
+ * LoRA: No
+ * Unlock ViT From: 12
+ * pretrain lr&batch size: 1e-3 256
+ * finetune lr&batch size: 2e-5 128
+ * data: ShareGPT4V
+ * data type: bf16
+## Training strategy:
+Pretraining: ShareGPT4V.
+Finetuning: ShareGPT4V.
+## Results:
+ * GQA: 61.93
+ * SQA: 64.70
+ * TextVQA: 56.39
+ * VQAv2:
+ * VizWiz:
+## Analysis:
diff --git a/scripts/tiny_llava/docs/LOG.md b/scripts/tiny_llava/docs/LOG.md
new file mode 100644
index 0000000..e69de29
diff --git a/scripts/tiny_llava/eval/.ipynb_checkpoints/gqa-checkpoint.sh b/scripts/tiny_llava/eval/.ipynb_checkpoints/gqa-checkpoint.sh
new file mode 100644
index 0000000..9777ba8
--- /dev/null
+++ b/scripts/tiny_llava/eval/.ipynb_checkpoints/gqa-checkpoint.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+
+gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
+IFS=',' read -ra GPULIST <<< "$gpu_list"
+
+CHUNKS=${#GPULIST[@]}
+
+SPLIT="llava_gqa_testdev_balanced"
+GQADIR="/root/autodl-tmp/data/eval/gqa/"
+
+LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0
+LLM_VARIANT="${LLM_VERSION#*/}"
+
+VT_VERSION=openai/clip-vit-base-patch32
+VT_VARIANT="${VT_VERSION#*/}"
+
+MODEL_PATH="./checkpoints/tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}-finetune-lora"
+MODEL_BASE="./checkpoints/tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}-pretrain"
+MODEL_NAME="tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}"
+EVAL_DIR="/root/autodl-tmp/data/eval"
+
+for IDX in $(seq 0 $((CHUNKS-1))); do
+    CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \
+        --model-path $MODEL_PATH \
+        --question-file $EVAL_DIR/gqa/$SPLIT.jsonl \
+        --image-folder $EVAL_DIR/gqa/images \
+        --answers-file $EVAL_DIR/gqa/answers/$SPLIT/$MODEL_NAME/${CHUNKS}_${IDX}.jsonl \
+        --num-chunks $CHUNKS \
+        --chunk-idx $IDX \
+        --temperature 0 \
+        --model-base $MODEL_BASE \
+        --conv-mode tiny_llama &
+done
+
+wait
+
+output_file=$EVAL_DIR/gqa/answers/$SPLIT/$MODEL_NAME/merge.jsonl
+
+# Clear out the output file if it exists.
+> "$output_file"
+
+# Loop through the indices and concatenate each file.
+for IDX in $(seq 0 $((CHUNKS-1))); do + cat $EVAL_DIR/gqa/answers/$SPLIT/$MODEL_NAME/${CHUNKS}_${IDX}.jsonl >> "$output_file" +done + +python scripts/convert_gqa_for_eval.py --src $output_file --dst $GQADIR/testdev_balanced_predictions.json + +cd $GQADIR +python eval/eval.py --tier testdev_balanced diff --git a/scripts/tiny_llava/eval/.ipynb_checkpoints/gqa_v1-checkpoint.sh b/scripts/tiny_llava/eval/.ipynb_checkpoints/gqa_v1-checkpoint.sh new file mode 100644 index 0000000..3425efc --- /dev/null +++ b/scripts/tiny_llava/eval/.ipynb_checkpoints/gqa_v1-checkpoint.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +gpu_list="${CUDA_VISIBLE_DEVICES:-0}" +IFS=',' read -ra GPULIST <<< "$gpu_list" + +CHUNKS=${#GPULIST[@]} + +SPLIT="llava_gqa_testdev_balanced" +GQADIR="/root/autodl-tmp/data/eval/gqa/" + + +VT_VERSION=openai/clip-vit-large-patch14-336 +VT_VARIANT="${VT_VERSION#*/}" + +MODEL_PATH="./checkpoints/tiny-llava-v1-1.1B-${VT_VARIANT}-finetune" +MODEL_NAME="tiny-llava-v1-1.1B-${VT_VARIANT}" +EVAL_DIR="/root/autodl-tmp/data/eval" + +for IDX in $(seq 0 $((CHUNKS-1))); do + CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ + --model-path $MODEL_PATH \ + --question-file $EVAL_DIR/gqa/$SPLIT.jsonl \ + --image-folder $EVAL_DIR/gqa/images \ + --answers-file $EVAL_DIR/gqa/answers/$SPLIT/$MODEL_NAME/${CHUNKS}_${IDX}.jsonl \ + --num-chunks $CHUNKS \ + --chunk-idx $IDX \ + --temperature 0 \ + --conv-mode v1 & +done + +wait + +output_file=$EVAL_DIR/gqa/answers/$SPLIT/$MODEL_NAME/merge.jsonl + +# Clear out the output file if it exists. +> "$output_file" + +# Loop through the indices and concatenate each file. +for IDX in $(seq 0 $((CHUNKS-1))); do + cat $EVAL_DIR/gqa/answers/$SPLIT/$MODEL_NAME/${CHUNKS}_${IDX}.jsonl >> "$output_file" +done + +python scripts/convert_gqa_for_eval.py --src $output_file --dst $GQADIR/testdev_balanced_predictions.json + +cd $GQADIR +python eval/eval.py --tier testdev_balanced diff --git a/scripts/tiny_llava/eval/.ipynb_checkpoints/mmbench_cn-checkpoint.sh b/scripts/tiny_llava/eval/.ipynb_checkpoints/mmbench_cn-checkpoint.sh new file mode 100644 index 0000000..43e96eb --- /dev/null +++ b/scripts/tiny_llava/eval/.ipynb_checkpoints/mmbench_cn-checkpoint.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +SPLIT="mmbench_dev_cn_20231003" + +MODEL_PATH="./checkpoints/tiny-llava-v1-1.1B-clip-vit-large-patch14-336-finetune" +MODEL_NAME="tiny-llava-v1-1.1b" +EVAL_DIR="/root/autodl-tmp/data/eval" + +python -m llava.eval.model_vqa_mmbench \ + --model-path $MODEL_PATH \ + --question-file $EVAL_DIR/mmbench_cn/$SPLIT.tsv \ + --answers-file $EVAL_DIR/mmbench_cn/answers/$SPLIT/$MODEL_NAME.jsonl \ + --lang cn \ + --single-pred-prompt \ + --temperature 0 \ + --conv-mode vicuna_v1 + +mkdir -p $EVAL_DIR/mmbench/answers_upload/$SPLIT + +python scripts/convert_mmbench_for_submission.py \ + --annotation-file $EVAL_DIR/mmbench_cn/$SPLIT.tsv \ + --result-dir $EVAL_DIR/mmbench_cn/answers/$SPLIT \ + --upload-dir $EVAL_DIR/mmbench_cn/answers_upload/$SPLIT \ + --experiment $MODEL_NAME diff --git a/scripts/tiny_llava/eval/.ipynb_checkpoints/pope-checkpoint.sh b/scripts/tiny_llava/eval/.ipynb_checkpoints/pope-checkpoint.sh new file mode 100644 index 0000000..c012cde --- /dev/null +++ b/scripts/tiny_llava/eval/.ipynb_checkpoints/pope-checkpoint.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0 +LLM_VARIANT="${LLM_VERSION#*/}" + +VT_VERSION=openai/clip-vit-large-patch14-336 +VT_VARIANT="${VT_VERSION#*/}" + + 
+MODEL_PATH="./checkpoints/tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}-finetune-lora" +MODEL_NAME="tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}" +MODEL_BASE="./checkpoints/tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}-pretrain" +EVAL_DIR="/root/autodl-tmp/data/eval" + +python -m llava.eval.model_vqa_loader \ + --model-path $MODEL_PATH \ + --question-file $EVAL_DIR/pope/llava_pope_test.jsonl \ + --image-folder $EVAL_DIR/pope/val2014 \ + --answers-file $EVAL_DIR/pope/answers/$MODEL_NAME.jsonl \ + --temperature 0 \ + --model-base $MODEL_BASE \ + --conv-mode tiny_llama + +python llava/eval/eval_pope.py \ + --annotation-dir $EVAL_DIR/pope/coco \ + --question-file $EVAL_DIR/pope/llava_pope_test.jsonl \ + --result-file $EVAL_DIR/pope/answers/$MODEL_NAME.jsonl diff --git a/scripts/tiny_llava/eval/.ipynb_checkpoints/pope_v1-checkpoint.sh b/scripts/tiny_llava/eval/.ipynb_checkpoints/pope_v1-checkpoint.sh new file mode 100644 index 0000000..f45a3e6 --- /dev/null +++ b/scripts/tiny_llava/eval/.ipynb_checkpoints/pope_v1-checkpoint.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0 +LLM_VARIANT="${LLM_VERSION#*/}" + +VT_VERSION=openai/clip-vit-base-patch16 +VT_VARIANT="${VT_VERSION#*/}" + + +MODEL_PATH="./checkpoints/tiny-llava-v1-1.1B-${VT_VARIANT}-finetune" +MODEL_NAME="tiny-llava-v1-1.1B-${VT_VARIANT}" +EVAL_DIR="/root/autodl-tmp/data/eval" + +python -m llava.eval.model_vqa_loader \ + --model-path $MODEL_PATH \ + --question-file $EVAL_DIR/pope/llava_pope_test.jsonl \ + --image-folder $EVAL_DIR/pope/val2014 \ + --answers-file $EVAL_DIR/pope/answers/$MODEL_NAME.jsonl \ + --temperature 0 \ + --conv-mode v1 + +python llava/eval/eval_pope.py \ + --annotation-dir $EVAL_DIR/pope/coco \ + --question-file $EVAL_DIR/pope/llava_pope_test.jsonl \ + --result-file $EVAL_DIR/pope/answers/$MODEL_NAME.jsonl diff --git a/scripts/tiny_llava/eval/.ipynb_checkpoints/sqa-checkpoint.sh b/scripts/tiny_llava/eval/.ipynb_checkpoints/sqa-checkpoint.sh new file mode 100644 index 0000000..fb49171 --- /dev/null +++ b/scripts/tiny_llava/eval/.ipynb_checkpoints/sqa-checkpoint.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0 +LLM_VARIANT="${LLM_VERSION#*/}" + +VT_VERSION=openai/clip-vit-large-patch14-336 +VT_VARIANT="${VT_VERSION#*/}" + +MODEL_PATH="./checkpoints/tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}-finetune-lora" +MODEL_NAME="tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}" +MODEL_BASE="./checkpoints/tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}-pretrain" +EVAL_DIR="/root/autodl-tmp/data/eval" + +python -m llava.eval.model_vqa_science \ + --model-path $MODEL_PATH \ + --question-file $EVAL_DIR/scienceqa/llava_test_CQM-A.json \ + --image-folder $EVAL_DIR/scienceqa/images/test \ + --answers-file $EVAL_DIR/scienceqa/answers/$MODEL_NAME.jsonl \ + --single-pred-prompt \ + --temperature 0 \ + --model-base $MODEL_BASE \ + --conv-mode tiny_llama + +python llava/eval/eval_science_qa.py \ + --base-dir $EVAL_DIR/scienceqa \ + --result-file $EVAL_DIR/scienceqa/answers/$MODEL_NAME.jsonl \ + --output-file $EVAL_DIR/scienceqa/answers/"$MODEL_NAME"_output.jsonl \ + --output-result $EVAL_DIR/scienceqa/answers/"$MODEL_NAME"_result.json + diff --git a/scripts/tiny_llava/eval/.ipynb_checkpoints/sqa_v1-checkpoint.sh b/scripts/tiny_llava/eval/.ipynb_checkpoints/sqa_v1-checkpoint.sh new file mode 100644 index 0000000..ca329b9 --- /dev/null +++ b/scripts/tiny_llava/eval/.ipynb_checkpoints/sqa_v1-checkpoint.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0 
+LLM_VARIANT="${LLM_VERSION#*/}" + +VT_VERSION=openai/clip-vit-base-patch16 +VT_VARIANT="${VT_VERSION#*/}" + + +MODEL_PATH="./checkpoints/tiny-llava-v1-1.1B-${VT_VARIANT}-finetune" +MODEL_NAME="tiny-llava-v1-1.1B-${VT_VARIANT}" +EVAL_DIR="/root/autodl-tmp/data/eval" + +python -m llava.eval.model_vqa_science \ + --model-path $MODEL_PATH \ + --question-file $EVAL_DIR/scienceqa/llava_test_CQM-A.json \ + --image-folder $EVAL_DIR/scienceqa/images/test \ + --answers-file $EVAL_DIR/scienceqa/answers/$MODEL_NAME.jsonl \ + --single-pred-prompt \ + --temperature 0 \ + --conv-mode v1 + +python llava/eval/eval_science_qa.py \ + --base-dir $EVAL_DIR/scienceqa \ + --result-file $EVAL_DIR/scienceqa/answers/$MODEL_NAME.jsonl \ + --output-file $EVAL_DIR/scienceqa/answers/"$MODEL_NAME"_output.jsonl \ + --output-result $EVAL_DIR/scienceqa/answers/"$MODEL_NAME"_result.json + diff --git a/scripts/tiny_llava/eval/.ipynb_checkpoints/textvqa-checkpoint.sh b/scripts/tiny_llava/eval/.ipynb_checkpoints/textvqa-checkpoint.sh new file mode 100644 index 0000000..5460c46 --- /dev/null +++ b/scripts/tiny_llava/eval/.ipynb_checkpoints/textvqa-checkpoint.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0 +LLM_VARIANT="${LLM_VERSION#*/}" + +VT_VERSION=openai/clip-vit-base-patch16 +VT_VARIANT="${VT_VERSION#*/}" + +MODEL_PATH="./checkpoints/tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}-finetune-lora" +MODEL_NAME="tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}" +MODEL_BASE="./checkpoints/tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}-pretrain" +EVAL_DIR="/root/autodl-tmp/data/eval" + +python -m llava.eval.model_vqa_loader \ + --model-path $MODEL_PATH \ + --question-file $EVAL_DIR/textvqa/llava_textvqa_val_v051_ocr.jsonl \ + --image-folder $EVAL_DIR/textvqa/train_images \ + --answers-file $EVAL_DIR/textvqa/answers/$MODEL_NAME.jsonl \ + --temperature 0 \ + --model-base $MODEL_BASE \ + --conv-mode tiny_llama + +python -m llava.eval.eval_textvqa \ + --annotation-file $EVAL_DIR/textvqa/TextVQA_0.5.1_val.json \ + --result-file $EVAL_DIR/textvqa/answers/$MODEL_NAME.jsonl diff --git a/scripts/tiny_llava/eval/.ipynb_checkpoints/textvqa_v1-checkpoint.sh b/scripts/tiny_llava/eval/.ipynb_checkpoints/textvqa_v1-checkpoint.sh new file mode 100644 index 0000000..9aa79c6 --- /dev/null +++ b/scripts/tiny_llava/eval/.ipynb_checkpoints/textvqa_v1-checkpoint.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0 +LLM_VARIANT="${LLM_VERSION#*/}" + +VT_VERSION=openai/clip-vit-large-patch14-336 +VT_VARIANT="${VT_VERSION#*/}" + +MODEL_PATH="./checkpoints/tiny-llava-v1-1.1B-${VT_VARIANT}-finetune" +MODEL_NAME="tiny-llava-v1-1.1B-${VT_VARIANT}" + +EVAL_DIR="/root/autodl-tmp/data/eval" + +python -m llava.eval.model_vqa_loader \ + --model-path $MODEL_PATH \ + --question-file $EVAL_DIR/textvqa/llava_textvqa_val_v051_ocr.jsonl \ + --image-folder $EVAL_DIR/textvqa/train_images \ + --answers-file $EVAL_DIR/textvqa/answers/$MODEL_NAME.jsonl \ + --temperature 0 \ + --conv-mode v1 + +python -m llava.eval.eval_textvqa \ + --annotation-file $EVAL_DIR/textvqa/TextVQA_0.5.1_val.json \ + --result-file $EVAL_DIR/textvqa/answers/$MODEL_NAME.jsonl diff --git a/scripts/tiny_llava/eval/.ipynb_checkpoints/vizwiz-checkpoint.sh b/scripts/tiny_llava/eval/.ipynb_checkpoints/vizwiz-checkpoint.sh new file mode 100644 index 0000000..dbe3282 --- /dev/null +++ b/scripts/tiny_llava/eval/.ipynb_checkpoints/vizwiz-checkpoint.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0 
+LLM_VARIANT="${LLM_VERSION#*/}" + +VT_VERSION=openai/clip-vit-large-patch14-336 +VT_VARIANT="${VT_VERSION#*/}" + +# MODEL_PATH="./checkpoints/tiny-llava-v1-${LLM_VARIANT}-${VT_VARIANT}-finetune" +# MODEL_NAME="tiny-llava-v1-1.1b-sharegpt4v" +# MODEL_BASE="./checkpoints/tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}-pretrain" +MODEL_PATH="./checkpoints/tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}-finetune-lora" +MODEL_BASE="./checkpoints/tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}-pretrain" +MODEL_NAME="tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}" +EVAL_DIR="/root/autodl-tmp/data/eval" + +python -m llava.eval.model_vqa_loader \ + --model-path $MODEL_PATH \ + --question-file $EVAL_DIR/vizwiz/llava_test.jsonl \ + --image-folder $EVAL_DIR/vizwiz/test \ + --answers-file $EVAL_DIR/vizwiz/answers/$MODEL_NAME.jsonl \ + --temperature 0 \ + --model-base $MODEL_BASE \ + --conv-mode tiny_llama + +python scripts/convert_vizwiz_for_submission.py \ + --annotation-file $EVAL_DIR/vizwiz/llava_test.jsonl \ + --result-file $EVAL_DIR/vizwiz/answers/$MODEL_NAME.jsonl \ + --result-upload-file $EVAL_DIR/vizwiz/answers_upload/$MODEL_NAME.json diff --git a/scripts/tiny_llava/eval/.ipynb_checkpoints/vizwiz_v1-checkpoint.sh b/scripts/tiny_llava/eval/.ipynb_checkpoints/vizwiz_v1-checkpoint.sh new file mode 100644 index 0000000..f7b075a --- /dev/null +++ b/scripts/tiny_llava/eval/.ipynb_checkpoints/vizwiz_v1-checkpoint.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0 +LLM_VARIANT="${LLM_VERSION#*/}" + +VT_VERSION=openai/clip-vit-large-patch14-336 +VT_VARIANT="${VT_VERSION#*/}" + +# MODEL_PATH="./checkpoints/tiny-llava-v1-${LLM_VARIANT}-${VT_VARIANT}-finetune" +# MODEL_NAME="tiny-llava-v1-1.1b-sharegpt4v" +# MODEL_BASE="./checkpoints/tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}-pretrain" +MODEL_PATH="./checkpoints/tiny-llava-v1-1.1B-${VT_VARIANT}-finetune" +MODEL_NAME="tiny-llava-v1-1.1B-${VT_VARIANT}" + +EVAL_DIR="/root/autodl-tmp/data/eval" + +python -m llava.eval.model_vqa_loader \ + --model-path $MODEL_PATH \ + --question-file $EVAL_DIR/vizwiz/llava_test.jsonl \ + --image-folder $EVAL_DIR/vizwiz/test \ + --answers-file $EVAL_DIR/vizwiz/answers/$MODEL_NAME.jsonl \ + --temperature 0 \ + --conv-mode v1 + +python scripts/convert_vizwiz_for_submission.py \ + --annotation-file $EVAL_DIR/vizwiz/llava_test.jsonl \ + --result-file $EVAL_DIR/vizwiz/answers/$MODEL_NAME.jsonl \ + --result-upload-file $EVAL_DIR/vizwiz/answers_upload/$MODEL_NAME.json diff --git a/scripts/tiny_llava/eval/.ipynb_checkpoints/vqav2-checkpoint.sh b/scripts/tiny_llava/eval/.ipynb_checkpoints/vqav2-checkpoint.sh new file mode 100644 index 0000000..d22101a --- /dev/null +++ b/scripts/tiny_llava/eval/.ipynb_checkpoints/vqav2-checkpoint.sh @@ -0,0 +1,56 @@ +#!/bin/bash + +gpu_list="${CUDA_VISIBLE_DEVICES:-0}" +IFS=',' read -ra GPULIST <<< "$gpu_list" + +CHUNKS=${#GPULIST[@]} + +SPLIT="llava_vqav2_mscoco_test-dev2015" + +#LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0 +#LLM_VARIANT="${LLM_VERSION#*/}" +# +#VT_VERSION=openai/clip-vit-large-patch14-336 +#VT_VARIANT="${VT_VERSION#*/}" +# +#MODEL_PATH="./checkpoints/tiny-llava-v1-${LLM_VARIANT}-${VT_VARIANT}-finetune" +#MODEL_NAME="tiny-llava-v1-1.1b-sharegpt4v" +#EVAL_DIR="/root/autodl-tmp/data/eval" +LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0 +LLM_VARIANT="${LLM_VERSION#*/}" + +VT_VERSION=openai/clip-vit-large-patch14-336 +VT_VARIANT="${VT_VERSION#*/}" + +MODEL_PATH="./checkpoints/tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}-finetune-lora" 
+MODEL_BASE="./checkpoints/tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}-pretrain" +MODEL_NAME="tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}" +EVAL_DIR="/root/autodl-tmp/data/eval" + +for IDX in $(seq 0 $((CHUNKS-1))); do + CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ + --model-path $MODEL_PATH \ + --question-file $EVAL_DIR/vqav2/$SPLIT.jsonl \ + --image-folder $EVAL_DIR/vqav2/test2015 \ + --answers-file $EVAL_DIR/vqav2/answers/$SPLIT/$MODEL_NAME/${CHUNKS}_${IDX}.jsonl \ + --num-chunks $CHUNKS \ + --chunk-idx $IDX \ + --temperature 0 \ + --model-base $MODEL_BASE + --conv-mode tiny_llama & +done + +wait + +output_file=$EVAL_DIR/vqav2/answers/$SPLIT/$MODEL_NAME/merge.jsonl + +# Clear out the output file if it exists. +> "$output_file" + +# Loop through the indices and concatenate each file. +for IDX in $(seq 0 $((CHUNKS-1))); do + cat $EVAL_DIR/vqav2/answers/$SPLIT/$MODEL_NAME/${CHUNKS}_${IDX}.jsonl >> "$output_file" +done + +python scripts/convert_vqav2_for_submission.py --split $SPLIT --ckpt $MODEL_NAME --dir $EVAL_DIR/vqav2 + diff --git a/scripts/tiny_llava/eval/gqa.sh b/scripts/tiny_llava/eval/gqa.sh new file mode 100644 index 0000000..2828f01 --- /dev/null +++ b/scripts/tiny_llava/eval/gqa.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +gpu_list="${CUDA_VISIBLE_DEVICES:-0}" +IFS=',' read -ra GPULIST <<< "$gpu_list" + +CHUNKS=${#GPULIST[@]} + +SPLIT="llava_gqa_testdev_balanced" +GQADIR="/root/autodl-tmp/data/eval/gqa/" + +LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0 +LLM_VARIANT="${LLM_VERSION#*/}" + +VT_VERSION=openai/clip-vit-large-patch14-336 +VT_VARIANT="${VT_VERSION#*/}" + +VERSION=type-3 + +MODEL_PATH="./checkpoints/tiny-llava-${VERSION}-${LLM_VARIANT}-${VT_VARIANT}-finetune-lora" +MODEL_BASE="./checkpoints/tiny-llava-${VERSION}-${LLM_VARIANT}-${VT_VARIANT}-pretrain" +#MODEL_BASE=$LLM_VERSION +MODEL_NAME="tiny-llava-${VERSION}-${LLM_VARIANT}-${VT_VARIANT}" +EVAL_DIR="/root/autodl-tmp/data/eval" + +for IDX in $(seq 0 $((CHUNKS-1))); do + CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m tinyllava.eval.model_vqa_loader \ + --model-path $MODEL_PATH \ + --question-file $EVAL_DIR/gqa/$SPLIT.jsonl \ + --image-folder $EVAL_DIR/gqa/images \ + --answers-file $EVAL_DIR/gqa/answers/$SPLIT/$MODEL_NAME/${CHUNKS}_${IDX}.jsonl \ + --num-chunks $CHUNKS \ + --chunk-idx $IDX \ + --temperature 0 \ + --model-base $MODEL_BASE\ + --conv-mode v1 & +done + +wait + +output_file=$EVAL_DIR/gqa/answers/$SPLIT/$MODEL_NAME/merge.jsonl + +# Clear out the output file if it exists. +> "$output_file" + +# Loop through the indices and concatenate each file. 
+for IDX in $(seq 0 $((CHUNKS-1))); do + cat $EVAL_DIR/gqa/answers/$SPLIT/$MODEL_NAME/${CHUNKS}_${IDX}.jsonl >> "$output_file" +done + +python scripts/convert_gqa_for_eval.py --src $output_file --dst $GQADIR/testdev_balanced_predictions.json + +cd $GQADIR +python eval/eval.py --tier testdev_balanced diff --git a/scripts/tiny_llava/eval/gqa_v1.sh b/scripts/tiny_llava/eval/gqa_v1.sh new file mode 100644 index 0000000..3425efc --- /dev/null +++ b/scripts/tiny_llava/eval/gqa_v1.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +gpu_list="${CUDA_VISIBLE_DEVICES:-0}" +IFS=',' read -ra GPULIST <<< "$gpu_list" + +CHUNKS=${#GPULIST[@]} + +SPLIT="llava_gqa_testdev_balanced" +GQADIR="/root/autodl-tmp/data/eval/gqa/" + + +VT_VERSION=openai/clip-vit-large-patch14-336 +VT_VARIANT="${VT_VERSION#*/}" + +MODEL_PATH="./checkpoints/tiny-llava-v1-1.1B-${VT_VARIANT}-finetune" +MODEL_NAME="tiny-llava-v1-1.1B-${VT_VARIANT}" +EVAL_DIR="/root/autodl-tmp/data/eval" + +for IDX in $(seq 0 $((CHUNKS-1))); do + CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ + --model-path $MODEL_PATH \ + --question-file $EVAL_DIR/gqa/$SPLIT.jsonl \ + --image-folder $EVAL_DIR/gqa/images \ + --answers-file $EVAL_DIR/gqa/answers/$SPLIT/$MODEL_NAME/${CHUNKS}_${IDX}.jsonl \ + --num-chunks $CHUNKS \ + --chunk-idx $IDX \ + --temperature 0 \ + --conv-mode v1 & +done + +wait + +output_file=$EVAL_DIR/gqa/answers/$SPLIT/$MODEL_NAME/merge.jsonl + +# Clear out the output file if it exists. +> "$output_file" + +# Loop through the indices and concatenate each file. +for IDX in $(seq 0 $((CHUNKS-1))); do + cat $EVAL_DIR/gqa/answers/$SPLIT/$MODEL_NAME/${CHUNKS}_${IDX}.jsonl >> "$output_file" +done + +python scripts/convert_gqa_for_eval.py --src $output_file --dst $GQADIR/testdev_balanced_predictions.json + +cd $GQADIR +python eval/eval.py --tier testdev_balanced diff --git a/scripts/tiny_llava/eval/imagenet.sh b/scripts/tiny_llava/eval/imagenet.sh new file mode 100644 index 0000000..bc99781 --- /dev/null +++ b/scripts/tiny_llava/eval/imagenet.sh @@ -0,0 +1,25 @@ +torchrun --nnodes=1 \ +--standalone \ +--nproc-per-node=8 \ +tinyllava/eval/eval_clip_imagenet.py \ +--linear_probe True \ +--pretrained_path ./checkpoints/tiny-llava-sharegpt4v-unlock-vit-from-12-tune-entire-model-TinyLlama-1.1B-Chat-v1.0-clip-vit-large-patch14-336-pretrain/vision_tower \ +--train_data_path /mnt/data/sata/winci/datasets/ImageNet/train \ +--eval_data_path /mnt/data/sata/winci/datasets/ImageNet/val \ +--learning_rate 1e-2 \ +--num_train_epochs 5 \ +--per_device_train_batch_size 64 \ +--per_device_eval_batch_size 64 \ +--logging_strategy steps \ +--logging_steps 1 \ +--evaluation_strategy epoch \ +--save_strategy epoch \ +--load_best_model_at_end True \ +--save_total_limit 1 \ +--seed 42 \ +--do_train \ +--do_eval \ +--optim sgd \ +--output_dir ./checkpoints/eval_imagenet/ \ +--bf16 True \ +--logging_steps 1 \ diff --git a/scripts/tiny_llava/eval/mmbench.sh b/scripts/tiny_llava/eval/mmbench.sh new file mode 100644 index 0000000..3193a1b --- /dev/null +++ b/scripts/tiny_llava/eval/mmbench.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +SPLIT="mmbench_dev_20230712" + +MODEL_PATH="./checkpoints/tiny-llava-v1-1.1B-clip-vit-large-patch14-336-finetune" +MODEL_NAME="tiny-llava-v1-1.1b" +EVAL_DIR="/root/autodl-tmp/data/eval" + +python -m tinyllava.eval.model_vqa_mmbench \ + --model-path $MODEL_PATH \ + --question-file $EVAL_DIR/mmbench/$SPLIT.tsv \ + --answers-file $EVAL_DIR/mmbench/answers/$SPLIT/$MODEL_NAME.jsonl \ + --single-pred-prompt \ + --temperature 0 \ + --conv-mode 
vicuna_v1
+
+mkdir -p $EVAL_DIR/mmbench/answers_upload/$SPLIT
+
+python scripts/convert_mmbench_for_submission.py \
+    --annotation-file $EVAL_DIR/mmbench/$SPLIT.tsv \
+    --result-dir $EVAL_DIR/mmbench/answers/$SPLIT \
+    --upload-dir $EVAL_DIR/mmbench/answers_upload/$SPLIT \
+    --experiment $MODEL_NAME
diff --git a/scripts/tiny_llava/eval/mmbench_cn.sh b/scripts/tiny_llava/eval/mmbench_cn.sh
new file mode 100644
index 0000000..acc2fe4
--- /dev/null
+++ b/scripts/tiny_llava/eval/mmbench_cn.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+
+SPLIT="mmbench_dev_cn_20231003"
+
+MODEL_PATH="./checkpoints/tiny-llava-v1-1.1B-clip-vit-large-patch14-336-finetune"
+MODEL_NAME="tiny-llava-v1-1.1b"
+EVAL_DIR="/root/autodl-tmp/data/eval"
+
+python -m tinyllava.eval.model_vqa_mmbench \
+    --model-path $MODEL_PATH \
+    --question-file $EVAL_DIR/mmbench_cn/$SPLIT.tsv \
+    --answers-file $EVAL_DIR/mmbench_cn/answers/$SPLIT/$MODEL_NAME.jsonl \
+    --lang cn \
+    --single-pred-prompt \
+    --temperature 0 \
+    --conv-mode vicuna_v1
+
+mkdir -p $EVAL_DIR/mmbench_cn/answers_upload/$SPLIT
+
+python scripts/convert_mmbench_for_submission.py \
+    --annotation-file $EVAL_DIR/mmbench_cn/$SPLIT.tsv \
+    --result-dir $EVAL_DIR/mmbench_cn/answers/$SPLIT \
+    --upload-dir $EVAL_DIR/mmbench_cn/answers_upload/$SPLIT \
+    --experiment $MODEL_NAME
diff --git a/scripts/tiny_llava/eval/mme.sh b/scripts/tiny_llava/eval/mme.sh
new file mode 100644
index 0000000..2f160e3
--- /dev/null
+++ b/scripts/tiny_llava/eval/mme.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+VERSION="$1"
+LLM_VERSION="$2"
+LLM_VARIANT="${LLM_VERSION#*/}"
+VT_VERSION=google/siglip-so400m-patch14-384
+VT_VARIANT="${VT_VERSION#*/}"
+
+MODEL_PATH="./checkpoints/tiny-llava-${VERSION}-${LLM_VARIANT}-${VT_VARIANT}-finetune"
+MODEL_NAME="tiny-llava-${VERSION}-${LLM_VARIANT}-${VT_VARIANT}"
+EVAL_DIR="/mnt/data/sata/ssd/dataset/eval"
+
+# Assumes the standard MME layout (llava_mme.jsonl, MME_Benchmark_release_version, eval_tool) under $EVAL_DIR/MME.
+python -m tinyllava.eval.model_vqa_loader \
+    --model-path $MODEL_PATH \
+    --question-file $EVAL_DIR/MME/llava_mme.jsonl \
+    --image-folder $EVAL_DIR/MME/MME_Benchmark_release_version \
+    --answers-file $EVAL_DIR/MME/answers/$MODEL_NAME.jsonl \
+    --temperature 0 \
+    --conv-mode vicuna_v1
+
+cd $EVAL_DIR/MME
+
+python convert_answer_to_mme.py --experiment $MODEL_NAME
+
+cd eval_tool
+
+python calculation.py --results_dir answers/$MODEL_NAME
diff --git a/scripts/tiny_llava/eval/mmvet.sh b/scripts/tiny_llava/eval/mmvet.sh
new file mode 100644
index 0000000..6b90127
--- /dev/null
+++ b/scripts/tiny_llava/eval/mmvet.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+MODEL_PATH="./checkpoints/tiny-llava-v1-1.1B-clip-vit-large-patch14-336-finetune"
+MODEL_NAME="tiny-llava-v1-1.1b"
+EVAL_DIR="/root/autodl-tmp/data/eval"
+
+python -m tinyllava.eval.model_vqa \
+    --model-path $MODEL_PATH \
+    --question-file $EVAL_DIR/mm-vet/tinyllava-mm-vet.jsonl \
+    --image-folder $EVAL_DIR/mm-vet/images \
+    --answers-file $EVAL_DIR/mm-vet/answers/$MODEL_NAME.jsonl \
+    --temperature 0 \
+    --conv-mode vicuna_v1
+
+mkdir -p $EVAL_DIR/mm-vet/results
+
+python scripts/convert_mmvet_for_eval.py \
+    --src $EVAL_DIR/mm-vet/answers/$MODEL_NAME.jsonl \
+    --dst $EVAL_DIR/mm-vet/results/$MODEL_NAME.json
+
diff --git a/scripts/tiny_llava/eval/pope.sh b/scripts/tiny_llava/eval/pope.sh
new file mode 100644
index 0000000..5e57d01
--- /dev/null
+++ b/scripts/tiny_llava/eval/pope.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0
+LLM_VARIANT="${LLM_VERSION#*/}"
+
+VT_VERSION=openai/clip-vit-large-patch14-336
+VT_VARIANT="${VT_VERSION#*/}"
+
+MODEL_PATH="./checkpoints/tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}-finetune-lora"
+MODEL_NAME="tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}"
+EVAL_DIR="/root/autodl-tmp/data/eval"
+
+python -m
tinyllava.eval.model_vqa_loader \ + --model-path $MODEL_PATH \ + --question-file $EVAL_DIR/pope/llava_pope_test.jsonl \ + --image-folder $EVAL_DIR/pope/val2014 \ + --answers-file $EVAL_DIR/pope/answers/$MODEL_NAME.jsonl \ + --temperature 0 \ + --conv-mode tiny_llama + +python tinyllava/eval/eval_pope.py \ + --annotation-dir $EVAL_DIR/pope/coco \ + --question-file $EVAL_DIR/pope/llava_pope_test.jsonl \ + --result-file $EVAL_DIR/pope/answers/$MODEL_NAME.jsonl diff --git a/scripts/tiny_llava/eval/pope_v1.sh b/scripts/tiny_llava/eval/pope_v1.sh new file mode 100644 index 0000000..f45a3e6 --- /dev/null +++ b/scripts/tiny_llava/eval/pope_v1.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0 +LLM_VARIANT="${LLM_VERSION#*/}" + +VT_VERSION=openai/clip-vit-base-patch16 +VT_VARIANT="${VT_VERSION#*/}" + + +MODEL_PATH="./checkpoints/tiny-llava-v1-1.1B-${VT_VARIANT}-finetune" +MODEL_NAME="tiny-llava-v1-1.1B-${VT_VARIANT}" +EVAL_DIR="/root/autodl-tmp/data/eval" + +python -m llava.eval.model_vqa_loader \ + --model-path $MODEL_PATH \ + --question-file $EVAL_DIR/pope/llava_pope_test.jsonl \ + --image-folder $EVAL_DIR/pope/val2014 \ + --answers-file $EVAL_DIR/pope/answers/$MODEL_NAME.jsonl \ + --temperature 0 \ + --conv-mode v1 + +python llava/eval/eval_pope.py \ + --annotation-dir $EVAL_DIR/pope/coco \ + --question-file $EVAL_DIR/pope/llava_pope_test.jsonl \ + --result-file $EVAL_DIR/pope/answers/$MODEL_NAME.jsonl diff --git a/scripts/tiny_llava/eval/sqa.sh b/scripts/tiny_llava/eval/sqa.sh new file mode 100644 index 0000000..c8fbea4 --- /dev/null +++ b/scripts/tiny_llava/eval/sqa.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0 +LLM_VARIANT="${LLM_VERSION#*/}" + +VT_VERSION=openai/clip-vit-large-patch14-336 +VT_VARIANT="${VT_VERSION#*/}" + +VERSION=type-3 + +MODEL_PATH="./checkpoints/tiny-llava-${VERSION}-${LLM_VARIANT}-${VT_VARIANT}-finetune-lora" +MODEL_BASE="./checkpoints/tiny-llava-${VERSION}-${LLM_VARIANT}-${VT_VARIANT}-pretrain" +#MODEL_BASE=$LLM_VERSION +MODEL_NAME="tiny-llava-${VERSION}-${LLM_VARIANT}-${VT_VARIANT}" +EVAL_DIR="/root/autodl-tmp/data/eval" + +python -m tinyllava.eval.model_vqa_science \ + --model-path $MODEL_PATH \ + --question-file $EVAL_DIR/scienceqa/llava_test_CQM-A.json \ + --image-folder $EVAL_DIR/scienceqa/images/test \ + --answers-file $EVAL_DIR/scienceqa/answers/$MODEL_NAME.jsonl \ + --single-pred-prompt \ + --temperature 0 \ + --model-base $MODEL_BASE \ + --conv-mode v1 + +python tinyllava/eval/eval_science_qa.py \ + --base-dir $EVAL_DIR/scienceqa \ + --result-file $EVAL_DIR/scienceqa/answers/$MODEL_NAME.jsonl \ + --output-file $EVAL_DIR/scienceqa/answers/"$MODEL_NAME"_output.jsonl \ + --output-result $EVAL_DIR/scienceqa/answers/"$MODEL_NAME"_result.json diff --git a/scripts/tiny_llava/eval/sqa_v1.sh b/scripts/tiny_llava/eval/sqa_v1.sh new file mode 100644 index 0000000..ca329b9 --- /dev/null +++ b/scripts/tiny_llava/eval/sqa_v1.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0 +LLM_VARIANT="${LLM_VERSION#*/}" + +VT_VERSION=openai/clip-vit-base-patch16 +VT_VARIANT="${VT_VERSION#*/}" + + +MODEL_PATH="./checkpoints/tiny-llava-v1-1.1B-${VT_VARIANT}-finetune" +MODEL_NAME="tiny-llava-v1-1.1B-${VT_VARIANT}" +EVAL_DIR="/root/autodl-tmp/data/eval" + +python -m llava.eval.model_vqa_science \ + --model-path $MODEL_PATH \ + --question-file $EVAL_DIR/scienceqa/llava_test_CQM-A.json \ + --image-folder $EVAL_DIR/scienceqa/images/test \ + --answers-file 
$EVAL_DIR/scienceqa/answers/$MODEL_NAME.jsonl \ + --single-pred-prompt \ + --temperature 0 \ + --conv-mode v1 + +python llava/eval/eval_science_qa.py \ + --base-dir $EVAL_DIR/scienceqa \ + --result-file $EVAL_DIR/scienceqa/answers/$MODEL_NAME.jsonl \ + --output-file $EVAL_DIR/scienceqa/answers/"$MODEL_NAME"_output.jsonl \ + --output-result $EVAL_DIR/scienceqa/answers/"$MODEL_NAME"_result.json + diff --git a/scripts/tiny_llava/eval/textvqa.sh b/scripts/tiny_llava/eval/textvqa.sh new file mode 100644 index 0000000..df50b72 --- /dev/null +++ b/scripts/tiny_llava/eval/textvqa.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0 +LLM_VARIANT="${LLM_VERSION#*/}" + +VT_VERSION=openai/clip-vit-large-patch14-336 +VT_VARIANT="${VT_VERSION#*/}" + +#MODEL_PATH="./checkpoints/tiny-tinyllava-v1.5-${LLM_VARIANT}-${VT_VARIANT}-finetune-lora" +#MODEL_NAME="tiny-tinyllava-v1.5-${LLM_VARIANT}-${VT_VARIANT}" +#EVAL_DIR="/root/autodl-tmp/data/eval" +VERSION=type-3 +MODEL_PATH="./checkpoints/tiny-llava-${VERSION}-${LLM_VARIANT}-${VT_VARIANT}-finetune-lora" +MODEL_BASE="./checkpoints/tiny-llava-${VERSION}-${LLM_VARIANT}-${VT_VARIANT}-pretrain" +#MODEL_BASE=$LLM_VERSION +MODEL_NAME="tiny-llava-${VERSION}-${LLM_VARIANT}-${VT_VARIANT}" +EVAL_DIR="/root/autodl-tmp/data/eval" + +python -m tinyllava.eval.model_vqa_loader \ + --model-path $MODEL_PATH \ + --question-file $EVAL_DIR/textvqa/llava_textvqa_val_v051_ocr.jsonl \ + --image-folder $EVAL_DIR/textvqa/train_images \ + --answers-file $EVAL_DIR/textvqa/answers/$MODEL_NAME.jsonl \ + --temperature 0 \ + --model-base $MODEL_BASE \ + --conv-mode v1 + +python -m tinyllava.eval.eval_textvqa \ + --annotation-file $EVAL_DIR/textvqa/TextVQA_0.5.1_val.json \ + --result-file $EVAL_DIR/textvqa/answers/$MODEL_NAME.jsonl diff --git a/scripts/tiny_llava/eval/textvqa_v1.sh b/scripts/tiny_llava/eval/textvqa_v1.sh new file mode 100644 index 0000000..9aa79c6 --- /dev/null +++ b/scripts/tiny_llava/eval/textvqa_v1.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0 +LLM_VARIANT="${LLM_VERSION#*/}" + +VT_VERSION=openai/clip-vit-large-patch14-336 +VT_VARIANT="${VT_VERSION#*/}" + +MODEL_PATH="./checkpoints/tiny-llava-v1-1.1B-${VT_VARIANT}-finetune" +MODEL_NAME="tiny-llava-v1-1.1B-${VT_VARIANT}" + +EVAL_DIR="/root/autodl-tmp/data/eval" + +python -m llava.eval.model_vqa_loader \ + --model-path $MODEL_PATH \ + --question-file $EVAL_DIR/textvqa/llava_textvqa_val_v051_ocr.jsonl \ + --image-folder $EVAL_DIR/textvqa/train_images \ + --answers-file $EVAL_DIR/textvqa/answers/$MODEL_NAME.jsonl \ + --temperature 0 \ + --conv-mode v1 + +python -m llava.eval.eval_textvqa \ + --annotation-file $EVAL_DIR/textvqa/TextVQA_0.5.1_val.json \ + --result-file $EVAL_DIR/textvqa/answers/$MODEL_NAME.jsonl diff --git a/scripts/tiny_llava/eval/vizwiz.sh b/scripts/tiny_llava/eval/vizwiz.sh new file mode 100644 index 0000000..46bc176 --- /dev/null +++ b/scripts/tiny_llava/eval/vizwiz.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0 +LLM_VARIANT="${LLM_VERSION#*/}" + +VT_VERSION=openai/clip-vit-large-patch14-336 +VT_VARIANT="${VT_VERSION#*/}" + +MODEL_PATH="./checkpoints/tiny-llava-v1-${LLM_VARIANT}-${VT_VARIANT}-finetune" +MODEL_NAME="tiny-llava-v1-1.1b-sharegpt4v" +EVAL_DIR="/root/autodl-tmp/data/eval" + +python -m tinyllava.eval.model_vqa_loader \ + --model-path $MODEL_PATH \ + --question-file $EVAL_DIR/vizwiz/llava_test.jsonl \ + --image-folder $EVAL_DIR/vizwiz/test \ + --answers-file 
$EVAL_DIR/vizwiz/answers/$MODEL_NAME.jsonl \ + --temperature 0 \ + --conv-mode vicuna_v1 + +python scripts/convert_vizwiz_for_submission.py \ + --annotation-file $EVAL_DIR/vizwiz/llava_test.jsonl \ + --result-file $EVAL_DIR/vizwiz/answers/$MODEL_NAME.jsonl \ + --result-upload-file $EVAL_DIR/vizwiz/answers_upload/$MODEL_NAME.json diff --git a/scripts/tiny_llava/eval/vizwiz_v1.sh b/scripts/tiny_llava/eval/vizwiz_v1.sh new file mode 100644 index 0000000..f7b075a --- /dev/null +++ b/scripts/tiny_llava/eval/vizwiz_v1.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0 +LLM_VARIANT="${LLM_VERSION#*/}" + +VT_VERSION=openai/clip-vit-large-patch14-336 +VT_VARIANT="${VT_VERSION#*/}" + +# MODEL_PATH="./checkpoints/tiny-llava-v1-${LLM_VARIANT}-${VT_VARIANT}-finetune" +# MODEL_NAME="tiny-llava-v1-1.1b-sharegpt4v" +# MODEL_BASE="./checkpoints/tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}-pretrain" +MODEL_PATH="./checkpoints/tiny-llava-v1-1.1B-${VT_VARIANT}-finetune" +MODEL_NAME="tiny-llava-v1-1.1B-${VT_VARIANT}" + +EVAL_DIR="/root/autodl-tmp/data/eval" + +python -m llava.eval.model_vqa_loader \ + --model-path $MODEL_PATH \ + --question-file $EVAL_DIR/vizwiz/llava_test.jsonl \ + --image-folder $EVAL_DIR/vizwiz/test \ + --answers-file $EVAL_DIR/vizwiz/answers/$MODEL_NAME.jsonl \ + --temperature 0 \ + --conv-mode v1 + +python scripts/convert_vizwiz_for_submission.py \ + --annotation-file $EVAL_DIR/vizwiz/llava_test.jsonl \ + --result-file $EVAL_DIR/vizwiz/answers/$MODEL_NAME.jsonl \ + --result-upload-file $EVAL_DIR/vizwiz/answers_upload/$MODEL_NAME.json diff --git a/scripts/tiny_llava/eval/vqav2.sh b/scripts/tiny_llava/eval/vqav2.sh new file mode 100644 index 0000000..4d076c4 --- /dev/null +++ b/scripts/tiny_llava/eval/vqav2.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +gpu_list="${CUDA_VISIBLE_DEVICES:-0}" +IFS=',' read -ra GPULIST <<< "$gpu_list" + +CHUNKS=${#GPULIST[@]} + +SPLIT="llava_vqav2_mscoco_test-dev2015" + +#LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0 +#LLM_VARIANT="${LLM_VERSION#*/}" +# +#VT_VERSION=openai/clip-vit-large-patch14-336 +#VT_VARIANT="${VT_VERSION#*/}" +# +#MODEL_PATH="./checkpoints/tiny-tinyllava-v1-${LLM_VARIANT}-${VT_VARIANT}-finetune" +#MODEL_NAME="tiny-tinyllava-v1-1.1b-sharegpt4v" +#EVAL_DIR="/root/autodl-tmp/data/eval" +LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0 +LLM_VARIANT="${LLM_VERSION#*/}" + +VT_VERSION=openai/clip-vit-large-patch14-336 +VT_VARIANT="${VT_VERSION#*/}" + +MODEL_PATH="./checkpoints/tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}-finetune-lora" +MODEL_BASE="./checkpoints/tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}-pretrain" +MODEL_NAME="tiny-llava-v1.5-${LLM_VARIANT}-${VT_VARIANT}" +EVAL_DIR="/root/autodl-tmp/data/eval" + +for IDX in $(seq 0 $((CHUNKS-1))); do + CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m tinyllava.eval.model_vqa_loader \ + --model-path $MODEL_PATH \ + --question-file $EVAL_DIR/vqav2/$SPLIT.jsonl \ + --image-folder $EVAL_DIR/vqav2/test2015 \ + --answers-file $EVAL_DIR/vqav2/answers/$SPLIT/$MODEL_NAME/${CHUNKS}_${IDX}.jsonl \ + --num-chunks $CHUNKS \ + --chunk-idx $IDX \ + --temperature 0 \ + --conv-mode vicuna_v1 & +done + +wait + +output_file=$EVAL_DIR/vqav2/answers/$SPLIT/$MODEL_NAME/merge.jsonl + +# Clear out the output file if it exists. +> "$output_file" + +# Loop through the indices and concatenate each file. 
+for IDX in $(seq 0 $((CHUNKS-1))); do + cat $EVAL_DIR/vqav2/answers/$SPLIT/$MODEL_NAME/${CHUNKS}_${IDX}.jsonl >> "$output_file" +done + +python scripts/convert_vqav2_for_submission.py --split $SPLIT --ckpt $MODEL_NAME --dir $EVAL_DIR/vqav2 + diff --git a/scripts/tiny_llava/finetune.sh b/scripts/tiny_llava/finetune.sh new file mode 100644 index 0000000..ceca30a --- /dev/null +++ b/scripts/tiny_llava/finetune.sh @@ -0,0 +1,56 @@ +#!/bin/bash +if [ $# -ne 5 ]; then + echo "Usage: $0 " + exit 1 +fi + +# Assign the arguments to variables +LLM_VERSION="$1" +VT_VERSION="$2" +DATA_PATH="$3" +IMAGE_PATH="$4" +VERSION="$5" + +# LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0 +# VT_VERSION=openai/clip-vit-base-patch16 +# DATA_PATH=/root/autodl-tmp/data/pretraining_data/LLaVA-Data/text_files/llava_v1_5_mix665k.json +# IMAGE_PATH=/root/autodl-tmp/data/pretraining_data/LLaVA-Data +VT_VARIANT="${VT_VERSION#*/}" +LLM_VARIANT="${LLM_VERSION#*/}" + +deepspeed llava/train/train_mem.py \ + --deepspeed ./scripts/zero3.json \ + --model_name_or_path $LLM_VERSION \ + --version v1 \ + --data_path $DATA_PATH \ + --image_folder $IMAGE_PATH\ + --vision_tower $VT_VERSION \ + --pretrain_mm_mlp_adapter ./checkpoints/tiny-llava-${VERSION}-${LLM_VARIANT}-${VT_VARIANT}-pretrain/mm_projector.bin \ + --mm_projector_type mlp2x_gelu \ + --mm_vision_select_layer -2 \ + --mm_use_im_start_end False \ + --mm_use_im_patch_token False \ + --image_aspect_ratio pad \ + --group_by_modality_length True \ + --fp16 True \ + --output_dir ./checkpoints/tiny-llava-${VERSION}-${LLM_VARIANT}-${VT_VARIANT}-finetune \ + --num_train_epochs 1 \ + --per_device_train_batch_size 16 \ + --per_device_eval_batch_size 4 \ + --gradient_accumulation_steps 1 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 5000 \ + --save_total_limit 1 \ + --learning_rate 2e-5 \ + --weight_decay 0. 
\ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --tf32 False \ + --model_max_length 2048 \ + --gradient_checkpointing True \ + --dataloader_num_workers 30 \ + --lazy_preprocess True \ + --report_to wandb \ + --run_name llava-${VERSION}-finetune-${LLM_VARIANT}-${VT_VARIANT} diff --git a/scripts/tiny_llava/finetune_lora.sh b/scripts/tiny_llava/finetune_lora.sh new file mode 100644 index 0000000..9b3f9e6 --- /dev/null +++ b/scripts/tiny_llava/finetune_lora.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +if [ $# -ne 5 ]; then + echo "Usage: $0 " + exit 1 +fi + +# Assign the arguments to variables +LLM_VERSION="$1" +VT_VERSION="$2" +DATA_PATH="$3" +IMAGE_PATH="$4" +VERSION="$5" + +VT_VARIANT="${VT_VERSION#*/}" +LLM_VARIANT="${LLM_VERSION#*/}" + +# --model_name_or_path ./checkpoints/tiny-tinyllava-type-2-${LLM_VARIANT}-${VT_VARIANT}-pretrain +deepspeed llava/train/train_mem.py \ + --lora_enable True --lora_r 128 --lora_alpha 256 \ + --deepspeed ./scripts/zero3.json \ + --model_name_or_path $LLM_VERSION \ + --pretrain_mm_mlp_adapter ./checkpoints/tiny-llava-${VERSION}-${LLM_VARIANT}-${VT_VARIANT}-pretrain/mm_projector.bin \ + --version v1 \ + --data_path $DATA_PATH \ + --image_folder $IMAGE_PATH \ + --vision_tower $VT_VERSION \ + --mm_projector_type mlp2x_gelu \ + --mm_vision_select_layer -2 \ + --mm_use_im_start_end False \ + --mm_use_im_patch_token False \ + --image_aspect_ratio pad \ + --group_by_modality_length True \ + --fp16 True \ + --output_dir ./checkpoints/tiny-llava-${VERSION}-${LLM_VARIANT}-${VT_VARIANT}-finetune-lora \ + --num_train_epochs 1 \ + --per_device_train_batch_size 16 \ + --per_device_eval_batch_size 4 \ + --gradient_accumulation_steps 1 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 5000 \ + --save_total_limit 1 \ + --learning_rate 2e-5 \ + --weight_decay 0. 
\ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --tf32 False \ + --model_max_length 2048 \ + --gradient_checkpointing True \ + --dataloader_num_workers 30 \ + --lazy_preprocess True \ + --report_to wandb \ + --run_name tiny-llava-${VERSION}-finetune-lora-${LLM_VARIANT}-${VT_VARIANT} diff --git a/scripts/tiny_llava/finetune_lora_llm_open.sh b/scripts/tiny_llava/finetune_lora_llm_open.sh new file mode 100644 index 0000000..e69de29 diff --git a/scripts/tiny_llava/finetune_lora_type3.sh b/scripts/tiny_llava/finetune_lora_type3.sh new file mode 100644 index 0000000..410f5a1 --- /dev/null +++ b/scripts/tiny_llava/finetune_lora_type3.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +if [ $# -ne 5 ]; then + echo "Usage: $0 " + exit 1 +fi + +# Assign the arguments to variables +LLM_VERSION="$1" +VT_VERSION="$2" +DATA_PATH="$3" +IMAGE_PATH="$4" +VERSION="$5" + +VT_VARIANT="${VT_VERSION#*/}" +LLM_VARIANT="${LLM_VERSION#*/}" + +# --model_name_or_path ./checkpoints/tiny-tinyllava-type-2-${LLM_VARIANT}-${VT_VARIANT}-pretrain +deepspeed llava/train/train_mem.py \ + --lora_enable True --lora_r 128 --lora_alpha 256 \ + --deepspeed ./scripts/zero3.json \ + --model_name_or_path ./checkpoints/tiny-llava-${VERSION}-${LLM_VARIANT}-${VT_VARIANT}-pretrain \ + --version v1 \ + --data_path $DATA_PATH \ + --image_folder $IMAGE_PATH \ + --vision_tower $VT_VERSION \ + --mm_projector_type mlp2x_gelu \ + --mm_vision_select_layer -2 \ + --mm_use_im_start_end False \ + --mm_use_im_patch_token False \ + --image_aspect_ratio pad \ + --group_by_modality_length True \ + --fp16 True \ + --output_dir ./checkpoints/tiny-llava-${VERSION}-${LLM_VARIANT}-${VT_VARIANT}-finetune-lora \ + --num_train_epochs 1 \ + --per_device_train_batch_size 16 \ + --per_device_eval_batch_size 4 \ + --gradient_accumulation_steps 1 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 5000 \ + --save_total_limit 1 \ + --learning_rate 2e-5 \ + --weight_decay 0. 
\ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --tf32 False \ + --model_max_length 2048 \ + --gradient_checkpointing True \ + --dataloader_num_workers 30 \ + --lazy_preprocess True \ + --report_to wandb \ + --run_name tiny-llava-${VERSION}-finetune-lora-${LLM_VARIANT}-${VT_VARIANT} diff --git a/scripts/tiny_llava/finetune_resamplers.sh b/scripts/tiny_llava/finetune_resamplers.sh new file mode 100644 index 0000000..e69de29 diff --git a/scripts/tiny_llava/finetune_type4.sh b/scripts/tiny_llava/finetune_type4.sh new file mode 100644 index 0000000..83be89a --- /dev/null +++ b/scripts/tiny_llava/finetune_type4.sh @@ -0,0 +1,55 @@ +#!/bin/bash +if [ $# -ne 5 ]; then + echo "Usage: $0 " + exit 1 +fi + +# Assign the arguments to variables +LLM_VERSION="$1" +VT_VERSION="$2" +DATA_PATH="$3" +IMAGE_PATH="$4" +VERSION="$5" + +# LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0 +# VT_VERSION=openai/clip-vit-base-patch16 +# DATA_PATH=/root/autodl-tmp/data/pretraining_data/LLaVA-Data/text_files/llava_v1_5_mix665k.json +# IMAGE_PATH=/root/autodl-tmp/data/pretraining_data/LLaVA-Data +VT_VARIANT="${VT_VERSION#*/}" +LLM_VARIANT="${LLM_VERSION#*/}" + +deepspeed llava/train/train_mem.py \ + --deepspeed ./scripts/zero3.json \ + --model_name_or_path ./checkpoints/tiny-llava-${VERSION}-${LLM_VARIANT}-${VT_VARIANT}-pretrain \ + --version v1 \ + --data_path $DATA_PATH \ + --image_folder $IMAGE_PATH\ + --vision_tower $VT_VERSION \ + --mm_projector_type mlp2x_gelu \ + --mm_vision_select_layer -2 \ + --mm_use_im_start_end False \ + --mm_use_im_patch_token False \ + --image_aspect_ratio pad \ + --group_by_modality_length True \ + --fp16 True \ + --output_dir ./checkpoints/tiny-llava-${VERSION}-${LLM_VARIANT}-${VT_VARIANT}-finetune \ + --num_train_epochs 1 \ + --per_device_train_batch_size 16 \ + --per_device_eval_batch_size 4 \ + --gradient_accumulation_steps 1 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 5000 \ + --save_total_limit 1 \ + --learning_rate 2e-5 \ + --weight_decay 0. 
diff --git a/scripts/tiny_llava/pretrain.sh b/scripts/tiny_llava/pretrain.sh
new file mode 100755
index 0000000..23be997
--- /dev/null
+++ b/scripts/tiny_llava/pretrain.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+
+if [ $# -ne 5 ]; then
+    echo "Usage: $0 <LLM_VERSION> <VT_VERSION> <DATA_PATH> <IMAGE_PATH> <VERSION>"
+    exit 1
+fi
+
+# Assign the arguments to variables
+LLM_VERSION="$1"
+VT_VERSION="$2"
+DATA_PATH="$3"
+IMAGE_PATH="$4"
+VERSION="$5"
+
+# LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0
+# VT_VERSION=openai/clip-vit-base-patch16
+# DATA_PATH=/root/autodl-tmp/data/tinyllava/blip_laion_cc_sbu_558k.json
+# IMAGE_PATH=/root/autodl-tmp/data/tinyllava/llava_pretrain/images
+VT_VARIANT="${VT_VERSION#*/}"
+LLM_VARIANT="${LLM_VERSION#*/}"
+
+deepspeed llava/train/train_mem.py \
+    --deepspeed ./scripts/zero3.json \
+    --model_name_or_path $LLM_VERSION \
+    --version plain \
+    --data_path $DATA_PATH \
+    --image_folder $IMAGE_PATH \
+    --vision_tower $VT_VERSION \
+    --mm_projector_type mlp2x_gelu \
+    --mm_vision_select_layer -2 \
+    --mm_use_im_start_end False \
+    --mm_use_im_patch_token False \
+    --tune_mm_mlp_adapter True \
+    --fp16 True \
+    --output_dir ./checkpoints/tiny-llava-"${VERSION}"-"${LLM_VARIANT}"-"${VT_VARIANT}"-pretrain \
+    --num_train_epochs 1 \
+    --per_device_train_batch_size 32 \
+    --per_device_eval_batch_size 4 \
+    --gradient_accumulation_steps 1 \
+    --evaluation_strategy "no" \
+    --save_strategy "steps" \
+    --save_steps 2400 \
+    --save_total_limit 1 \
+    --learning_rate 1e-3 \
+    --weight_decay 0. \
+    --warmup_ratio 0.03 \
+    --lr_scheduler_type "cosine" \
+    --logging_steps 1 \
+    --tf32 False \
+    --model_max_length 2048 \
+    --gradient_checkpointing True \
+    --dataloader_num_workers 30 \
+    --lazy_preprocess True \
+    --report_to wandb \
+    --run_name tiny-llava-"${VERSION}"-pretrain-"${LLM_VARIANT}"-"${VT_VARIANT}"
diff --git a/scripts/tiny_llava/pretrain_baselines.sh b/scripts/tiny_llava/pretrain_baselines.sh
new file mode 100644
index 0000000..1bacde2
--- /dev/null
+++ b/scripts/tiny_llava/pretrain_baselines.sh
@@ -0,0 +1,9 @@
+LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0
+VT_VERSIONS=(openai/clip-vit-large-patch14-336 openai/clip-vit-large-patch14 openai/clip-vit-base-patch16 openai/clip-vit-large-patch32)
+DATA_PATH=/root/autodl-tmp/data/llava/blip_laion_cc_sbu_558k.json
+IMAGE_PATH=/root/autodl-tmp/data/llava/llava_pretrain/images
+
+
+for VT_VERSION in "${VT_VERSIONS[@]}"; do
+    bash scripts/tiny_llava/pretrain.sh "$LLM_VERSION" "$VT_VERSION" "$DATA_PATH" "$IMAGE_PATH"
+done
diff --git a/scripts/tiny_llava/pretrain_llm_open.sh b/scripts/tiny_llava/pretrain_llm_open.sh
new file mode 100644
index 0000000..e69de29
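Note that pretrain.sh exits at its usage check unless exactly five arguments are supplied, while the loop in pretrain_baselines.sh passes only four. A minimal sketch of a fix, assuming an arbitrary run tag (the tag "base" is hypothetical, chosen to mirror the "type-2"/"type-3"/"type-4" tags used elsewhere in this patch):

for VT_VERSION in "${VT_VERSIONS[@]}"; do
    # pass a fifth argument so pretrain.sh receives the VERSION it expects
    bash scripts/tiny_llava/pretrain.sh "$LLM_VERSION" "$VT_VERSION" "$DATA_PATH" "$IMAGE_PATH" base
done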
diff --git a/scripts/tiny_llava/pretrain_type3.sh b/scripts/tiny_llava/pretrain_type3.sh
new file mode 100644
index 0000000..df53ca8
--- /dev/null
+++ b/scripts/tiny_llava/pretrain_type3.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+
+if [ $# -ne 5 ]; then
+    echo "Usage: $0 <LLM_VERSION> <VT_VERSION> <DATA_PATH> <IMAGE_PATH> <VERSION>"
+    exit 1
+fi
+
+# Assign the arguments to variables
+LLM_VERSION="$1"
+VT_VERSION="$2"
+DATA_PATH="$3"
+IMAGE_PATH="$4"
+VERSION="$5"
+
+# LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0
+# VT_VERSION=openai/clip-vit-base-patch16
+# DATA_PATH=/root/autodl-tmp/data/tinyllava/blip_laion_cc_sbu_558k.json
+# IMAGE_PATH=/root/autodl-tmp/data/tinyllava/llava_pretrain/images
+VT_VARIANT="${VT_VERSION#*/}"
+LLM_VARIANT="${LLM_VERSION#*/}"
+
+deepspeed llava/train/train_mem.py \
+    --deepspeed ./scripts/zero3.json \
+    --model_name_or_path $LLM_VERSION \
+    --version plain \
+    --data_path $DATA_PATH \
+    --image_folder $IMAGE_PATH \
+    --vision_tower $VT_VERSION \
+    --mm_projector_type mlp2x_gelu \
+    --mm_vision_select_layer -2 \
+    --mm_use_im_start_end False \
+    --mm_use_im_patch_token False \
+    --fp16 True \
+    --output_dir ./checkpoints/tiny-llava-"${VERSION}"-"${LLM_VARIANT}"-"${VT_VARIANT}"-pretrain \
+    --num_train_epochs 1 \
+    --per_device_train_batch_size 32 \
+    --per_device_eval_batch_size 4 \
+    --gradient_accumulation_steps 1 \
+    --evaluation_strategy "no" \
+    --save_strategy "steps" \
+    --save_steps 2400 \
+    --save_total_limit 1 \
+    --learning_rate 2e-5 \
+    --weight_decay 0. \
+    --warmup_ratio 0.03 \
+    --lr_scheduler_type "cosine" \
+    --logging_steps 1 \
+    --tf32 False \
+    --model_max_length 2048 \
+    --gradient_checkpointing True \
+    --dataloader_num_workers 30 \
+    --lazy_preprocess True \
+    --report_to wandb \
+    --run_name tiny-llava-"${VERSION}"-pretrain-"${LLM_VARIANT}"-"${VT_VARIANT}"
diff --git a/scripts/tiny_llava/pretrain_type4.sh b/scripts/tiny_llava/pretrain_type4.sh
new file mode 100644
index 0000000..df53ca8
--- /dev/null
+++ b/scripts/tiny_llava/pretrain_type4.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+
+if [ $# -ne 5 ]; then
+    echo "Usage: $0 <LLM_VERSION> <VT_VERSION> <DATA_PATH> <IMAGE_PATH> <VERSION>"
+    exit 1
+fi
+
+# Assign the arguments to variables
+LLM_VERSION="$1"
+VT_VERSION="$2"
+DATA_PATH="$3"
+IMAGE_PATH="$4"
+VERSION="$5"
+
+# LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0
+# VT_VERSION=openai/clip-vit-base-patch16
+# DATA_PATH=/root/autodl-tmp/data/tinyllava/blip_laion_cc_sbu_558k.json
+# IMAGE_PATH=/root/autodl-tmp/data/tinyllava/llava_pretrain/images
+VT_VARIANT="${VT_VERSION#*/}"
+LLM_VARIANT="${LLM_VERSION#*/}"
+
+deepspeed llava/train/train_mem.py \
+    --deepspeed ./scripts/zero3.json \
+    --model_name_or_path $LLM_VERSION \
+    --version plain \
+    --data_path $DATA_PATH \
+    --image_folder $IMAGE_PATH \
+    --vision_tower $VT_VERSION \
+    --mm_projector_type mlp2x_gelu \
+    --mm_vision_select_layer -2 \
+    --mm_use_im_start_end False \
+    --mm_use_im_patch_token False \
+    --fp16 True \
+    --output_dir ./checkpoints/tiny-llava-"${VERSION}"-"${LLM_VARIANT}"-"${VT_VARIANT}"-pretrain \
+    --num_train_epochs 1 \
+    --per_device_train_batch_size 32 \
+    --per_device_eval_batch_size 4 \
+    --gradient_accumulation_steps 1 \
+    --evaluation_strategy "no" \
+    --save_strategy "steps" \
+    --save_steps 2400 \
+    --save_total_limit 1 \
+    --learning_rate 2e-5 \
+    --weight_decay 0. \
+    --warmup_ratio 0.03 \
+    --lr_scheduler_type "cosine" \
+    --logging_steps 1 \
+    --tf32 False \
+    --model_max_length 2048 \
+    --gradient_checkpointing True \
+    --dataloader_num_workers 30 \
+    --lazy_preprocess True \
+    --report_to wandb \
+    --run_name tiny-llava-"${VERSION}"-pretrain-"${LLM_VARIANT}"-"${VT_VARIANT}"
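pretrain_type3.sh and pretrain_type4.sh are added with the same blob hash (df53ca8), i.e. identical contents; both differ from pretrain.sh only in dropping --tune_mm_mlp_adapter and lowering the learning rate from 1e-3 to 2e-5. Once the patch is applied this can be confirmed locally:

# verify the two pretrain variants are byte-identical
diff scripts/tiny_llava/pretrain_type3.sh scripts/tiny_llava/pretrain_type4.sh && echo "identical"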
diff --git a/scripts/tiny_llava/train_baselines.sh b/scripts/tiny_llava/train_baselines.sh
new file mode 100644
index 0000000..494730b
--- /dev/null
+++ b/scripts/tiny_llava/train_baselines.sh
@@ -0,0 +1,31 @@
+#LLM_VERSION=susnato/phi-1_5_dev
+LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0
+#VT_VERSIONS=(openai/clip-vit-large-patch14-336 openai/clip-vit-large-patch14 openai/clip-vit-base-patch16 openai/clip-vit-base-patch32)
+VT_VERSIONS=(openai/clip-vit-large-patch14-336)
+#DATA_PATH=/root/autodl-tmp/data/text_files/really_cleaned_share-captioner_coco_lcs_sam_1246k_1107.json
+DATA_PATH=/root/autodl-tmp/data/llava/blip_laion_cc_sbu_558k.json
+#FINETUNE_DATA_PATH=/root/autodl-tmp/data/text_files/cleaned_sharegpt4v_mix665k_cap23k_coco-ap9k_lcs3k_sam9k_div2k.json
+FINETUNE_DATA_PATH=/root/autodl-tmp/data/text_files/llava_v1_5_mix665k.json
+#IMAGE_PATH=/root/autodl-tmp/data/
+#FINETUNE_IMAGE_PATH=/root/autodl-tmp/data
+IMAGE_PATH=/root/autodl-tmp/data/llava/llava_pretrain/images
+FINETUNE_IMAGE_PATH=/root/autodl-tmp/data
+
+## type 2 training
+#for VT_VERSION in "${VT_VERSIONS[@]}"; do
+## bash scripts/tiny_llava/pretrain.sh "$LLM_VERSION" "$VT_VERSION" "$DATA_PATH" "$IMAGE_PATH" "type-2"
+# bash scripts/tiny_llava/finetune_lora.sh "$LLM_VERSION" "$VT_VERSION" "$FINETUNE_DATA_PATH" "$FINETUNE_IMAGE_PATH" type-2
+#done
+#
+#
+## type 3 training
+#for VT_VERSION in "${VT_VERSIONS[@]}"; do
+# bash scripts/tiny_llava/pretrain_type3.sh "$LLM_VERSION" "$VT_VERSION" "$DATA_PATH" "$IMAGE_PATH" type-3
+# bash scripts/tiny_llava/finetune_lora_type3.sh "$LLM_VERSION" "$VT_VERSION" "$FINETUNE_DATA_PATH" "$FINETUNE_IMAGE_PATH" type-3
+#done
+
+# type-4 training
+for VT_VERSION in "${VT_VERSIONS[@]}"; do
+    bash scripts/tiny_llava/pretrain_type4.sh "$LLM_VERSION" "$VT_VERSION" "$DATA_PATH" "$IMAGE_PATH" type-4
+    bash scripts/tiny_llava/finetune_type4.sh "$LLM_VERSION" "$VT_VERSION" "$FINETUNE_DATA_PATH" "$FINETUNE_IMAGE_PATH" type-4
+done
diff --git a/scripts/tiny_llava/train_baselines_phi_jia.sh b/scripts/tiny_llava/train_baselines_phi_jia.sh
new file mode 100644
index 0000000..ee1973a
--- /dev/null
+++ b/scripts/tiny_llava/train_baselines_phi_jia.sh
@@ -0,0 +1,18 @@
+LLM_VERSION=susnato/phi-1_5_dev
+# LLM_VERSION=TinyLlama/TinyLlama-1.1B-Chat-v1.0
+VT_VERSIONS=(openai/clip-vit-large-patch14-336)
+#DATA_PATH=/root/autodl-tmp/data/text_files/really_cleaned_share-captioner_coco_lcs_sam_1246k_1107.json
+DATA_PATH=/root/autodl-tmp/data/llava/blip_laion_cc_sbu_558k.json
+#FINETUNE_DATA_PATH=/root/autodl-tmp/data/text_files/cleaned_sharegpt4v_mix665k_cap23k_coco-ap9k_lcs3k_sam9k_div2k.json
+FINETUNE_DATA_PATH=/root/autodl-tmp/data/text_files/llava_v1_5_mix665k.json
+#IMAGE_PATH=/root/autodl-tmp/data/
+#FINETUNE_IMAGE_PATH=/root/autodl-tmp/data
+IMAGE_PATH=/root/autodl-tmp/data/llava/llava_pretrain/images
+FINETUNE_IMAGE_PATH=/root/autodl-tmp/data
+
+
+for VT_VERSION in "${VT_VERSIONS[@]}"; do
+    # bash scripts/tiny_llava/pretrain.sh "$LLM_VERSION" "$VT_VERSION" "$DATA_PATH" "$IMAGE_PATH"
+    bash scripts/tiny_llava/finetune_lora.sh "$LLM_VERSION" "$VT_VERSION" "$FINETUNE_DATA_PATH" "$FINETUNE_IMAGE_PATH"
+done
+
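One caveat: the active loop in train_baselines_phi_jia.sh calls finetune_lora.sh with four arguments, yet the tail of finetune_lora.sh shown earlier in this patch interpolates ${VERSION} into its run_name. If that script uses the same five-argument check as the other scripts here, the call will stop at the usage message; a sketch of a fix with a hypothetical run tag:

for VT_VERSION in "${VT_VERSIONS[@]}"; do
    bash scripts/tiny_llava/finetune_lora.sh "$LLM_VERSION" "$VT_VERSION" "$FINETUNE_DATA_PATH" "$FINETUNE_IMAGE_PATH" phi-1_5
done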
diff --git a/scripts/zero2.json b/scripts/zero2.json
new file mode 100644
index 0000000..7a01fda
--- /dev/null
+++ b/scripts/zero2.json
@@ -0,0 +1,23 @@
+{
+    "fp16": {
+        "enabled": "auto",
+        "loss_scale": 0,
+        "loss_scale_window": 1000,
+        "initial_scale_power": 16,
+        "hysteresis": 2,
+        "min_loss_scale": 1
+    },
+    "bf16": {
+        "enabled": "auto"
+    },
+    "train_micro_batch_size_per_gpu": "auto",
+    "train_batch_size": "auto",
+    "gradient_accumulation_steps": "auto",
+    "zero_optimization": {
+        "stage": 2,
+        "overlap_comm": true,
+        "contiguous_gradients": true,
+        "sub_group_size": 1e9,
+        "reduce_bucket_size": "auto"
+    }
+}
\ No newline at end of file
diff --git a/scripts/zero3.json b/scripts/zero3.json
new file mode 100644
index 0000000..8ff461f
--- /dev/null
+++ b/scripts/zero3.json
@@ -0,0 +1,28 @@
+{
+    "fp16": {
+        "enabled": "auto",
+        "loss_scale": 0,
+        "loss_scale_window": 1000,
+        "initial_scale_power": 16,
+        "hysteresis": 2,
+        "min_loss_scale": 1
+    },
+    "bf16": {
+        "enabled": "auto"
+    },
+    "train_micro_batch_size_per_gpu": "auto",
+    "train_batch_size": "auto",
+    "gradient_accumulation_steps": "auto",
+    "zero_optimization": {
+        "stage": 3,
+        "overlap_comm": true,
+        "contiguous_gradients": true,
+        "sub_group_size": 1e9,
+        "reduce_bucket_size": "auto",
+        "stage3_prefetch_bucket_size": "auto",
+        "stage3_param_persistence_threshold": "auto",
+        "stage3_max_live_parameters": 1e9,
+        "stage3_max_reuse_distance": 1e9,
+        "stage3_gather_16bit_weights_on_model_save": true
+    }
+}
\ No newline at end of file
diff --git a/scripts/zero3_offload.json b/scripts/zero3_offload.json
new file mode 100644
index 0000000..2dcde84
--- /dev/null
+++ b/scripts/zero3_offload.json
@@ -0,0 +1,56 @@
+{
+    "fp16": {
+        "enabled": "auto",
+        "loss_scale": 0,
+        "loss_scale_window": 1000,
+        "initial_scale_power": 16,
+        "hysteresis": 2,
+        "min_loss_scale": 1
+    },
+    "bf16": {
+        "enabled": "auto"
+    },
+    "optimizer": {
+        "type": "AdamW",
+        "params": {
+            "lr": "auto",
+            "betas": "auto",
+            "eps": "auto",
+            "weight_decay": "auto"
+        }
+    },
+    "scheduler": {
+        "type": "WarmupLR",
+        "params": {
+            "warmup_min_lr": "auto",
+            "warmup_max_lr": "auto",
+            "warmup_num_steps": "auto"
+        }
+    },
+    "zero_optimization": {
+        "stage": 3,
+        "offload_optimizer": {
+            "device": "cpu",
+            "pin_memory": true
+        },
+        "offload_param": {
+            "device": "cpu",
+            "pin_memory": true
+        },
+        "overlap_comm": true,
+        "contiguous_gradients": true,
+        "sub_group_size": 1e9,
+        "reduce_bucket_size": "auto",
+        "stage3_prefetch_bucket_size": "auto",
+        "stage3_param_persistence_threshold": "auto",
+        "stage3_max_live_parameters": 1e9,
+        "stage3_max_reuse_distance": 1e9,
+        "gather_16bit_weights_on_model_save": true
+    },
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": "auto",
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "steps_per_print": 1e5,
+    "wall_clock_breakdown": false
+}
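The three DeepSpeed configs keep fp16/bf16 on "auto", so precision follows the --fp16/--bf16 flags passed by the launch scripts; only zero3_offload.json additionally pins the optimizer and scheduler (AdamW + WarmupLR) and offloads optimizer state and parameters to CPU. A quick, standard-library-only sanity check that the files parse (note that zero2.json and zero3.json are committed without a trailing newline):

# parse each config to catch JSON typos before launching a run
for f in scripts/zero2.json scripts/zero3.json scripts/zero3_offload.json; do
    python -m json.tool "$f" > /dev/null && echo "$f: valid JSON"
done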