kangreen0210
diff --git a/‎lmms_eval/__main__.py
Lines changed: 38 additions & 34 deletions b/‎lmms_eval/__main__.py
Lines changed: 38 additions & 34 deletions
diff --git a/‎lmms_eval/models/__init__.py
Lines changed: 1 addition & 0 deletions b/‎lmms_eval/models/__init__.py
Lines changed: 1 addition & 0 deletions
diff --git a/‎lmms_eval/models/fuyu.py
Lines changed: 137 additions & 0 deletions b/‎lmms_eval/models/fuyu.py
Lines changed: 137 additions & 0 deletions
diff --git a/‎lmms_eval/models/otterhd.py
Lines changed: 27 additions & 12 deletions b/‎lmms_eval/models/otterhd.py
Lines changed: 27 additions & 12 deletions
@@ -119,6 +119,11 @@ def parse_eval_args() -> argparse.Namespace:
         default="",
         help="Comma separated string arguments passed to wandb.init, e.g. `project=lmms-eval,job_type=eval",
     )
+    parser.add_argument(
+        "--timezone",
+        default="Asia/Singapore",
+        help="Timezone for datetime string, e.g. Asia/Singapore, America/New_York, America/Los_Angeles",
+    )
     args = parser.parse_args()
     return args
 
@@ -206,27 +211,11 @@ def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None:
             )
             # eval_logger.warn(f"Tasks {missing} were not found. Try `lmms-eval --tasks list` for list of available tasks.")
 
-    if args.output_path:
-        hash_input = f"{args.model_args}_{args.tasks}".encode("utf-8")
-        hash_output = hashlib.sha256(hash_input).hexdigest()[:6]
-        datetime_str = utils.get_datetime_str()
-        path = Path(args.output_path)
-        path = path.expanduser().resolve().joinpath(f"{args.model}_{datetime_str}_{hash_output}_{args.log_samples_suffix}")
-        # check if file or 'dir/results.json' exists
-        if path.is_file() or path.joinpath("results.json").is_file():
-            eval_logger.warning(f"File already exists at {path}. Results will be overwritten.")
-            output_path_file = path.joinpath("results.json")
-            assert not path.is_file(), "File already exists"
-        # if path json then get parent dir
-        elif path.suffix in (".json", ".jsonl"):
-            output_path_file = path
-        else:
-            output_path_file = path.joinpath("results.json")
-    elif args.log_samples and not args.output_path:
-        assert args.output_path, "Specify --output_path"
-
     eval_logger.info(f"Selected Tasks: {task_names}")
 
+    # set datetime before evaluation
+    datetime_str = utils.get_datetime_str(timezone=args.timezone)
+
     results = evaluator.simple_evaluate(
         model=args.model,
         model_args=args.model_args,
@@ -241,8 +230,22 @@ def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None:
         gen_kwargs=args.gen_kwargs,
     )
 
-    if results is not None:
+    if args.output_path:
+        hash_input = f"{args.model_args}".encode("utf-8")
+        hash_output = hashlib.sha256(hash_input).hexdigest()[:6]
+        path = Path(args.output_path)
+        path = path.expanduser().resolve().joinpath(f"{args.model}").joinpath(f"model_args_{hash_output}").joinpath(f"{datetime_str}")
         path.mkdir(parents=True, exist_ok=True)
+        assert path.is_dir(), f"Output path {path} is not a directory"
+
+        output_path_file = path.joinpath("results.json")
+        if output_path_file.exists():
+            eval_logger.warning(f"Output file {output_path_file} already exists and will be overwritten.")
+
+    elif args.log_samples and not args.output_path:
+        assert args.output_path, "Specify --output_path"
+
+    if results is not None:
         if args.log_samples:
             samples = results.pop("samples")
         dumped = json.dumps(results, indent=4, default=_handle_non_serializable)
@@ -254,7 +257,7 @@ def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None:
 
             if args.log_samples:
                 for task_name, config in results["configs"].items():
-                    output_name = f"{args.model}_{task_name}_{args.log_samples_suffix}"
+                    output_name = f"{task_name}_{args.log_samples_suffix}"
                     filename = path.joinpath(f"{output_name}.json")
                     # Structure the data with 'args' and 'logs' keys
                     data_to_dump = {"args": vars(args), "config": config, "logs": sorted(samples[task_name], key=lambda x: x["doc_id"])}  # Convert Namespace to dict
@@ -293,24 +296,25 @@ def print_results(args, results):
     # initialize Accelerator
     accelerator = Accelerator()
     all_args_dict = vars(args)
+    wandb_run = None
 
     if accelerator.is_main_process:
         # initialize a W&B run only on rank 0
         wandb_args_dict = utils.simple_parse_args_string(args.wandb_args)
-        if "name" not in wandb_args_dict:
-            if "config" not in all_args_dict:
-                # use the model name and task names as run name
-                task_names = args.tasks.replace(",", "_")
-                wandb_args_dict["name"] = f"{args.model}_{task_names}_{args.log_samples_suffix}"
-                if args.num_fewshot:
-                    wandb_args_dict["name"] += f"_{args.num_fewshot}shot"
-            else:
-                # use the name of the config file as run name
-                wandb_args_dict["name"] = all_args_dict["config"].split("/")[-1].split(".")[0]
-        wandb_run = wandb.init(**wandb_args_dict)
+        if wandb_args_dict:
+            if "name" not in wandb_args_dict:
+                if "config" not in all_args_dict:
+                    # use the model name and task names as run name
+                    task_names = args.tasks.replace(",", "_")
+                    wandb_args_dict["name"] = f"{args.model}_{task_names}_{args.log_samples_suffix}"
+                    if args.num_fewshot:
+                        wandb_args_dict["name"] += f"_{args.num_fewshot}shot"
+                else:
+                    # use the name of the config file as run name
+                    wandb_args_dict["name"] = all_args_dict["config"].split("/")[-1].split(".")[0]
+            wandb_run = wandb.init(**wandb_args_dict)
         is_main_process = True
     else:
-        wandb_run = None
         is_main_process = False
 
     # run each config
@@ -319,5 +323,5 @@ def print_results(args, results):
         results = cli_evaluate(args, wandb_run)
         results_list.append(results)
 
-    if is_main_process:
+    if is_main_process and wandb_run is not None:
         wandb_run.finish()
@@ -1,3 +1,4 @@
 from .llava import Llava
 from .otterhd import OtterHD
 from .qwen_vl import Qwen_VL
+from .fuyu import Fuyu
@@ -0,0 +1,137 @@
+from transformers import FuyuForCausalLM, AutoTokenizer, FuyuImageProcessor, FuyuProcessor
+from lmms_eval.api.model import lmms
+from lmms_eval.api.registry import register_model
+import torch
+from PIL import Image
+from typing import List, Optional, Union, Tuple
+from lmms_eval import utils
+from lmms_eval.api.instance import Instance
+from tqdm import tqdm
+
+
+@register_model("fuyu")
+class Fuyu(lmms):
+    """
+    Fuyu Model
+    """
+
+    def __init__(
+        self,
+        pretrained: str = "adept/fuyu-8b",
+        device: Optional[str] = "cuda",
+        max_new_tokens: int = 256,
+        batch_size: Optional[Union[int, str]] = 1,
+        **kwargs,
+    ) -> None:
+        """Load the Fuyu checkpoint, its tokenizer and its processor.
+
+        Args:
+            pretrained: Checkpoint identifier passed to both ``from_pretrained`` calls.
+            device: Device the model is placed on; replaced by ``"cpu"`` when
+                ``torch.cuda.is_available()`` is false.
+            max_new_tokens: Stored on the instance as ``self.max_new_tokens``.
+            batch_size: Per-device batch size; converted with ``int()``.
+            **kwargs: Must be empty; anything else fails the assertion below.
+        """
+        super().__init__()
+        # Extra keyword arguments are not supported for now.
+        assert kwargs == {}, f"Unexpected kwargs: {kwargs}"
+
+        if torch.cuda.is_available():
+            self.device = device
+        else:
+            self.device = "cpu"
+
+        # Weights are loaded in bfloat16 and the module is put in eval mode.
+        self.model = FuyuForCausalLM.from_pretrained(pretrained, torch_dtype=torch.bfloat16, device_map=self.device)
+        self.model.eval()
+
+        # The processor wraps the tokenizer together with a default image processor.
+        tokenizer = AutoTokenizer.from_pretrained(pretrained)
+        self.tokenizer = tokenizer
+        image_processor = FuyuImageProcessor()
+        self.image_processor = image_processor
+        self.processor = FuyuProcessor(image_processor=image_processor, tokenizer=tokenizer)
+
+        self.max_new_tokens = max_new_tokens
+        self.batch_size_per_gpu = int(batch_size)
+
+    @property
+    def max_length(self):
+        """Return the tokenizer's ``model_max_length``."""
+        limit = self.tokenizer.model_max_length
+        return limit
+
+    # @property
+    # def max_gen_toks(self) -> int:
+    #     return self.max_new_tokens
+
+    @property
+    def batch_size(self):
+        """Return the per-device batch size stored by ``__init__``."""
+        per_device = self.batch_size_per_gpu
+        return per_device
+
+    def flatten(self, input, only_get_first=False):
+        """Concatenate the items of every inner iterable into one list.
+
+        Args:
+            input: Iterable of iterables.
+            only_get_first: When true, keep only the first item of each inner
+                iterable; empty inner iterables contribute nothing.
+
+        Returns:
+            A new flat list.
+        """
+        flattened = []
+        if only_get_first:
+            for group in input:
+                for item in group:
+                    flattened.append(item)
+                    break
+            return flattened
+        for group in input:
+            flattened.extend(group)
+        return flattened
+
+    def generate_until(self, requests: List[Instance]) -> List[str]:
+        """Generate one text response per request, processed in batches.
+
+        Each request's ``args`` is unpacked as ``(context, gen_kwargs,
+        doc_to_visual, doc_id, task, split)``. Only the first visual of every
+        document is used, and the task, split, ``doc_to_visual`` callable and
+        generation settings of the first request in a batch are applied to the
+        whole batch.
+
+        Args:
+            requests: Instances whose ``args`` hold the six fields above.
+
+        Returns:
+            The decoded responses, in the order the batches were produced.
+        """
+        # NOTE(review): responses are appended in batch order and never mapped back;
+        # if utils.Collator reorders the requests, callers may receive them out of
+        # request order - confirm against the Collator API.
+        res = []
+
+        def _collate(x):
+            # the negative sign on len(toks) sorts descending - this has a few advantages:
+            # - time estimates will always be over not underestimates, which is more useful for planning
+            # - to know the size of a batch when going through the list, you know the first one is always the batch
+            #   padded context length. this is useful to simplify the batching logic and more importantly to make
+            #   automatic adaptive batches much much easier to implement
+            # - any OOMs will happen right away rather than near the end
+            toks = self.tok_encode(x[0])
+            return -len(toks), x[0]
+
+        re_ords = utils.Collator([reg.args for reg in requests], _collate, grouping=True)
+        chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None)
+        # Ceiling division: one progress tick per batch, counting a final partial batch.
+        num_iters = (len(requests) + self.batch_size - 1) // self.batch_size
+        pbar = tqdm(total=num_iters, disable=(self.rank != 0), desc="Model Responding")
+        # The model dtype does not change between batches, so look it up once.
+        model_dtype = next(self.model.parameters()).dtype
+
+        for chunk in chunks:
+            contexts, all_gen_kwargs, doc_to_visual, doc_id, task, split = zip(*chunk)
+            task = task[0]
+            split = split[0]
+            visuals = [doc_to_visual[0](self.task_dict[task][split][ids]) for ids in doc_id]
+            visuals = self.flatten(visuals, only_get_first=True)
+
+            # Only max_new_tokens is forwarded to generate(). It is read without
+            # writing defaults back, because the dict belongs to the request.
+            # NOTE(review): the fallback is 1024, not self.max_new_tokens - confirm intended.
+            max_new_tokens = all_gen_kwargs[0].get("max_new_tokens", 1024)
+
+            formatted_contexts = [f"{context}\n" for context in contexts]
+            model_inputs = self.processor(text=formatted_contexts, images=visuals, device=self.device)
+            for k, v in model_inputs.items():
+                model_inputs[k] = v.to(self.device, non_blocking=True) if isinstance(v, torch.Tensor) else [vv.to(self.device, non_blocking=True) for vv in v]
+
+            # Cast every image patch entry to the model's parameter dtype, in place.
+            image_patches = model_inputs["image_patches"]
+            for index in range(len(image_patches)):
+                image_patches[index] = image_patches[index].to(dtype=model_dtype)
+
+            generation_output = self.model.generate(**model_inputs, max_new_tokens=max_new_tokens, pad_token_id=self.tokenizer.eos_token_id)
+            generation_texts = self.processor.batch_decode(generation_output, skip_special_tokens=True)
+            # Keep the segment that follows the first "\x04" in each decoded text.
+            response = [gen_text.split("\x04")[1].strip(" ").strip("\n") for gen_text in generation_texts]
+            res.extend(response)
+            pbar.update(1)
+
+        pbar.close()
+        return res
+
+    def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
+        """Log-likelihood scoring; not implemented for Fuyu.
+
+        Raises:
+            NotImplementedError: Always. An explicit raise is used because an
+                ``assert`` is removed under ``python -O``.
+        """
+        # TODO: implement log-likelihood scoring.
+        raise NotImplementedError("We have not implemented this function for Fuyu yet")
+
+    def loglikelihood_rolling(self, requests: List[Instance]) -> List[float]:
+        """Rolling log-likelihood scoring; not implemented for Fuyu.
+
+        Raises:
+            NotImplementedError: Always. An explicit raise is used because an
+                ``assert`` is removed under ``python -O``.
+        """
+        # TODO: implement rolling log-likelihood scoring.
+        raise NotImplementedError("We have not implemented this function for Fuyu yet")
+
+    def tok_encode(self, string: str, left_truncate_len=None, add_special_tokens=None) -> List[int]:
+        """Encode ``string`` with the model tokenizer.
+
+        Args:
+            string: Text to encode.
+            left_truncate_len: When truthy, keep only the last
+                ``left_truncate_len`` token ids.
+            add_special_tokens: Passed to ``tokenizer.encode``; ``None`` is
+                treated as ``False``.
+
+        Returns:
+            The token ids, possibly truncated from the left.
+        """
+        if add_special_tokens is None:
+            add_special_tokens = False
+        token_ids = self.tokenizer.encode(string, add_special_tokens=add_special_tokens)
+        if not left_truncate_len:
+            return token_ids
+        # Drop tokens from the front so that the end of the context is preserved.
+        return token_ids[-left_truncate_len:]
+
+    def tok_decode(self, tokens):
+        """Decode ``tokens`` to text with the model tokenizer."""
+        decoded = self.tokenizer.decode(tokens)
+        return decoded
@@ -52,11 +52,13 @@ def max_length(self):
     def batch_size(self):
         return self.batch_size_per_gpu
 
-    def flatten(self, input):
+    def flatten(self, input, only_get_first=False):
         new_list = []
         for i in input:
             for j in i:
                 new_list.append(j)
+                if only_get_first:
+                    break
         return new_list
 
     def generate_until(self, requests: List[Instance]) -> List[str]:
@@ -72,31 +74,44 @@ def _collate(x):
             toks = self.tok_encode(x[0])
             return -len(toks), x[0]
 
-        pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding")
         re_ords = utils.Collator([reg.args for reg in requests], _collate, grouping=True)
         chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None)
+        num_iters = len(requests) // self.batch_size if len(requests) % self.batch_size == 0 else len(requests) // self.batch_size + 1
+        pbar = tqdm(total=num_iters, disable=(self.rank != 0), desc="Model Responding")
 
         for chunk in chunks:
-            contexts, all_gen_kwargs, visuals = zip(*chunk)
-            visuals = self.flatten(visuals)
+            contexts, all_gen_kwargs, doc_to_visual, doc_id, task, split = zip(*chunk)
+            task = task[0]
+            split = split[0]
+            visuals = [doc_to_visual[0](self.task_dict[task][split][ids]) for ids in doc_id]
+            visuals = self.flatten(visuals, only_get_first=True)
             gen_kwargs = all_gen_kwargs[0]
 
-            if isinstance(visuals, list):
-                visuals = [visuals[0]]
+            # if isinstance(visuals[0], list):
+            #     visuals = [visuals[idx][0] for idx in range(len(visuals))]  # get the first image in multi-image scenarios.
 
-            formatted_contexts = f"User: {contexts[0]} Assistant:"
+            formatted_contexts = [f"User: {context} Assistant:" for context in contexts]
             model_inputs = self.processor(text=[formatted_contexts], images=visuals, device=self.device)
             for k, v in model_inputs.items():
                 model_inputs[k] = v.to(self.device, non_blocking=True) if isinstance(v, torch.Tensor) else [vv.to(self.device, non_blocking=True) for vv in v]
 
             for index in range(len(model_inputs["image_patches"])):
                 model_inputs["image_patches"][index] = model_inputs["image_patches"][index].to(dtype=next(self.model.parameters()).dtype)
 
-            max_new_tokens = gen_kwargs.get("max_new_tokens", self.max_new_tokens)
-            generation_output = self.model.generate(**model_inputs, max_new_tokens=max_new_tokens, pad_token_id=self.tokenizer.eos_token_id)
-            generation_text = self.processor.batch_decode(generation_output, skip_special_tokens=True)
-            response = generation_text[0].split("\x04")[1].strip(" ").strip("\n")
-            res.append(response)
+            # preconfigure gen_kwargs with defaults
+            gen_kwargs["image_sizes"] = [visuals[idx].size for idx in range(len(visuals))]
+            if "max_new_tokens" not in gen_kwargs:
+                gen_kwargs["max_new_tokens"] = 1024
+            if "temperature" not in gen_kwargs:
+                gen_kwargs["temperature"] = 0
+            if "top_p" not in gen_kwargs:
+                gen_kwargs["top_p"] = None
+            if "num_beams" not in gen_kwargs:
+                gen_kwargs["num_beams"] = 1
+            generation_output = self.model.generate(**model_inputs, max_new_tokens=gen_kwargs["max_new_tokens"], pad_token_id=self.tokenizer.eos_token_id)
+            generation_texts = self.processor.batch_decode(generation_output, skip_special_tokens=True)
+            response = [gen_text.split("\x04")[1].strip(" ").strip("\n") for gen_text in generation_texts]
+            res.extend(response)
             pbar.update(1)
 
         pbar.close()