Add Video Chat Support (beta) #97

Merged · 25 commits · Feb 1, 2025
23 changes: 23 additions & 0 deletions README.md
@@ -112,6 +112,29 @@ print(output)
python -m mlx_vlm.generate --model mlx-community/Qwen2-VL-2B-Instruct-4bit --max-tokens 100 --prompt "Compare these images" --image path/to/image1.jpg path/to/image2.jpg
```

## Video Understanding

MLX-VLM also supports video analysis, such as captioning and summarization, with select models.

### Supported Models

The following models support video chat:

1. Qwen2-VL
2. Qwen2.5-VL
3. Idefics3
4. LLaVA

More models are coming soon.

### Usage Examples

#### Command Line
```sh
python -m mlx_vlm.video_generate --model mlx-community/Qwen2-VL-2B-Instruct-4bit --max-tokens 100 --prompt "Describe this video" --video path/to/video.mp4 --max-pixels 224 224 --fps 1.0
```
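
#### Python

The same flow is available from Python. Below is a minimal sketch following `examples/video_understanding.ipynb` (the model name and video path are placeholders; this feature is in beta):

```python
import mlx.core as mx
from mlx_vlm import load
from mlx_vlm.utils import generate
from mlx_vlm.video_generate import process_vision_info

# Load a video-capable model (Qwen2-VL, Qwen2.5-VL, Idefics3, or LLaVA)
model, processor = load("mlx-community/Qwen2-VL-2B-Instruct-4bit")

messages = [
    {
        "role": "user",
        "content": [
            {"type": "video", "video": "path/to/video.mp4", "max_pixels": 360 * 360, "fps": 1.0},
            {"type": "text", "text": "Describe this video."},
        ],
    }
]

# Build the prompt and sample frames from the video
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt"
)

# Convert the processor's tensors to MLX arrays and generate
kwargs = {
    "video": "path/to/video.mp4",
    "input_ids": mx.array(inputs["input_ids"]),
    "pixel_values": mx.array(inputs["pixel_values_videos"]),
    "mask": mx.array(inputs["attention_mask"]),
    "image_grid_thw": mx.array(inputs["video_grid_thw"]),
}
response = generate(model, processor, prompt=text, temp=0.7, max_tokens=100, **kwargs)
print(response)
```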


These examples demonstrate how to use multiple images and video with MLX-VLM for more complex visual reasoning tasks.

# Fine-tuning
204 changes: 204 additions & 0 deletions examples/video_understanding.ipynb
@@ -0,0 +1,204 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Video Understanding\n",
"\n",
"In this example, we will generate a description of a video using `Qwen2-VL`, `Qwen2-5-VL`, `LLava`, and `Idefics3`, with more models coming soon.\n",
"\n",
"This feature is currently in beta, may not work as expected.\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Install Dependencies"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!pip install -U mlx-vlm"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Import Dependencies"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/homebrew/Caskroom/miniconda/base/envs/mlx_code/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"This is a beta version of the video understanding. It may not work as expected.\n"
]
}
],
"source": [
"from pprint import pprint\n",
"from mlx_vlm import load\n",
"from mlx_vlm.utils import generate\n",
"from mlx_vlm.video_generate import process_vision_info\n",
"\n",
"import mlx.core as mx"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load the model and processor\n",
"model, processor = load(\"mlx-community/Qwen2.5-VL-7B-Instruct-4bit\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"numpy reader: video_path=videos/fastmlx_local_ai_hub.mp4, total_frames=1134, video_fps=59.941855343141576, time=0.000s\n"
]
}
],
"source": [
"# Messages containing a video and a text query\n",
"messages = [\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": [\n",
" {\n",
" \"type\": \"video\",\n",
" \"video\": \"videos/fastmlx_local_ai_hub.mp4\",\n",
" \"max_pixels\": 360 * 360,\n",
" \"fps\": 1.0,\n",
" },\n",
" {\"type\": \"text\", \"text\": \"Describe this video.\"},\n",
" ],\n",
" }\n",
"]\n",
"\n",
"# Preparation for inference\n",
"text = processor.apply_chat_template(\n",
" messages, tokenize=False, add_generation_prompt=True\n",
")\n",
"image_inputs, video_inputs = process_vision_info(messages)\n",
"inputs = processor(\n",
" text=[text],\n",
" images=image_inputs,\n",
" videos=video_inputs,\n",
" padding=True,\n",
" return_tensors=\"pt\",\n",
")\n"
]
},
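{
"cell_type": "markdown",
"metadata": {},
"source": [
"The processor returns PyTorch tensors (`return_tensors=\"pt\"`), so the next cell converts them to MLX arrays. For video input, the pixel values and patch grid come back under the `pixel_values_videos` and `video_grid_thw` keys."
]
},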
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Convert inputs to mlx arrays\n",
"input_ids = mx.array(inputs['input_ids'])\n",
"pixel_values = mx.array(inputs['pixel_values_videos'])\n",
"mask = mx.array(inputs['attention_mask'])\n",
"image_grid_thw = mx.array(inputs['video_grid_thw'])\n",
"\n",
"kwargs = {\n",
" \"image_grid_thw\": image_grid_thw,\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"kwargs[\"video\"] = \"videos/fastmlx_local_ai_hub.mp4\"\n",
"kwargs[\"input_ids\"] = input_ids\n",
"kwargs[\"pixel_values\"] = pixel_values\n",
"kwargs[\"mask\"] = mask\n",
"response = generate(model, processor, prompt=text, temp=0.7, max_tokens=100, **kwargs)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"('The video appears to be a live stream or a recording of a coding session, '\n",
" 'likely on a platform like Discord, as indicated by the presence of text '\n",
" \"chats and a streamer's interface. The video is primarily focused on a \"\n",
" 'computer screen displaying a code editor with various programming languages '\n",
" 'and snippets of code. The coder seems to be explaining or demonstrating '\n",
" 'something related to the code, possibly working through a programming '\n",
" 'problem, explaining the logic, or showing the process of solving a problem.\\n'\n",
" '\\n'\n",
" 'Here are some key observations from')\n"
]
}
],
"source": [
"pprint(response)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# open video and play it\n",
"from ipywidgets import Video\n",
"Video.from_file(\"videos/fastmlx_local_ai_hub.mp4\", width=320, height=240)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "mlx_code",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.10"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Binary file added examples/videos/fastmlx_local_ai_hub.mp4
Binary file not shown.
1 change: 1 addition & 0 deletions mlx_vlm/__init__.py
@@ -8,3 +8,4 @@
quantize_model,
)
from .version import __version__
from .video_generate import VideoFrameExtractor, process_vision_info
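
With this export in place, the video helpers are importable from the package root. A minimal sketch (the `VideoFrameExtractor` constructor is not shown in this diff, so it is only imported here; `process_vision_info` is used as in the notebook above):

```python
from mlx_vlm import VideoFrameExtractor, process_vision_info

# Split a chat message list into image and video inputs, as in
# examples/video_understanding.ipynb; the message structure below
# mirrors the notebook, with a placeholder video path.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "video", "video": "path/to/video.mp4", "fps": 1.0},
            {"type": "text", "text": "Describe this video."},
        ],
    }
]
image_inputs, video_inputs = process_vision_info(messages)
```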