OpenGVLab
diff --git a/.flake8 b/.flake8
Lines changed: 1 addition & 1 deletion
diff --git a/README.md b/README.md
Lines changed: 32 additions & 29 deletions
diff --git a/README_zh.md b/README_zh.md
Lines changed: 33 additions & 30 deletions
diff --git a/internvl_chat/eval/caption/evaluate_caption.py b/internvl_chat/eval/caption/evaluate_caption.py
Lines changed: 2 additions & 3 deletions
@@ -1,5 +1,5 @@
 [flake8]
-ignore = E501, F403, C901, W504, W605, E251, E122, E126, E127, E722, W503, E128, E741
+ignore = E501, F403, C901, W504, W605, E251, E122, E126, E127, E722, W503, E128, E741, E731, E701
 select = E1, E3, E502, E7, E9, W1, W5, W6
 max-line-length = 180
 exclude=*.egg/*,build,dist,detection/configs/*
@@ -45,11 +45,14 @@
 
 ## TODO List
 
+- [x] Support liger kernels to save GPU memory
+- [x] Release the code, model, and data of MPO
+- [x] Support multimodal packed dataset
 - [ ] Support vLLM and Ollama
-- [x] Rebuild documents using readthedocs
-- [x] Support fine-tuning different LLMs with LoRA
 - [ ] Support video and PDF input in online demo
 - [ ] Release InternVL2 with VisionLLMv2 integration
+- [x] Rebuild documents using readthedocs
+- [x] Support fine-tuning different LLMs with LoRA
 - [x] Release `requirements.txt` for InternVL2
 - [x] Release training / evaluation code for InternVL2 series
 - [x] Release Streamlit web UI for InternVL1.5 and InternVL2
@@ -295,14 +298,14 @@ We welcome everyone to use our API for research. For better management, please s
 
   ViT-22B uses the private JFT-3B dataset.
 
-  | method              | #param | IN-1K | IN-ReaL | IN-V2 | IN-A  | IN-R  | IN-Sketch |
-  | ------------------- | :----: | :---: | :-----: | :---: | :---: | :---: | :-------: |
-  | OpenCLIP-G          |  1.8B  | 86.2  |  89.4   | 77.2  | 63.8  | 87.8  |   66.4    |
-  | DINOv2-g            |  1.1B  | 86.5  |  89.6   | 78.4  | 75.9  | 78.8  |   62.5    |
-  | EVA-01-CLIP-g       |  1.1B  | 86.5  |  89.3   | 77.4  | 70.5  | 87.7  |   63.1    |
-  | MAWS-ViT-6.5B       |  6.5B  | 87.8  |    -    |   -   |   -   |   -   |     -     |
-  | ViT-22B\*           | 21.7B  | 89.5  |  90.9   | 83.2  | 83.8  | 87.4  |     -     |
-  | InternViT-6B (ours) |  5.9B  | 88.2  |  90.4   | 79.9  | 77.5  | 89.8  |   69.1    |
+  | method              | #param | IN-1K | IN-ReaL | IN-V2 | IN-A | IN-R | IN-Sketch |
+  | ------------------- | :----: | :---: | :-----: | :---: | :--: | :--: | :-------: |
+  | OpenCLIP-G          |  1.8B  | 86.2  |  89.4   | 77.2  | 63.8 | 87.8 |   66.4    |
+  | DINOv2-g            |  1.1B  | 86.5  |  89.6   | 78.4  | 75.9 | 78.8 |   62.5    |
+  | EVA-01-CLIP-g       |  1.1B  | 86.5  |  89.3   | 77.4  | 70.5 | 87.7 |   63.1    |
+  | MAWS-ViT-6.5B       |  6.5B  | 87.8  |    -    |   -   |  -   |  -   |     -     |
+  | ViT-22B\*           | 21.7B  | 89.5  |  90.9   | 83.2  | 83.8 | 87.4 |     -     |
+  | InternViT-6B (ours) |  5.9B  | 88.2  |  90.4   | 79.9  | 77.5 | 89.8 |   69.1    |
 
 - Semantic Segmentation [\[see details\]](./segmentation#-evaluation)
 
@@ -318,12 +321,12 @@ We welcome everyone to use our API for research. For better management, please s
 
 - Zero-Shot Image Classification [\[see details\]](./clip_benchmark#imagenet-variants-and-objectnet)
 
-  | method            | IN-1K | IN-A  | IN-R  | IN-V2 | IN-Sketch | ObjectNet |
-  | ----------------- | :---: | :---: | :---: | :---: | :-------: | :-------: |
-  | OpenCLIP-G        | 80.1  | 69.3  | 92.1  | 73.6  |   68.9    |   73.0    |
-  | EVA-02-CLIP-E+    | 82.0  | 82.1  | 94.5  | 75.7  |   71.6    |   79.6    |
-  | ViT-22B\*         | 85.9  | 90.1  | 96.0  | 80.9  |     -     |   87.6    |
-  | InternVL-C (ours) | 83.2  | 83.8  | 95.5  | 77.3  |   73.9    |   80.6    |
+  | method            | IN-1K | IN-A | IN-R | IN-V2 | IN-Sketch | ObjectNet |
+  | ----------------- | :---: | :--: | :--: | :---: | :-------: | :-------: |
+  | OpenCLIP-G        | 80.1  | 69.3 | 92.1 | 73.6  |   68.9    |   73.0    |
+  | EVA-02-CLIP-E+    | 82.0  | 82.1 | 94.5 | 75.7  |   71.6    |   79.6    |
+  | ViT-22B\*         | 85.9  | 90.1 | 96.0 | 80.9  |     -     |   87.6    |
+  | InternVL-C (ours) | 83.2  | 83.8 | 95.5 | 77.3  |   73.9    |   80.6    |
 
 - Multilingual Zero-Shot Image Classification [\[see details\]](./clip_benchmark#multilingual-imagenet-1k)
 
@@ -341,13 +344,13 @@ We welcome everyone to use our API for research. For better management, please s
 
 - Zero-Shot Video Classification
 
-  | method            | #frame | K400  | K600  | K700  |
-  | ----------------- | :----: | :---: | :---: | :---: |
-  | OpenCLIP-G        |   1    | 65.9  | 66.1  | 59.2  |
-  | EVA-02-CLIP-E+    |   1    | 69.8  | 69.3  | 63.4  |
-  | InternVL-C (ours) |   1    | 71.0  | 71.3  | 65.7  |
-  | ViCLIP            |   8    | 75.7  | 73.5  | 66.4  |
-  | InternVL-C (ours) |   8    | 79.4  | 78.8  | 71.5  |
+  | method            | #frame | K400 | K600 | K700 |
+  | ----------------- | :----: | :--: | :--: | :--: |
+  | OpenCLIP-G        |   1    | 65.9 | 66.1 | 59.2 |
+  | EVA-02-CLIP-E+    |   1    | 69.8 | 69.3 | 63.4 |
+  | InternVL-C (ours) |   1    | 71.0 | 71.3 | 65.7 |
+  | ViCLIP            |   8    | 75.7 | 73.5 | 66.4 |
+  | InternVL-C (ours) |   8    | 79.4 | 78.8 | 71.5 |
 
 </details>
 
@@ -570,12 +573,12 @@ We welcome everyone to use our API for research. For better management, please s
 
 - Multilingual Zero-Shot Image-Text Retrieval on XTD [\[see details\]](./clip_benchmark#xtd)
 
-  | method            |  EN   |  ES   |  FR   |  ZH   |  IT   |  KO   |  RU   |  JP   | average |
-  | ----------------- | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :-----: |
-  | AltCLIP           | 95.4  | 94.1  | 92.9  | 95.1  | 94.2  | 94.4  | 91.8  | 91.7  |  93.7   |
-  | OpenCLIP-XLM-R-H  | 97.3  | 96.1  | 94.5  | 94.7  | 96.0  | 90.2  | 93.9  | 94.0  |  94.6   |
-  | InternVL-C (ours) | 97.3  | 95.7  | 95.1  | 95.6  | 96.0  | 92.2  | 93.3  | 95.5  |  95.1   |
-  | InternVL-G (ours) | 98.6  | 97.7  | 96.5  | 96.7  | 96.9  | 95.1  | 94.8  | 96.1  |  96.6   |
+  | method            |  EN  |  ES  |  FR  |  ZH  |  IT  |  KO  |  RU  |  JP  | average |
+  | ----------------- | :--: | :--: | :--: | :--: | :--: | :--: | :--: | :--: | :-----: |
+  | AltCLIP           | 95.4 | 94.1 | 92.9 | 95.1 | 94.2 | 94.4 | 91.8 | 91.7 |  93.7   |
+  | OpenCLIP-XLM-R-H  | 97.3 | 96.1 | 94.5 | 94.7 | 96.0 | 90.2 | 93.9 | 94.0 |  94.6   |
+  | InternVL-C (ours) | 97.3 | 95.7 | 95.1 | 95.6 | 96.0 | 92.2 | 93.3 | 95.5 |  95.1   |
+  | InternVL-G (ours) | 98.6 | 97.7 | 96.5 | 96.7 | 96.9 | 95.1 | 94.8 | 96.1 |  96.6   |
 
 </details>
 
 
@@ -24,7 +24,7 @@
 
 ## 最新消息 🚀🚀🚀
 
-- `2024/11/14`: 我们发布了 [MMPR](https://huggingface.co/datasets/OpenGVLab/MMPR) 数据集，这是一个高质量的大规模多模态偏好数据集，并提出了一种新的更高效的偏好优化算法[MPO](https://github.com/OpenGVLab/InternVL/tree/main/internvl_chat/shell/internvl2.0_mpo)。基于该数据和算法训练得到的模型 [InternVL2-8B-MPO](https://huggingface.co/OpenGVLab/InternVL2-8B-MPO) 在 MathVista 上取得了67.0%的准确率。有关更多细节，请查看我们的[论文](https://arxiv.org/abs/2411.10442)、[项目页面](https://internvl.github.io/blog/2024-11-14-InternVL-2.0-MPO/)和[文档](https://internvl.readthedocs.io/en/latest/internvl2.0/preference_optimization.html)。
+- `2024/11/14`: 我们发布了 [MMPR](https://huggingface.co/datasets/OpenGVLab/MMPR)，一个高质量、大规模的多模态推理偏好数据集，以及 [MPO](https://github.com/OpenGVLab/InternVL/tree/main/internvl_chat/shell/internvl2.0_mpo)，一种高效的偏好优化算法。由此训练的模型 [InternVL2-8B-MPO](https://huggingface.co/OpenGVLab/InternVL2-8B-MPO) 在 MathVista 上取得了 67.0% 的准确率。更多详情请参阅我们的[论文](https://arxiv.org/abs/2411.10442)、[项目主页](https://internvl.github.io/blog/2024-11-14-InternVL-2.0-MPO/) 和 [文档](https://internvl.readthedocs.io/en/latest/internvl2.0/preference_optimization.html)。
 - `2024/10/21`: 我们发布了 Mini-InternVL 系列。这些模型在保持极小模型体积的同时实现了出色的性能：4B 模型仅用 5% 的模型大小便达到了 90% 的性能。有关更多详细信息，请查看我们的 [项目页面](https://github.com/OpenGVLab/InternVL/tree/main/internvl_chat/shell/mini_internvl) 和 [文档](https://internvl.readthedocs.io/en/latest/internvl2.0/domain_adaptation.html)。
 - `2024/08/01`: [Chartmimic](https://chartmimic.github.io/) 团队在他们的基准测试中评估了 InternVL2 系列模型。InternVL2-26B 和 76B 模型在开源模型中取得了前两名的成绩，其中 InternVL2-Llama3-76B 模型超过了 GeminiProVision，并表现出与 Claude-3-opus 相当的结果。
 - `2024/08/01`: InternVL2-Pro 在 [CharXiv](https://charxiv.github.io/#leaderboard) 数据集中实现了开源模型中的 SOTA 性能，也比部分知名闭源模型如 GPT-4V、Gemini 1.5 Flash、Claude 3 Sonnet 取得了更好成绩
@@ -45,11 +45,14 @@
 
 ## TODO 列表
 
+- [x] 支持 liger kernels 以节省显存
+- [x] 发布 MPO 的代码、模型和数据
+- [x] 支持多模态 packed dataset
 - [ ] 支持 vLLM 和 Ollama
-- [x] 使用 readthedocs 重新构建文档
-- [x] 支持使用 LoRA 微调不同的 LLMs
 - [ ] 在 Demo 中支持视频和 PDF 输入
 - [ ] 发布集成 VisionLLMv2 的 InternVL2
+- [x] 使用 readthedocs 重新构建文档
+- [x] 支持使用 LoRA 微调不同的 LLMs
 - [x] 发布 InternVL2 的 `requirements.txt`
 - [x] 发布 InternVL2 系列的训练 / 评估代码
 - [x] 发布 InternVL1.5 和 InternVL2 的 Streamlit 网页 UI
@@ -295,14 +298,14 @@
 
   ViT-22B uses the private JFT-3B dataset.
 
-  | method              | #param | IN-1K | IN-ReaL | IN-V2 | IN-A  | IN-R  | IN-Sketch |
-  | ------------------- | :----: | :---: | :-----: | :---: | :---: | :---: | :-------: |
-  | OpenCLIP-G          |  1.8B  | 86.2  |  89.4   | 77.2  | 63.8  | 87.8  |   66.4    |
-  | DINOv2-g            |  1.1B  | 86.5  |  89.6   | 78.4  | 75.9  | 78.8  |   62.5    |
-  | EVA-01-CLIP-g       |  1.1B  | 86.5  |  89.3   | 77.4  | 70.5  | 87.7  |   63.1    |
-  | MAWS-ViT-6.5B       |  6.5B  | 87.8  |    -    |   -   |   -   |   -   |     -     |
-  | ViT-22B\*           | 21.7B  | 89.5  |  90.9   | 83.2  | 83.8  | 87.4  |     -     |
-  | InternViT-6B (ours) |  5.9B  | 88.2  |  90.4   | 79.9  | 77.5  | 89.8  |   69.1    |
+  | method              | #param | IN-1K | IN-ReaL | IN-V2 | IN-A | IN-R | IN-Sketch |
+  | ------------------- | :----: | :---: | :-----: | :---: | :--: | :--: | :-------: |
+  | OpenCLIP-G          |  1.8B  | 86.2  |  89.4   | 77.2  | 63.8 | 87.8 |   66.4    |
+  | DINOv2-g            |  1.1B  | 86.5  |  89.6   | 78.4  | 75.9 | 78.8 |   62.5    |
+  | EVA-01-CLIP-g       |  1.1B  | 86.5  |  89.3   | 77.4  | 70.5 | 87.7 |   63.1    |
+  | MAWS-ViT-6.5B       |  6.5B  | 87.8  |    -    |   -   |  -   |  -   |     -     |
+  | ViT-22B\*           | 21.7B  | 89.5  |  90.9   | 83.2  | 83.8 | 87.4 |     -     |
+  | InternViT-6B (ours) |  5.9B  | 88.2  |  90.4   | 79.9  | 77.5 | 89.8 |   69.1    |
 
 - 语义分割 [\[查看详情\]](./segmentation#-evaluation)
 
@@ -318,12 +321,12 @@
 
 - 零样本图像分类 [\[查看详情\]](./clip_benchmark#imagenet-variants-and-objectnet)
 
-  | method            | IN-1K | IN-A  | IN-R  | IN-V2 | IN-Sketch | ObjectNet |
-  | ----------------- | :---: | :---: | :---: | :---: | :-------: | :-------: |
-  | OpenCLIP-G        | 80.1  | 69.3  | 92.1  | 73.6  |   68.9    |   73.0    |
-  | EVA-02-CLIP-E+    | 82.0  | 82.1  | 94.5  | 75.7  |   71.6    |   79.6    |
-  | ViT-22B\*         | 85.9  | 90.1  | 96.0  | 80.9  |     -     |   87.6    |
-  | InternVL-C (ours) | 83.2  | 83.8  | 95.5  | 77.3  |   73.9    |   80.6    |
+  | method            | IN-1K | IN-A | IN-R | IN-V2 | IN-Sketch | ObjectNet |
+  | ----------------- | :---: | :--: | :--: | :---: | :-------: | :-------: |
+  | OpenCLIP-G        | 80.1  | 69.3 | 92.1 | 73.6  |   68.9    |   73.0    |
+  | EVA-02-CLIP-E+    | 82.0  | 82.1 | 94.5 | 75.7  |   71.6    |   79.6    |
+  | ViT-22B\*         | 85.9  | 90.1 | 96.0 | 80.9  |     -     |   87.6    |
+  | InternVL-C (ours) | 83.2  | 83.8 | 95.5 | 77.3  |   73.9    |   80.6    |
 
 - 多语言零样本图像分类 [\[查看详情\]](./clip_benchmark#multilingual-imagenet-1k)
 
@@ -341,13 +344,13 @@
 
 - 零样本视频分类
 
-  | method            | #frame | K400  | K600  | K700  |
-  | ----------------- | :----: | :---: | :---: | :---: |
-  | OpenCLIP-G        |   1    | 65.9  | 66.1  | 59.2  |
-  | EVA-02-CLIP-E+    |   1    | 69.8  | 69.3  | 63.4  |
-  | InternVL-C (ours) |   1    | 71.0  | 71.3  | 65.7  |
-  | ViCLIP            |   8    | 75.7  | 73.5  | 66.4  |
-  | InternVL-C (ours) |   8    | 79.4  | 78.8  | 71.5  |
+  | method            | #frame | K400 | K600 | K700 |
+  | ----------------- | :----: | :--: | :--: | :--: |
+  | OpenCLIP-G        |   1    | 65.9 | 66.1 | 59.2 |
+  | EVA-02-CLIP-E+    |   1    | 69.8 | 69.3 | 63.4 |
+  | InternVL-C (ours) |   1    | 71.0 | 71.3 | 65.7 |
+  | ViCLIP            |   8    | 75.7 | 73.5 | 66.4 |
+  | InternVL-C (ours) |   8    | 79.4 | 78.8 | 71.5 |
 
 </details>
 
@@ -570,12 +573,12 @@
 
 - 多语言零样本图文对检索 [\[查看详情\]](./clip_benchmark#xtd)
 
-  | method            |  EN   |  ES   |  FR   |  ZH   |  IT   |  KO   |  RU   |  JP   | average |
-  | ----------------- | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :-----: |
-  | AltCLIP           | 95.4  | 94.1  | 92.9  | 95.1  | 94.2  | 94.4  | 91.8  | 91.7  |  93.7   |
-  | OpenCLIP-XLM-R-H  | 97.3  | 96.1  | 94.5  | 94.7  | 96.0  | 90.2  | 93.9  | 94.0  |  94.6   |
-  | InternVL-C (ours) | 97.3  | 95.7  | 95.1  | 95.6  | 96.0  | 92.2  | 93.3  | 95.5  |  95.1   |
-  | InternVL-G (ours) | 98.6  | 97.7  | 96.5  | 96.7  | 96.9  | 95.1  | 94.8  | 96.1  |  96.6   |
+  | method            |  EN  |  ES  |  FR  |  ZH  |  IT  |  KO  |  RU  |  JP  | average |
+  | ----------------- | :--: | :--: | :--: | :--: | :--: | :--: | :--: | :--: | :-----: |
+  | AltCLIP           | 95.4 | 94.1 | 92.9 | 95.1 | 94.2 | 94.4 | 91.8 | 91.7 |  93.7   |
+  | OpenCLIP-XLM-R-H  | 97.3 | 96.1 | 94.5 | 94.7 | 96.0 | 90.2 | 93.9 | 94.0 |  94.6   |
+  | InternVL-C (ours) | 97.3 | 95.7 | 95.1 | 95.6 | 96.0 | 92.2 | 93.3 | 95.5 |  95.1   |
+  | InternVL-G (ours) | 98.6 | 97.7 | 96.5 | 96.7 | 96.9 | 95.1 | 94.8 | 96.1 |  96.6   |
 
 </details>
 
 
@@ -227,19 +227,18 @@ def evaluate_chat_model():
     parser.add_argument('--datasets', type=str, default='coco,flickr30k,nocaps')
     parser.add_argument('--batch-size', type=int, default=1)
     parser.add_argument('--num-workers', type=int, default=1)
-    parser.add_argument('--num-beams', type=int, default=5)
+    parser.add_argument('--num-beams', type=int, default=1)
     parser.add_argument('--temperature', type=float, default=0.0)
     parser.add_argument('--out-dir', type=str, default='results')
     parser.add_argument('--seed', type=int, default=0)
     parser.add_argument('--dynamic', action='store_true')
     parser.add_argument('--max-num', type=int, default=6)
     parser.add_argument('--load-in-8bit', action='store_true')
-    parser.add_argument('--load-in-4bit', action='store_true')
     parser.add_argument('--auto', action='store_true')
     args = parser.parse_args()
 
     if not os.path.exists(args.out_dir):
-        os.makedirs(args.out_dir)
+        os.makedirs(args.out_dir, exist_ok=True)
 
     args.datasets = args.datasets.split(',')
     print('datasets:', args.datasets)