Update: support direct text conversation

This commit is contained in:
fuck 2023-05-12 19:04:53 +08:00
parent 22d8888ca2
commit 53da7fa871
12 changed files with 500 additions and 158 deletions

3
.idea/.gitignore vendored Normal file

@ -0,0 +1,3 @@
# Default ignored files
/shelf/
/workspace.xml

12
.idea/MiniGPT-4.iml Normal file

@ -0,0 +1,12 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PyDocumentationSettings">
<option name="format" value="PLAIN" />
<option name="myDocStringFormat" value="Plain" />
</component>
</module>

78
.idea/inspectionProfiles/Project_Default.xml Normal file

@ -0,0 +1,78 @@
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="PyCompatibilityInspection" enabled="true" level="WARNING" enabled_by_default="true">
<option name="ourVersions">
<value>
<list size="4">
<item index="0" class="java.lang.String" itemvalue="3.6" />
<item index="1" class="java.lang.String" itemvalue="3.7" />
<item index="2" class="java.lang.String" itemvalue="3.8" />
<item index="3" class="java.lang.String" itemvalue="3.11" />
</list>
</value>
</option>
</inspection_tool>
<inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
<option name="ignoredPackages">
<value>
<list size="29">
<item index="0" class="java.lang.String" itemvalue="object_detection" />
<item index="1" class="java.lang.String" itemvalue="tensorflow_gpu" />
<item index="2" class="java.lang.String" itemvalue="keras_nightly" />
<item index="3" class="java.lang.String" itemvalue="torch" />
<item index="4" class="java.lang.String" itemvalue="torchvision" />
<item index="5" class="java.lang.String" itemvalue="opencv_python" />
<item index="6" class="java.lang.String" itemvalue="scipy" />
<item index="7" class="java.lang.String" itemvalue="thop" />
<item index="8" class="java.lang.String" itemvalue="opencv-python" />
<item index="9" class="java.lang.String" itemvalue="PyYAML" />
<item index="10" class="java.lang.String" itemvalue="ipython" />
<item index="11" class="java.lang.String" itemvalue="numpy" />
<item index="12" class="java.lang.String" itemvalue="requests" />
<item index="13" class="java.lang.String" itemvalue="psutil" />
<item index="14" class="java.lang.String" itemvalue="tqdm" />
<item index="15" class="java.lang.String" itemvalue="pandas" />
<item index="16" class="java.lang.String" itemvalue="tensorboard" />
<item index="17" class="java.lang.String" itemvalue="seaborn" />
<item index="18" class="java.lang.String" itemvalue="matplotlib" />
<item index="19" class="java.lang.String" itemvalue="Pillow" />
<item index="20" class="java.lang.String" itemvalue="shapely" />
<item index="21" class="java.lang.String" itemvalue="motmetrics" />
<item index="22" class="java.lang.String" itemvalue="sklearn" />
<item index="23" class="java.lang.String" itemvalue="setuptools" />
<item index="24" class="java.lang.String" itemvalue="lap" />
<item index="25" class="java.lang.String" itemvalue="Cython" />
<item index="26" class="java.lang.String" itemvalue="pycocotools" />
<item index="27" class="java.lang.String" itemvalue="terminaltables" />
<item index="28" class="java.lang.String" itemvalue="openpyxl" />
</list>
</value>
</option>
</inspection_tool>
<inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
<option name="ignoredErrors">
<list>
<option value="N803" />
</list>
</option>
</inspection_tool>
<inspection_tool class="PyStubPackagesAdvertiser" enabled="true" level="WARNING" enabled_by_default="true">
<option name="ignoredPackages">
<list>
<option value="PyQt5-stubs==5.15.6.0" />
</list>
</option>
</inspection_tool>
<inspection_tool class="PyUnresolvedReferencesInspection" enabled="true" level="WARNING" enabled_by_default="true">
<option name="ignoredIdentifiers">
<list>
<option value="torch.cuda.amp" />
<option value="torch.utils.tensorboard" />
<option value="请求网页.*" />
<option value="tools.export_model.*" />
</list>
</option>
</inspection_tool>
</profile>
</component>

6
.idea/inspectionProfiles/profiles_settings.xml Normal file

@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

8
.idea/modules.xml Normal file

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/MiniGPT-4.iml" filepath="$PROJECT_DIR$/.idea/MiniGPT-4.iml" />
</modules>
</component>
</project>

6
.idea/vcs.xml Normal file

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>

222
README.md

@ -1,65 +1,91 @@
# MiniGPT-4: Enhancing Vision-language Understanding with Advanced Large Language Models
[Deyao Zhu](https://tsutikgiau.github.io/)* (On Job Market!), [Jun Chen](https://junchen14.github.io/)* (On Job Market!), [Xiaoqian Shen](https://xiaoqian-shen.github.io), [Xiang Li](https://xiangli.ac.cn), and [Mohamed Elhoseiny](https://www.mohamed-elhoseiny.com/). *Equal Contribution
# miniGPT4
[Chinese version](README_cn.md) 🚀🚀
This project has modified demo.py and conversation.py to support direct text conversation without the need to upload an image first.
**King Abdullah University of Science and Technology**
**TODO: Support uploading multiple images.**
<a href='https://minigpt-4.github.io'><img src='https://img.shields.io/badge/Project-Page-Green'></a> <a href='https://arxiv.org/abs/2304.10592'><img src='https://img.shields.io/badge/Paper-Arxiv-red'></a> <a href='https://huggingface.co/spaces/Vision-CAIR/minigpt4'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue'></a> <a href='https://huggingface.co/Vision-CAIR/MiniGPT-4'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Model-blue'></a> [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1OK4kYsZphwt5DXchKkzMBjYF6jnkqh4R?usp=sharing) [![YouTube](https://badges.aleen42.com/src/youtube.svg)](https://www.youtube.com/watch?v=__tftoxpBAw&feature=youtu.be)
Example:
![show](./examples/e5b0d467fa14e2aa9b77a46b828a4e0.png)
The following is the process for configuring the project environment. If you have already completed this step, you can skip the environment configuration and directly run demo.py.
## News
We now provide a pretrained MiniGPT-4 aligned with Vicuna-7B! The demo GPU memory consumption now can be as low as 12GB.
The [official repo](https://github.com/Vision-CAIR/MiniGPT-4)
provides two pretrained checkpoints, [13B](https://drive.google.com/file/d/1a4zLvaiDBr-36pasffmgpvH5P7CKmpze/view) and [7B](https://drive.google.com/file/d/1a4zLvaiDBr-36pasffmgpvH5P7CKmpze/view), which can be used for fine-tuning.
**GPU Memory needed**
## Online Demo
* 13B: 23 GB
* 7B: 11.5 GB
Click the image to chat with MiniGPT-4 around your images
[![demo](figs/online_demo.png)](https://minigpt-4.github.io)
### Getting started
**0. Environment setup**
Make sure Docker is installed first.
## Examples
| | |
:-------------------------:|:-------------------------:
![find wild](figs/examples/wop_2.png) | ![write story](figs/examples/ad_2.png)
![solve problem](figs/examples/fix_1.png) | ![write Poem](figs/examples/rhyme_1.png)
Pull the [Docker image](https://hub.docker.com/r/bewithmeallmylife/mini-gpt4-runtime-cuda-10.2):
```commandline
docker pull bewithmeallmylife/mini-gpt4-runtime-cuda-10.2:1.0.0
```
Create the container, exposing the corresponding port so the frontend UI can be launched for local use:
```commandline
nvidia-docker run -v /data:/projects -v /data2:/data2 -p 1118:7778 --shm-size 8G --name minigpt4 -d bewithmeallmylife/mini-gpt4-runtime-cuda-10.2:1.0.0 tail -f /dev/null
```
Enter the container:
```commandline
docker exec -it minigpt4 bash
```
The conda environment **mini-gpt4** is already provided; activate it:
```commandline
conda activate mini-gpt4
```
Note: make sure torch is compatible with your CUDA version. In this image, the torch version is 1.12.1+cu10.2;
if your CUDA version is 11 or above, run the step below.
More examples can be found in the [project page](https://minigpt-4.github.io).
## Introduction
- MiniGPT-4 aligns a frozen visual encoder from BLIP-2 with a frozen LLM, Vicuna, using just one projection layer.
- We train MiniGPT-4 with two stages. The first traditional pretraining stage is trained using roughly 5 million aligned image-text pairs in 10 hours using 4 A100s. After the first stage, Vicuna is able to understand the image. But the generation ability of Vicuna is heavily impacted.
- To address this issue and improve usability, we propose a novel way to create high-quality image-text pairs by the model itself and ChatGPT together. Based on this, we then create a small (3500 pairs in total) yet high-quality dataset.
- The second finetuning stage is trained on this dataset in a conversation template to significantly improve its generation reliability and overall usability. To our surprise, this stage is computationally efficient and takes only around 7 minutes with a single A100.
- MiniGPT-4 yields many emerging vision-language capabilities similar to those demonstrated in GPT-4.
![overview](figs/overview.png)
## Getting Started
### Installation
**1. Prepare the code and the environment**
Git clone our repository, create a Python environment, and activate it via the following commands:
```bash
git clone https://github.com/Vision-CAIR/MiniGPT-4.git
cd MiniGPT-4
conda env create -f environment.yml
conda activate minigpt4
The inference environment required for miniGPT4 is already included in the image, with PyTorch version 1.12.1+cu10.2, which does not support the sm86 architecture. If the GPU is an RTX A6000 (compute capability 8.6), reinstall a build that supports this architecture, such as torch 1.12.1+cu11.3:
```commandline
conda install pytorch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1 cudatoolkit=11.3 -c pytorch
```
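If you are unsure whether the torch build inside the container matches your GPU, a quick check like the following can help (a minimal sketch; run it inside the activated mini-gpt4 environment). It prints the CUDA version torch was built against and the architectures it was compiled for, so a mismatch such as an sm_86 GPU on a cu10.2 build is easy to spot:
```python
# quick compatibility check: installed torch build vs. GPU architecture
import torch

print(torch.__version__, torch.version.cuda)   # e.g. 1.12.1 10.2  or  1.12.1 11.3
print(torch.cuda.get_device_name(0))           # e.g. NVIDIA RTX A6000
print(torch.cuda.get_device_capability(0))     # e.g. (8, 6), i.e. sm_86
print(torch.cuda.get_arch_list())              # architectures this torch build supports
```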
**1. Prepare the pretrained model weights**
**2. Prepare the pretrained Vicuna weights**
The current version of MiniGPT-4 is built on the v0 version of Vicuna-13B.
Please refer to our instruction [here](PrepareVicuna.md)
to prepare the Vicuna weights.
The final weights would be in a single folder in a structure similar to the following:
A total of three pretrained model weights need to be prepared: **vicuna** (7B/14G), **llama** (7B/12.5G), and **miniGPT4** (7B).
* vicuna
Pretrained weights in two sizes, 13B and 7B, are available for download.
```commandline
git lfs install
git clone https://huggingface.co/lmsys/vicuna-13b-delta-v0 # more powerful, need at least 24G gpu memory
# or
git clone https://huggingface.co/lmsys/vicuna-7b-delta-v0 # smaller, need 12G gpu memory
```
These two delta weights are not usable on their own; the llama pretrained weights are still needed.
* llama
The llama weights are not officially available for download, so a third-party mirror is used. Choose either 7B or 13B:
```commandline
wget https://agi.gpt4.org/llama/LLaMA/tokenizer.model -O ./tokenizer.model
wget https://agi.gpt4.org/llama/LLaMA/tokenizer_checklist.chk -O ./tokenizer_checklist.chk
wget https://agi.gpt4.org/llama/LLaMA/7B/consolidated.00.pth -O ./7B/consolidated.00.pth
wget https://agi.gpt4.org/llama/LLaMA/7B/params.json -O ./7B/params.json
wget https://agi.gpt4.org/llama/LLaMA/7B/checklist.chk -O ./7B/checklist.chk
wget https://agi.gpt4.org/llama/LLaMA/13B/consolidated.00.pth -O ./13B/consolidated.00.pth
wget https://agi.gpt4.org/llama/LLaMA/13B/consolidated.01.pth -O ./13B/consolidated.01.pth
wget https://agi.gpt4.org/llama/LLaMA/13B/params.json -O ./13B/params.json
wget https://agi.gpt4.org/llama/LLaMA/13B/checklist.chk -O ./13B/checklist.chk
```
After downloading llama, we need to convert it to the Hugging Face format:
```commandline
git clone https://github.com/huggingface/transformers.git
python transformers/src/transformers/models/llama/convert_llama_weights_to_hf.py \
--input_dir /path/to/downloaded/llama/weights --model_size 7B --output_dir /output/path/to/llama-13bOR7b-hf/
```
Once both the vicuna and llama weights are prepared, combine them to obtain the usable vicuna weights:
```commandline
pip install git+https://github.com/lm-sys/FastChat.git@v0.1.10
python -m fastchat.model.apply_delta --base /path/to/llama-13bOR7b-hf/ --target /path/to/save/working/vicuna/weight/ --delta /path/to/vicuna-13bOR7b-delta-v0/
```
This finally yields a working weight; the weight folder looks like the following:
```commandline
vicuna_weights
├── config.json
├── generation_config.json
@ -67,104 +93,16 @@ vicuna_weights
├── pytorch_model-00001-of-00003.bin
...
```
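Before pointing the config at this folder, it can be worth confirming that the merged weights form a valid Hugging Face model directory. The following is a lightweight sanity check (not part of the official setup; the path is a placeholder for wherever apply_delta wrote its output) that loads only the config and tokenizer, which is cheap:
```python
# lightweight sanity check on the merged vicuna folder (config + tokenizer only)
from transformers import AutoConfig, AutoTokenizer

vicuna_path = "/path/to/save/working/vicuna/weight/"  # placeholder: the apply_delta output above

config = AutoConfig.from_pretrained(vicuna_path)
tokenizer = AutoTokenizer.from_pretrained(vicuna_path, use_fast=False)

print(config.model_type, config.hidden_size, config.num_hidden_layers)  # expect a llama config
print(tokenizer("Hello, MiniGPT-4!").input_ids[:8])                     # tokenization works
```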
Then, set the path to the vicuna weight in the model config file [here](minigpt4/configs/models/minigpt4.yaml) at Line 16.
* minigpt4 checkpoints
Then, set the path to the vicuna weight in the model config file
[here](minigpt4/configs/models/minigpt4.yaml#L16) at Line 16.
[13B checkpoint](https://drive.google.com/file/d/1a4zLvaiDBr-36pasffmgpvH5P7CKmpze/view?usp=share_link)
[7B checkpoint](https://drive.google.com/file/d/1RY9jV0dyqLX-o38LrumkKRh6Jtaop58R/view?usp=sharing)
**3. Prepare the pretrained MiniGPT-4 checkpoint**
Download the pretrained checkpoints according to the Vicuna model you prepare.
| Checkpoint Aligned with Vicuna 13B | Checkpoint Aligned with Vicuna 7B |
:------------------------------------------------------------------------------------------------:|:----------------------------------------------------------------------------------------------:
[Download](https://drive.google.com/file/d/1a4zLvaiDBr-36pasffmgpvH5P7CKmpze/view?usp=share_link) | [Download](https://drive.google.com/file/d/1RY9jV0dyqLX-o38LrumkKRh6Jtaop58R/view?usp=sharing)
Then, set the path to the pretrained checkpoint in the evaluation config file in [eval_configs/minigpt4_eval.yaml](eval_configs/minigpt4_eval.yaml) at Line 11.
Then, set the path to the pretrained checkpoint in the evaluation config file
in [eval_configs/minigpt4_eval.yaml](eval_configs/minigpt4_eval.yaml#L10) at Line 11.
### Launching Demo Locally
Try out our demo [demo.py](demo.py) on your local machine by running
```
**2. Run demo.py**
```commandline
python demo.py --cfg-path eval_configs/minigpt4_eval.yaml --gpu-id 0
```
To save GPU memory, Vicuna is loaded in 8-bit by default, with a beam search width of 1.
This configuration requires about 23G GPU memory for Vicuna 13B and 11.5G GPU memory for Vicuna 7B.
For more powerful GPUs, you can run the model
in 16 bit by setting low_resource to False in the config file
[minigpt4_eval.yaml](eval_configs/minigpt4_eval.yaml) and use a larger beam search width.
Thanks to [@WangRongsheng](https://github.com/WangRongsheng), you can also run our code on [Colab](https://colab.research.google.com/drive/1OK4kYsZphwt5DXchKkzMBjYF6jnkqh4R?usp=sharing)
### Training
The training of MiniGPT-4 contains two alignment stages.
**1. First pretraining stage**
In the first pretraining stage, the model is trained using image-text pairs from Laion and CC datasets
to align the vision and language model. To download and prepare the datasets, please check
our [first stage dataset preparation instruction](dataset/README_1_STAGE.md).
After the first stage, the visual features are mapped and can be understood by the language
model.
To launch the first stage training, run the following command. In our experiments, we use 4 A100s.
You can change the save path in the config file
[train_configs/minigpt4_stage1_pretrain.yaml](train_configs/minigpt4_stage1_pretrain.yaml)
```bash
torchrun --nproc-per-node NUM_GPU train.py --cfg-path train_configs/minigpt4_stage1_pretrain.yaml
```
A MiniGPT-4 checkpoint with only stage one training can be downloaded
[here (13B)](https://drive.google.com/file/d/1u9FRRBB3VovP1HxCAlpD9Lw4t4P6-Yq8/view?usp=share_link) or [here (7B)](https://drive.google.com/file/d/1HihQtCEXUyBM1i9DQbaK934wW3TZi-h5/view?usp=share_link).
Compared to the model after stage two, this checkpoint frequently generates incomplete and repeated sentences.
**2. Second finetuning stage**
In the second stage, we use a small high quality image-text pair dataset created by ourselves
and convert it to a conversation format to further align MiniGPT-4.
To download and prepare our second stage dataset, please check our
[second stage dataset preparation instruction](dataset/README_2_STAGE.md).
To launch the second stage alignment,
first specify the path to the checkpoint file trained in stage 1 in
[train_configs/minigpt4_stage2_finetune.yaml](train_configs/minigpt4_stage2_finetune.yaml).
You can also specify the output path there.
Then, run the following command. In our experiments, we use 1 A100.
```bash
torchrun --nproc-per-node NUM_GPU train.py --cfg-path train_configs/minigpt4_stage2_finetune.yaml
```
After the second stage alignment, MiniGPT-4 is able to talk about the image coherently and in a user-friendly way.
## Acknowledgement
+ [BLIP2](https://huggingface.co/docs/transformers/main/model_doc/blip-2) The model architecture of MiniGPT-4 follows BLIP-2. Don't forget to check out this great open-source work if you haven't seen it before!
+ [Lavis](https://github.com/salesforce/LAVIS) This repository is built upon Lavis!
+ [Vicuna](https://github.com/lm-sys/FastChat) The fantastic language ability of Vicuna with only 13B parameters is just amazing. And it is open-source!
If you're using MiniGPT-4 in your research or applications, please cite using this BibTeX:
```bibtex
@article{zhu2023minigpt,
title={MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large Language Models},
author={Zhu, Deyao and Chen, Jun and Shen, Xiaoqian and Li, Xiang and Elhoseiny, Mohamed},
journal={arXiv preprint arXiv:2304.10592},
year={2023}
}
```
## License
This repository is under [BSD 3-Clause License](LICENSE.md).
Many codes are based on [Lavis](https://github.com/salesforce/LAVIS) with
BSD 3-Clause License [here](LICENSE_Lavis.md).

100
README_cn.md Normal file

@ -0,0 +1,100 @@
## miniGPT4 🚀🚀
This project modifies demo.py and conversation.py to support direct text conversation without uploading an image first.
**TODO: Support answering over multiple uploaded images.**
Demo:
![show](./examples/e5b0d467fa14e2aa9b77a46b828a4e0.png)
The following is the environment setup process. If you have already configured it, skip the environment setup and run demo.py directly.
The [official repo](https://github.com/Vision-CAIR/MiniGPT-4)
provides [13B](https://drive.google.com/file/d/1a4zLvaiDBr-36pasffmgpvH5P7CKmpze/view) and [7B](https://drive.google.com/file/d/1a4zLvaiDBr-36pasffmgpvH5P7CKmpze/view) checkpoints that can be used for fine-tuning.
**Requirements**
The official setup uses A100 GPUs.
* 13B: 23 GB GPU memory
* 7B: 11.5 GB GPU memory
### Steps
**0. Environment setup**
Pull the existing Docker [image](https://hub.docker.com/r/bewithmeallmylife/mini-gpt4-runtime-cuda-10.2):
```commandline
docker pull bewithmeallmylife/mini-gpt4-runtime-cuda-10.2:1.0.0
```
Create the container, exposing the corresponding port so the frontend UI can be launched for local use:
```commandline
nvidia-docker run -v /data:/projects -v /data2:/data2 -p 1118:7778 --shm-size 8G --name minigpt4 -d bewithmeallmylife/mini-gpt4-runtime-cuda-10.2:1.0.0 tail -f /dev/null
```
Enter the container:
```commandline
docker exec -it minigpt4 bash
```
Activate the conda virtual environment **mini-gpt4**:
```commandline
conda activate mini-gpt4
```
The inference environment required by miniGPT4 is already included in the image, with PyTorch 1.12.1+cu10.2, which does not support the sm86 compute capability. If the GPU is an RTX A6000 (compute capability 8.6), a build that supports this architecture, such as torch 1.12.1+cu11.3, needs to be reinstalled:
```commandline
conda install pytorch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1 cudatoolkit=11.3 -c pytorch
```
**1. Prepare the pretrained model weights**
Three pretrained model weights need to be prepared in total: **vicuna** (7B/14G), **llama** (7B/12.5G), and **miniGPT4** (7B).
* vicuna
Pretrained weights in two sizes, 13B and 7B, are available for download:
```commandline
git lfs install
git clone https://huggingface.co/lmsys/vicuna-13b-delta-v0 # more powerful, need at least 24G gpu memory
# or
git clone https://huggingface.co/lmsys/vicuna-7b-delta-v0 # smaller, need 12G gpu memory
```
These two delta weights are not directly usable; they must be combined with the llama weights.
* Download the llama weights as follows. They are not officially released for download, so a third-party mirror is used; choose 7B or 13B:
```commandline
wget https://agi.gpt4.org/llama/LLaMA/tokenizer.model -O ./tokenizer.model
wget https://agi.gpt4.org/llama/LLaMA/tokenizer_checklist.chk -O ./tokenizer_checklist.chk
wget https://agi.gpt4.org/llama/LLaMA/7B/consolidated.00.pth -O ./7B/consolidated.00.pth
wget https://agi.gpt4.org/llama/LLaMA/7B/params.json -O ./7B/params.json
wget https://agi.gpt4.org/llama/LLaMA/7B/checklist.chk -O ./7B/checklist.chk
wget https://agi.gpt4.org/llama/LLaMA/13B/consolidated.00.pth -O ./13B/consolidated.00.pth
wget https://agi.gpt4.org/llama/LLaMA/13B/consolidated.01.pth -O ./13B/consolidated.01.pth
wget https://agi.gpt4.org/llama/LLaMA/13B/params.json -O ./13B/params.json
wget https://agi.gpt4.org/llama/LLaMA/13B/checklist.chk -O ./13B/checklist.chk
```
After downloading the llama weights, they also need to be converted to the Hugging Face model format:
```commandline
git clone https://github.com/huggingface/transformers.git
python transformers/src/transformers/models/llama/convert_llama_weights_to_hf.py \
--input_dir /path/to/downloaded/llama/weights --model_size 7B --output_dir /output/path/to/llama-13bOR7b-hf/
```
Once both the vicuna and llama weights are ready, they need to be combined to obtain the usable vicuna weights:
```commandline
pip install git+https://github.com/lm-sys/FastChat.git@v0.1.10
python -m fastchat.model.apply_delta --base /path/to/llama-13bOR7b-hf/ --target /path/to/save/working/vicuna/weight/ --delta /path/to/vicuna-13bOR7b-delta-v0/
```
This finally yields a usable weight; its folder structure is as follows:
```commandline
vicuna_weights
├── config.json
├── generation_config.json
├── pytorch_model.bin.index.json
├── pytorch_model-00001-of-00003.bin
...
```
Add the path of this weight folder to the config file minigpt4/configs/models/minigpt4.yaml at line 16.
* Download the minigpt4 pretrained checkpoints
[13B checkpoint](https://drive.google.com/file/d/1a4zLvaiDBr-36pasffmgpvH5P7CKmpze/view?usp=share_link)
[7B checkpoint](https://drive.google.com/file/d/1RY9jV0dyqLX-o38LrumkKRh6Jtaop58R/view?usp=sharing)
Add the path of the downloaded checkpoint to the config file eval_configs/minigpt4_eval.yaml at line 11.
**2. Run demo.py**
```commandline
python demo.py --cfg-path eval_configs/minigpt4_eval.yaml --gpu-id 0
```

170
README_official.md Normal file

@ -0,0 +1,170 @@
# MiniGPT-4: Enhancing Vision-language Understanding with Advanced Large Language Models
[Deyao Zhu](https://tsutikgiau.github.io/)* (On Job Market!), [Jun Chen](https://junchen14.github.io/)* (On Job Market!), [Xiaoqian Shen](https://xiaoqian-shen.github.io), [Xiang Li](https://xiangli.ac.cn), and [Mohamed Elhoseiny](https://www.mohamed-elhoseiny.com/). *Equal Contribution
**King Abdullah University of Science and Technology**
<a href='https://minigpt-4.github.io'><img src='https://img.shields.io/badge/Project-Page-Green'></a> <a href='https://arxiv.org/abs/2304.10592'><img src='https://img.shields.io/badge/Paper-Arxiv-red'></a> <a href='https://huggingface.co/spaces/Vision-CAIR/minigpt4'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue'></a> <a href='https://huggingface.co/Vision-CAIR/MiniGPT-4'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Model-blue'></a> [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1OK4kYsZphwt5DXchKkzMBjYF6jnkqh4R?usp=sharing) [![YouTube](https://badges.aleen42.com/src/youtube.svg)](https://www.youtube.com/watch?v=__tftoxpBAw&feature=youtu.be)
## News
We now provide a pretrained MiniGPT-4 aligned with Vicuna-7B! The demo GPU memory consumption now can be as low as 12GB.
## Online Demo
Click the image to chat with MiniGPT-4 around your images
[![demo](figs/online_demo.png)](https://minigpt-4.github.io)
## Examples
| | |
:-------------------------:|:-------------------------:
![find wild](figs/examples/wop_2.png) | ![write story](figs/examples/ad_2.png)
![solve problem](figs/examples/fix_1.png) | ![write Poem](figs/examples/rhyme_1.png)
More examples can be found in the [project page](https://minigpt-4.github.io).
## Introduction
- MiniGPT-4 aligns a frozen visual encoder from BLIP-2 with a frozen LLM, Vicuna, using just one projection layer (a minimal code sketch of this idea follows the overview figure below).
- We train MiniGPT-4 with two stages. The first traditional pretraining stage is trained using roughly 5 million aligned image-text pairs in 10 hours using 4 A100s. After the first stage, Vicuna is able to understand the image. But the generation ability of Vicuna is heavily impacted.
- To address this issue and improve usability, we propose a novel way to create high-quality image-text pairs by the model itself and ChatGPT together. Based on this, we then create a small (3500 pairs in total) yet high-quality dataset.
- The second finetuning stage is trained on this dataset in a conversation template to significantly improve its generation reliability and overall usability. To our surprise, this stage is computationally efficient and takes only around 7 minutes with a single A100.
- MiniGPT-4 yields many emerging vision-language capabilities similar to those demonstrated in GPT-4.
![overview](figs/overview.png)
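The single projection layer mentioned above is conceptually just a linear map from the frozen Q-Former's output space into Vicuna's token-embedding space; everything else stays frozen. Below is a minimal, illustrative sketch of that idea (the class name and dimensions are assumptions for illustration, not MiniGPT-4's actual module):
```python
import torch
import torch.nn as nn


class VisionToLLMProjector(nn.Module):
    """Illustrative stand-in for MiniGPT-4's single trainable projection layer."""

    def __init__(self, qformer_dim: int = 768, llm_dim: int = 5120):
        super().__init__()
        # the only trainable piece: map frozen visual query tokens to the LLM embedding size
        self.proj = nn.Linear(qformer_dim, llm_dim)

    def forward(self, query_tokens: torch.Tensor) -> torch.Tensor:
        # query_tokens: (batch, num_query_tokens, qformer_dim) from the frozen BLIP-2 Q-Former
        # returns:      (batch, num_query_tokens, llm_dim), ready to be spliced into the LLM input embeddings
        return self.proj(query_tokens)


# example: 32 visual query tokens projected into a 13B Vicuna's 5120-dim embedding space
img_queries = torch.randn(1, 32, 768)
print(VisionToLLMProjector()(img_queries).shape)  # torch.Size([1, 32, 5120])
```
In the two-stage training described in this README, only this projection receives gradients; the projected visual tokens are inserted into the LLM's input embeddings at the image placeholder position.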
## Getting Started
### Installation
**1. Prepare the code and the environment**
Git clone our repository, create a Python environment, and activate it via the following commands:
```bash
git clone https://github.com/Vision-CAIR/MiniGPT-4.git
cd MiniGPT-4
conda env create -f environment.yml
conda activate minigpt4
```
**2. Prepare the pretrained Vicuna weights**
The current version of MiniGPT-4 is built on the v0 version of Vicuna-13B.
Please refer to our instruction [here](PrepareVicuna.md)
to prepare the Vicuna weights.
The final weights would be in a single folder in a structure similar to the following:
```
vicuna_weights
├── config.json
├── generation_config.json
├── pytorch_model.bin.index.json
├── pytorch_model-00001-of-00003.bin
...
```
Then, set the path to the vicuna weight in the model config file
[here](minigpt4/configs/models/minigpt4.yaml#L16) at Line 16.
**3. Prepare the pretrained MiniGPT-4 checkpoint**
Download the pretrained checkpoints according to the Vicuna model you prepare.
| Checkpoint Aligned with Vicuna 13B | Checkpoint Aligned with Vicuna 7B |
:------------------------------------------------------------------------------------------------:|:----------------------------------------------------------------------------------------------:
[Download](https://drive.google.com/file/d/1a4zLvaiDBr-36pasffmgpvH5P7CKmpze/view?usp=share_link) | [Download](https://drive.google.com/file/d/1RY9jV0dyqLX-o38LrumkKRh6Jtaop58R/view?usp=sharing)
Then, set the path to the pretrained checkpoint in the evaluation config file
in [eval_configs/minigpt4_eval.yaml](eval_configs/minigpt4_eval.yaml#L10) at Line 11.
### Launching Demo Locally
Try out our demo [demo.py](demo.py) on your local machine by running
```
python demo.py --cfg-path eval_configs/minigpt4_eval.yaml --gpu-id 0
```
To save GPU memory, Vicuna is loaded in 8-bit by default, with a beam search width of 1.
This configuration requires about 23G GPU memory for Vicuna 13B and 11.5G GPU memory for Vicuna 7B.
For more powerful GPUs, you can run the model
in 16 bit by setting low_resource to False in the config file
[minigpt4_eval.yaml](eval_configs/minigpt4_eval.yaml) and use a larger beam search width.
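For reference, 8-bit loading of a Vicuna/LLaMA-style checkpoint is typically enabled through bitsandbytes via the transformers loading call. The snippet below is an illustrative sketch of what the low_resource switch corresponds to, not MiniGPT-4's exact loading code, and the checkpoint path is a placeholder:
```python
# illustrative only: 8-bit vs. 16-bit loading of a Vicuna checkpoint with transformers
import torch
from transformers import LlamaForCausalLM

low_resource = True  # mirrors the low_resource flag in eval_configs/minigpt4_eval.yaml

if low_resource:
    # 8-bit weights via bitsandbytes, kept on a single GPU
    model = LlamaForCausalLM.from_pretrained(
        "/path/to/vicuna_weights", load_in_8bit=True, device_map={"": 0}
    )
else:
    # full 16-bit weights; needs roughly twice the GPU memory
    model = LlamaForCausalLM.from_pretrained(
        "/path/to/vicuna_weights", torch_dtype=torch.float16
    ).to("cuda:0")
```
The 8-bit path roughly halves the weight memory compared to float16, at some cost in generation speed.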
Thanks to [@WangRongsheng](https://github.com/WangRongsheng), you can also run our code on [Colab](https://colab.research.google.com/drive/1OK4kYsZphwt5DXchKkzMBjYF6jnkqh4R?usp=sharing)
### Training
The training of MiniGPT-4 contains two alignment stages.
**1. First pretraining stage**
In the first pretraining stage, the model is trained using image-text pairs from Laion and CC datasets
to align the vision and language model. To download and prepare the datasets, please check
our [first stage dataset preparation instruction](dataset/README_1_STAGE.md).
After the first stage, the visual features are mapped and can be understood by the language
model.
To launch the first stage training, run the following command. In our experiments, we use 4 A100s.
You can change the save path in the config file
[train_configs/minigpt4_stage1_pretrain.yaml](train_configs/minigpt4_stage1_pretrain.yaml)
```bash
torchrun --nproc-per-node NUM_GPU train.py --cfg-path train_configs/minigpt4_stage1_pretrain.yaml
```
A MiniGPT-4 checkpoint with only stage one training can be downloaded
[here (13B)](https://drive.google.com/file/d/1u9FRRBB3VovP1HxCAlpD9Lw4t4P6-Yq8/view?usp=share_link) or [here (7B)](https://drive.google.com/file/d/1HihQtCEXUyBM1i9DQbaK934wW3TZi-h5/view?usp=share_link).
Compared to the model after stage two, this checkpoint frequently generates incomplete and repeated sentences.
**2. Second finetuning stage**
In the second stage, we use a small high quality image-text pair dataset created by ourselves
and convert it to a conversation format to further align MiniGPT-4.
To download and prepare our second stage dataset, please check our
[second stage dataset preparation instruction](dataset/README_2_STAGE.md).
To launch the second stage alignment,
first specify the path to the checkpoint file trained in stage 1 in
[train_configs/minigpt4_stage2_finetune.yaml](train_configs/minigpt4_stage2_finetune.yaml).
You can also specify the output path there.
Then, run the following command. In our experiments, we use 1 A100.
```bash
torchrun --nproc-per-node NUM_GPU train.py --cfg-path train_configs/minigpt4_stage2_finetune.yaml
```
After the second stage alignment, MiniGPT-4 is able to talk about the image coherently and in a user-friendly way.
## Acknowledgement
+ [BLIP2](https://huggingface.co/docs/transformers/main/model_doc/blip-2) The model architecture of MiniGPT-4 follows BLIP-2. Don't forget to check out this great open-source work if you haven't seen it before!
+ [Lavis](https://github.com/salesforce/LAVIS) This repository is built upon Lavis!
+ [Vicuna](https://github.com/lm-sys/FastChat) The fantastic language ability of Vicuna with only 13B parameters is just amazing. And it is open-source!
If you're using MiniGPT-4 in your research or applications, please cite using this BibTeX:
```bibtex
@article{zhu2023minigpt,
title={MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large Language Models},
author={Zhu, Deyao and Chen, Jun and Shen, Xiaoqian and Li, Xiang and Elhoseiny, Mohamed},
journal={arXiv preprint arXiv:2304.10592},
year={2023}
}
```
## License
This repository is under [BSD 3-Clause License](LICENSE.md).
Many codes are based on [Lavis](https://github.com/salesforce/LAVIS) with
BSD 3-Clause License [here](LICENSE_Lavis.md).

38
demo.py

@ -28,8 +28,8 @@ def parse_args():
"--options",
nargs="+",
help="override some settings in the used config, the key-value pair "
"in xxx=yyy format will be merged into config file (deprecate), "
"change to --cfg-options instead.",
"in xxx=yyy format will be merged into config file (deprecate), "
"change to --cfg-options instead.",
)
args = parser.parse_args()
return args
@ -64,6 +64,7 @@ vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config
chat = Chat(model, vis_processor, device='cuda:{}'.format(args.gpu_id))
print('Initialization Finished')
# ========================================
# Gradio Setting
# ========================================
@ -73,7 +74,10 @@ def gradio_reset(chat_state, img_list):
chat_state.messages = []
if img_list is not None:
img_list = []
return None, gr.update(value=None, interactive=True), gr.update(placeholder='Please upload your image first', interactive=False),gr.update(value="Upload & Start Chat", interactive=True), chat_state, img_list
return None, gr.update(value=None, interactive=True), gr.update(placeholder="chat with me",
interactive=True), gr.update(
value="Upload & Start Chat", interactive=True), chat_state, img_list
def upload_img(gr_img, text_input, chat_state):
if gr_img is None:
@ -81,11 +85,16 @@ def upload_img(gr_img, text_input, chat_state):
chat_state = CONV_VISION.copy()
img_list = []
llm_message = chat.upload_img(gr_img, chat_state, img_list)
return gr.update(interactive=False), gr.update(interactive=True, placeholder='Type and press Enter'), gr.update(value="Start Chatting", interactive=False), chat_state, img_list
return gr.update(interactive=True), gr.update(interactive=True, placeholder='Type and press Enter'), gr.update(
value="Upload img", interactive=True), chat_state, img_list
def gradio_ask(user_message, chatbot, chat_state):
if len(user_message) == 0:
return gr.update(interactive=True, placeholder='Input should not be empty!'), chatbot, chat_state
# support text-only chat: chat_state is None when the user starts typing without uploading an image
if chat_state is None:
chat_state = CONV_VISION.copy()
chat.ask(user_message, chat_state)
chatbot = chatbot + [[user_message, None]]
return '', chatbot, chat_state
@ -101,12 +110,13 @@ def gradio_answer(chatbot, chat_state, img_list, num_beams, temperature):
chatbot[-1][1] = llm_message
return chatbot, chat_state, img_list
title = """<h1 align="center">Demo of MiniGPT-4</h1>"""
description = """<h3>This is the demo of MiniGPT-4. Upload your images and start chatting!</h3>"""
article = """<p><a href='https://minigpt-4.github.io'><img src='https://img.shields.io/badge/Project-Page-Green'></a></p><p><a href='https://github.com/Vision-CAIR/MiniGPT-4'><img src='https://img.shields.io/badge/Github-Code-blue'></a></p><p><a href='https://raw.githubusercontent.com/Vision-CAIR/MiniGPT-4/main/MiniGPT_4.pdf'><img src='https://img.shields.io/badge/Paper-PDF-red'></a></p>
"""
#TODO show examples below
# TODO show examples below
with gr.Blocks() as demo:
gr.Markdown(title)
@ -118,7 +128,7 @@ with gr.Blocks() as demo:
image = gr.Image(type="pil")
upload_button = gr.Button(value="Upload & Start Chat", interactive=True, variant="primary")
clear = gr.Button("Restart")
num_beams = gr.Slider(
minimum=1,
maximum=10,
@ -127,7 +137,7 @@ with gr.Blocks() as demo:
interactive=True,
label="beam search numbers)",
)
temperature = gr.Slider(
minimum=0.1,
maximum=2.0,
@ -141,13 +151,15 @@ with gr.Blocks() as demo:
chat_state = gr.State()
img_list = gr.State()
chatbot = gr.Chatbot(label='MiniGPT-4')
text_input = gr.Textbox(label='User', placeholder='Please upload your image first', interactive=False)
upload_button.click(upload_img, [image, text_input, chat_state], [image, text_input, upload_button, chat_state, img_list])
text_input = gr.Textbox(label='User', placeholder='chat with me', interactive=True)
upload_button.click(upload_img, [image, text_input, chat_state],
[image, text_input, upload_button, chat_state, img_list])
# print(img_list)
text_input.submit(gradio_ask, [text_input, chatbot, chat_state], [text_input, chatbot, chat_state]).then(
gradio_answer, [chatbot, chat_state, img_list, num_beams, temperature], [chatbot, chat_state, img_list]
)
clear.click(gradio_reset, [chat_state, img_list], [chatbot, image, text_input, upload_button, chat_state, img_list], queue=False)
clear.click(gradio_reset, [chat_state, img_list], [chatbot, image, text_input, upload_button, chat_state, img_list],
queue=False)
demo.launch(share=True, enable_queue=True)
demo.launch(server_name="0.0.0.0", server_port=7778, share=True, enable_queue=True)

examples/e5b0d467fa14e2aa9b77a46b828a4e0.png Normal file (binary image, 63 KiB; not shown)

minigpt4/conversation/conversation.py

@ -117,7 +117,6 @@ CONV_VISION = Conversation(
)
class Chat:
def __init__(self, model, vis_processor, device='cuda:0'):
self.device = device
@ -131,6 +130,8 @@ class Chat:
if len(conv.messages) > 0 and conv.messages[-1][0] == conv.roles[0] \
and conv.messages[-1][1][-6:] == '</Img>': # last message is image.
conv.messages[-1][1] = ' '.join([conv.messages[-1][1], text])
elif len(conv.messages) == 0:
# first turn without an uploaded image: still insert the <Img><ImageHere></Img>
# placeholder so the prompt format stays consistent with the image case
conv.append_message(conv.roles[0], " <Img><ImageHere></Img> " + text)
else:
conv.append_message(conv.roles[0], text)
@ -181,7 +182,7 @@ class Chat:
if len(image.shape) == 3:
image = image.unsqueeze(0)
image = image.to(self.device)
print(image.shape)
image_emb, _ = self.model.encode_img(image)
img_list.append(image_emb)
conv.append_message(conv.roles[0], "<Img><ImageHere></Img>")
@ -189,9 +190,16 @@ class Chat:
# self.conv.append_message(self.conv.roles[1], msg)
return msg
def get_context_emb(self, conv, img_list):
def get_context_emb(self, conv, img_list=None):
if img_list is None:
# text-only conversation: no image was uploaded, so encode a zero placeholder
# image to give the prompt's <ImageHere> slot a dummy embedding
img = torch.zeros((1, 3, 224, 224)).to(self.device)
image_emb, _ = self.model.encode_img(img)
img_list = [image_emb]
prompt = conv.get_prompt()
print(prompt)
prompt_segs = prompt.split('<ImageHere>')
assert len(prompt_segs) == len(img_list) + 1, "Unmatched numbers of image placeholders and images."
seg_tokens = [
self.model.llama_tokenizer(
@ -200,6 +208,7 @@ class Chat:
for i, seg in enumerate(prompt_segs)
]
seg_embs = [self.model.llama_model.model.embed_tokens(seg_t) for seg_t in seg_tokens]
mixed_embs = [emb for pair in zip(seg_embs[:-1], img_list) for emb in pair] + [seg_embs[-1]]
mixed_embs = torch.cat(mixed_embs, dim=1)
return mixed_embs
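Taken together, the changes above are what enable image-free chat: demo.py creates the conversation state lazily when the user types without uploading an image, ask() still inserts an `<Img><ImageHere></Img>` placeholder on the first turn, and get_context_emb() falls back to a zero-image embedding when img_list is None. A minimal usage sketch (assuming `chat` and CONV_VISION are initialized exactly as in demo.py) looks like this:
```python
# minimal text-only conversation sketch; `chat` and CONV_VISION come from the demo.py setup above
chat_state = CONV_VISION.copy()

chat.ask("Hello! Please introduce yourself.", chat_state)  # no image uploaded beforehand
context_emb = chat.get_context_emb(chat_state)             # img_list defaults to None -> zero-image placeholder
print(context_emb.shape)                                   # (1, prompt_length, llm_hidden_size)
```
In the Gradio demo, the reply itself is then generated inside gradio_answer (not fully shown in this diff), which builds on the same context path.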