diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..359bb53
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,3 @@
+# 默认忽略的文件
+/shelf/
+/workspace.xml
diff --git a/.idea/MiniGPT-4.iml b/.idea/MiniGPT-4.iml
new file mode 100644
index 0000000..8b8c395
--- /dev/null
+++ b/.idea/MiniGPT-4.iml
@@ -0,0 +1,12 @@
+ + + + + + + + + + \ No newline at end of file
diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml
new file mode 100644
index 0000000..4448092
--- /dev/null
+++ b/.idea/inspectionProfiles/Project_Default.xml
@@ -0,0 +1,78 @@
+ + + + \ No newline at end of file
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000..105ce2d
--- /dev/null
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+ + + + \ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..8ab91a0
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+ + + + + + + + \ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..94a25f7
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+ + + + + + \ No newline at end of file
diff --git a/README.md b/README.md
index 7aa29f2..5525986 100644
--- a/README.md
+++ b/README.md
@@ -1,65 +1,91 @@
-# MiniGPT-4: Enhancing Vision-language Understanding with Advanced Large Language Models
-[Deyao Zhu](https://tsutikgiau.github.io/)* (On Job Market!), [Jun Chen](https://junchen14.github.io/)* (On Job Market!), [Xiaoqian Shen](https://xiaoqian-shen.github.io), [Xiang Li](https://xiangli.ac.cn), and [Mohamed Elhoseiny](https://www.mohamed-elhoseiny.com/). *Equal Contribution
+# miniGPT4
+[Chinese version](README_cn.md)

🚀🚀

+This project modifies demo.py and conversation.py so that you can chat in plain text directly, without having to upload an image first.
-**King Abdullah University of Science and Technology**
+**TODO: Support uploading multiple images.**
- [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1OK4kYsZphwt5DXchKkzMBjYF6jnkqh4R?usp=sharing) [![YouTube](https://badges.aleen42.com/src/youtube.svg)](https://www.youtube.com/watch?v=__tftoxpBAw&feature=youtu.be)
+Example:
+![show](./examples/e5b0d467fa14e2aa9b77a46b828a4e0.png)
+The following describes how to configure the project environment. If you have already done this, skip the environment setup and run demo.py directly.
-## News
-We now provide a pretrained MiniGPT-4 aligned with Vicuna-7B! The demo GPU memory consumption now can be as low as 12GB.
+The [official repo](https://github.com/Vision-CAIR/MiniGPT-4)
+provides two pretrained checkpoints, [13B](https://drive.google.com/file/d/1a4zLvaiDBr-36pasffmgpvH5P7CKmpze/view) and [7B](https://drive.google.com/file/d/1a4zLvaiDBr-36pasffmgpvH5P7CKmpze/view), that can be used for fine-tuning.
+**GPU memory needed**
-## Online Demo
+* 13B: 23G
+* 7B: 11.5G
-Click the image to chat with MiniGPT-4 around your images
-[![demo](figs/online_demo.png)](https://minigpt-4.github.io)
+### Getting started
+**0. Environment setup**
+Make sure Docker is installed first.
-## Examples
- |   |   |
-:-------------------------:|:-------------------------:
-![find wild](figs/examples/wop_2.png) |  ![write story](figs/examples/ad_2.png)
-![solve problem](figs/examples/fix_1.png)  |  ![write Poem](figs/examples/rhyme_1.png)
+Pull the [docker image](https://hub.docker.com/r/bewithmeallmylife/mini-gpt4-runtime-cuda-10.2):
+```commandline
+docker pull bewithmeallmylife/mini-gpt4-runtime-cuda-10.2:1.0.0
+```
+Create the container and expose the corresponding port so the frontend UI can be reached locally.
+```commandline
+nvidia-docker run -v /data:/projects -v /data2:/data2 -p 1118:7778 --shm-size 8G --name minigpt4 -d bewithmeallmylife/mini-gpt4-runtime-cuda-10.2:1.0.0 tail -f /dev/null
+```
+Enter the container:
+```commandline
+docker exec -it minigpt4 bash
+```
+The conda environment **mini-gpt4** already exists in the image; activate it:
+```commandline
+conda activate mini-gpt4
+```
+Note: make sure torch is compatible with your CUDA version. In this image the torch version is 1.12.1+cu10.2;
+if your CUDA version is 11 or above, perform the reinstall step below.
-More examples can be found in the [project page](https://minigpt-4.github.io).
-
-
-## Introduction
-- MiniGPT-4 aligns a frozen visual encoder from BLIP-2 with a frozen LLM, Vicuna, using just one projection layer.
-- We train MiniGPT-4 with two stages. The first traditional pretraining stage is trained using roughly 5 million aligned image-text pairs in 10 hours using 4 A100s. After the first stage, Vicuna is able to understand the image. But the generation ability of Vicuna is heavilly impacted.
-- To address this issue and improve usability, we propose a novel way to create high-quality image-text pairs by the model itself and ChatGPT together. Based on this, we then create a small (3500 pairs in total) yet high-quality dataset.
-- The second finetuning stage is trained on this dataset in a conversation template to significantly improve its generation reliability and overall usability. To our surprise, this stage is computationally efficient and takes only around 7 minutes with a single A100.
-- MiniGPT-4 yields many emerging vision-language capabilities similar to those demonstrated in GPT-4.
-
-![overview](figs/overview.png)
-
-## Getting Started
-### Installation
-
-**1. Prepare the code and the environment**
-
-Git clone our repository, creating a python environment and ativate it via the following command
-
-```bash
-git clone https://github.com/Vision-CAIR/MiniGPT-4.git
-cd MiniGPT-4
-conda env create -f environment.yml
-conda activate minigpt4
-```
+The inference environment required by miniGPT4 is already included in the image, with PyTorch 1.12.1+cu10.2, which does not support the sm_86 architecture. If your GPU is, for example, an RTX A6000 (compute capability 8.6), reinstall a build that supports it, such as torch 1.12.1+cu11.3:
+```commandline
+conda install pytorch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1 cudatoolkit=11.3 -c pytorch
+```
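If you are unsure whether the reinstall is needed, here is an optional check you can run inside the container. It is not part of the original instructions and only uses standard PyTorch introspection calls:

```python
# Optional check: does the installed PyTorch build support this GPU's architecture?
import torch

print("torch:", torch.__version__, "| built for CUDA:", torch.version.cuda)
print("CUDA available:", torch.cuda.is_available())

if torch.cuda.is_available():
    major, minor = torch.cuda.get_device_capability(0)
    arch = f"sm_{major}{minor}"
    supported = torch.cuda.get_arch_list()  # e.g. ['sm_37', 'sm_50', ..., 'sm_70'] for a cu10.2 build
    # If arch (e.g. sm_86 on an RTX A6000) is missing from the list,
    # reinstall the cu11.x build shown above.
    print("GPU arch:", arch, "| supported by this build:", arch in supported)
```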
+**1. Prepare the pretrained model weights.**
-**2. Prepare the pretrained Vicuna weights**
-
-The current version of MiniGPT-4 is built on the v0 versoin of Vicuna-13B.
-Please refer to our instruction [here](PrepareVicuna.md)
-to prepare the Vicuna weights.
-The final weights would be in a single folder in a structure similar to the following:
+A total of three pretrained model weights need to be prepared: **vicuna** (7B/14G), **llama** (7B/12.5G), and **miniGPT4** (7B).
+* vicuna
+Pretrained weights in two sizes, 13B and 7B, are available for download.
+```commandline
+git lfs install
+git clone https://huggingface.co/lmsys/vicuna-13b-delta-v0 # more powerful, needs at least 24G GPU memory
+# or
+git clone https://huggingface.co/lmsys/vicuna-7b-delta-v0 # smaller, needs 12G GPU memory
+```
+These two downloads are delta weights and are not usable on their own; the llama pretrained weights are still needed.
+* llama
+
+The llama weights are not officially available for download, so a third-party source is used. Choose 7B or 13B.
+```commandline
+wget https://agi.gpt4.org/llama/LLaMA/tokenizer.model -O ./tokenizer.model
+wget https://agi.gpt4.org/llama/LLaMA/tokenizer_checklist.chk -O ./tokenizer_checklist.chk
+wget https://agi.gpt4.org/llama/LLaMA/7B/consolidated.00.pth -O ./7B/consolidated.00.pth
+wget https://agi.gpt4.org/llama/LLaMA/7B/params.json -O ./7B/params.json
+wget https://agi.gpt4.org/llama/LLaMA/7B/checklist.chk -O ./7B/checklist.chk
+wget https://agi.gpt4.org/llama/LLaMA/13B/consolidated.00.pth -O ./13B/consolidated.00.pth
+wget https://agi.gpt4.org/llama/LLaMA/13B/consolidated.01.pth -O ./13B/consolidated.01.pth
+wget https://agi.gpt4.org/llama/LLaMA/13B/params.json -O ./13B/params.json
+wget https://agi.gpt4.org/llama/LLaMA/13B/checklist.chk -O ./13B/checklist.chk
+```
+After downloading llama, convert it to the Hugging Face format:
+```commandline
+git clone https://github.com/huggingface/transformers.git
+python transformers/src/transformers/models/llama/convert_llama_weights_to_hf.py \
+    --input_dir /path/to/downloaded/llama/weights --model_size 7B --output_dir /output/path/to/llama-13bOR7b-hf/
+```
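Before running the delta-merge step that follows, it can be worth sanity-checking the converted folder. This optional snippet is not from the original guide; it assumes a transformers version with LLaMA support (the source checkout cloned above works) and uses the placeholder output path from the conversion command:

```python
# Optional sanity check on the converted Hugging Face llama folder.
from transformers import AutoConfig, AutoTokenizer

hf_dir = "/output/path/to/llama-13bOR7b-hf/"  # the --output_dir placeholder used above
cfg = AutoConfig.from_pretrained(hf_dir)
tok = AutoTokenizer.from_pretrained(hf_dir, use_fast=False)

print(cfg.model_type, "|", cfg.num_hidden_layers, "layers")  # expect "llama"; 32 layers for 7B
print("vocab size:", tok.vocab_size)                         # expect 32000 for LLaMA
```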
+Once both the vicuna delta and the converted llama weights are prepared, combine them to obtain the usable vicuna weights:
+```commandline
+pip install git+https://github.com/lm-sys/FastChat.git@v0.1.10
+python -m fastchat.model.apply_delta --base /path/to/llama-13bOR7b-hf/ --target /path/to/save/working/vicuna/weight/ --delta /path/to/vicuna-13bOR7b-delta-v0/
+```
+Finally, you get a working weight; the folder structure looks like this:
+```commandline
vicuna_weights
├── config.json
├── generation_config.json
@@ -67,104 +93,16 @@ vicuna_weights
├── pytorch_model.bin.index.json
├── pytorch_model-00001-of-00003.bin
...
```
+Then, set the path to the vicuna weight in the model config file [here](minigpt4/configs/models/minigpt4.yaml) at Line 16.
+* minigpt4 checkpoints
-Then, set the path to the vicuna weight in the model config file
-[here](minigpt4/configs/models/minigpt4.yaml#L16) at Line 16.
+[13B checkpoint](https://drive.google.com/file/d/1a4zLvaiDBr-36pasffmgpvH5P7CKmpze/view?usp=share_link)
+[7B checkpoint](https://drive.google.com/file/d/1RY9jV0dyqLX-o38LrumkKRh6Jtaop58R/view?usp=sharing)
-**3. Prepare the pretrained MiniGPT-4 checkpoint**
-
-Download the pretrained checkpoints according to the Vicuna model you prepare.
-| Checkpoint Aligned with Vicuna 13B | Checkpoint Aligned with Vicuna 7B |
-:------------------------------------------------------------------------------------------------:|:----------------------------------------------------------------------------------------------:
- [Downlad](https://drive.google.com/file/d/1a4zLvaiDBr-36pasffmgpvH5P7CKmpze/view?usp=share_link) | [Download](https://drive.google.com/file/d/1RY9jV0dyqLX-o38LrumkKRh6Jtaop58R/view?usp=sharing)
+Then, set the path to the pretrained checkpoint in the evaluation config file in [eval_configs/minigpt4_eval.yaml](eval_configs/minigpt4_eval.yaml) at Line 11.
-Then, set the path to the pretrained checkpoint in the evaluation config file
-in [eval_configs/minigpt4_eval.yaml](eval_configs/minigpt4_eval.yaml#L10) at Line 11.
-
-
-### Launching Demo Locally
-
-Try out our demo [demo.py](demo.py) on your local machine by running
-
-```
+**2. Run demo.py**
+```commandline
python demo.py --cfg-path eval_configs/minigpt4_eval.yaml --gpu-id 0
```
-
-To save GPU memory, Vicuna loads as 8 bit by default, with a beam search width of 1.
-This configuration requires about 23G GPU memory for Vicuna 13B and 11.5G GPU memory for Vicuna 7B.
-For more powerful GPUs, you can run the model
-in 16 bit by setting low_resource to False in the config file
-[minigpt4_eval.yaml](eval_configs/minigpt4_eval.yaml) and use a larger beam search width.
-
-Thanks [@WangRongsheng](https://github.com/WangRongsheng), you can also run our code on [Colab](https://colab.research.google.com/drive/1OK4kYsZphwt5DXchKkzMBjYF6jnkqh4R?usp=sharing)
-
-
-### Training
-The training of MiniGPT-4 contains two alignment stages.
-
-**1. First pretraining stage**
-
-In the first pretrained stage, the model is trained using image-text pairs from Laion and CC datasets
-to align the vision and language model. To download and prepare the datasets, please check
-our [first stage dataset preparation instruction](dataset/README_1_STAGE.md).
-After the first stage, the visual features are mapped and can be understood by the language
-model.
-To launch the first stage training, run the following command. In our experiments, we use 4 A100.
-You can change the save path in the config file -[train_configs/minigpt4_stage1_pretrain.yaml](train_configs/minigpt4_stage1_pretrain.yaml) - -```bash -torchrun --nproc-per-node NUM_GPU train.py --cfg-path train_configs/minigpt4_stage1_pretrain.yaml -``` - -A MiniGPT-4 checkpoint with only stage one training can be downloaded -[here (13B)](https://drive.google.com/file/d/1u9FRRBB3VovP1HxCAlpD9Lw4t4P6-Yq8/view?usp=share_link) or [here (7B)](https://drive.google.com/file/d/1HihQtCEXUyBM1i9DQbaK934wW3TZi-h5/view?usp=share_link). -Compared to the model after stage two, this checkpoint generate incomplete and repeated sentences frequently. - - -**2. Second finetuning stage** - -In the second stage, we use a small high quality image-text pair dataset created by ourselves -and convert it to a conversation format to further align MiniGPT-4. -To download and prepare our second stage dataset, please check our -[second stage dataset preparation instruction](dataset/README_2_STAGE.md). -To launch the second stage alignment, -first specify the path to the checkpoint file trained in stage 1 in -[train_configs/minigpt4_stage1_pretrain.yaml](train_configs/minigpt4_stage2_finetune.yaml). -You can also specify the output path there. -Then, run the following command. In our experiments, we use 1 A100. - -```bash -torchrun --nproc-per-node NUM_GPU train.py --cfg-path train_configs/minigpt4_stage2_finetune.yaml -``` - -After the second stage alignment, MiniGPT-4 is able to talk about the image coherently and user-friendly. - - - - -## Acknowledgement - -+ [BLIP2](https://huggingface.co/docs/transformers/main/model_doc/blip-2) The model architecture of MiniGPT-4 follows BLIP-2. Don't forget to check this great open-source work if you don't know it before! -+ [Lavis](https://github.com/salesforce/LAVIS) This repository is built upon Lavis! -+ [Vicuna](https://github.com/lm-sys/FastChat) The fantastic language ability of Vicuna with only 13B parameters is just amazing. And it is open-source! - - -If you're using MiniGPT-4 in your research or applications, please cite using this BibTeX: -```bibtex -@article{zhu2023minigpt, - title={MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large Language Models}, - author={Zhu, Deyao and Chen, Jun and Shen, Xiaoqian and Li, Xiang and Elhoseiny, Mohamed}, - journal={arXiv preprint arXiv:2304.10592}, - year={2023} -} -``` - - -## License -This repository is under [BSD 3-Clause License](LICENSE.md). -Many codes are based on [Lavis](https://github.com/salesforce/LAVIS) with -BSD 3-Clause License [here](LICENSE_Lavis.md). diff --git a/README_cn.md b/README_cn.md new file mode 100644 index 0000000..65c9ada --- /dev/null +++ b/README_cn.md @@ -0,0 +1,100 @@ + +## miniGPT4

🚀🚀

+本项目修改了demo.py和conversation.py,能够支持直接文本对话,而无需先上传图片。 + +**TODO: 支持多图上传回答** + +演示: +![show](./examples/e5b0d467fa14e2aa9b77a46b828a4e0.png) + +以下是项目的环境配置过程,如果你已经配好了,跳过环境配置的阶段,直接运行demo.py即可 + +[官方](https://github.com/Vision-CAIR/MiniGPT-4) +提供参数量为[13B](https://drive.google.com/file/d/1a4zLvaiDBr-36pasffmgpvH5P7CKmpze/view)和[7B](https://drive.google.com/file/d/1a4zLvaiDBr-36pasffmgpvH5P7CKmpze/view)的checkpoint可供微调 + +**所需配置** + +官方使用A100显卡 +* 13B: 23G显存 +* 7B: 11.5G显存 + +### 步骤 +**0.环境准备** + +拉取已有docker[镜像](https://hub.docker.com/r/bewithmeallmylife/mini-gpt4-runtime-cuda-10.2) +```commandline +docker pull bewithmeallmylife/mini-gpt4-runtime-cuda-10.2:1.0.0 +``` +构建容器, 暴露对应端口,以便启动前端ui在本地使用 +```commandline +nvidia-docker run -v /data:/projects -v /data2:/data2 -p 1118:7778 --shm-size 8G --name minigpt4 -d bewithmeallmylife/mini-gpt4-runtime-cuda-10.2:1.0.0 tail -f /dev/null +``` +进入容器 +```commandline +docker exec -it minigpt4 bash +``` +启动conda虚拟环境**mini-gpt4** +```commandline +conda activate mini-gpt4 +``` +该镜像中miniGPT4所需的推理环境已有,pytorch版本为1.12.1+cu10.2,并不支持sm86的算力,如果显卡型号为RTX A6000,算力为8.6,需重新安装支持该算力的版本如torch1.12.1+cu11.3 +```commandline +conda install pytorch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1 cudatoolkit=11.3 -c pytorch +``` + +**1.准备预训练的模型权重** + +一共需要准备3个预训练的模型权重 vicuna(7B/14G),llama(7B/12.5G),和miniGPT4(7B) +* vicuna +可下载13B和7B两种大小的预训练权重 +```commandline +git lfs install +git clone https://huggingface.co/lmsys/vicuna-13b-delta-v0 # more powerful, need at least 24G gpu memory +# or +git clone https://huggingface.co/lmsys/vicuna-7b-delta-v0 # smaller, need 12G gpu memory +``` +这两个权重还并非可用的,需搭配llama权重激活使用 +* 下载llama权重如下,官方并未开放下载,采用第三方下载形式, 选择7B或13B +```commandline +wget https://agi.gpt4.org/llama/LLaMA/tokenizer.model -O ./tokenizer.model +wget https://agi.gpt4.org/llama/LLaMA/tokenizer_checklist.chk -O ./tokenizer_checklist.chk +wget https://agi.gpt4.org/llama/LLaMA/7B/consolidated.00.pth -O ./7B/consolidated.00.pth +wget https://agi.gpt4.org/llama/LLaMA/7B/params.json -O ./7B/params.json +wget https://agi.gpt4.org/llama/LLaMA/7B/checklist.chk -O ./7B/checklist.chk +wget https://agi.gpt4.org/llama/LLaMA/13B/consolidated.00.pth -O ./13B/consolidated.00.pth +wget https://agi.gpt4.org/llama/LLaMA/13B/consolidated.01.pth -O ./13B/consolidated.01.pth +wget https://agi.gpt4.org/llama/LLaMA/13B/params.json -O ./13B/params.json +wget https://agi.gpt4.org/llama/LLaMA/13B/checklist.chk -O ./13B/checklist.chk +``` +下载完llama权重之后,还需要转换成huggingface的模型格式 +```commandline +git clone https://github.com/huggingface/transformers.git +python transformers/src/transformers/models/llama/convert_llama_weights_to_hf.py \ + --input_dir /path/to/downloaded/llama/weights --model_size 7B --output_dir /output/path/to/llama-13bOR7b-hf/ +``` +当vicuna和llama两个权重都准备好了之后,还需要将它们组合在一起得到能够使用得vicuna权重 +```commandline +pip install git+https://github.com/lm-sys/FastChat.git@v0.1.10 +python -m fastchat.model.apply_delta --base /path/to/llama-13bOR7b-hf/ --target /path/to/save/working/vicuna/weight/ --delta /path/to/vicuna-13bOR7b-delta-v0/ +``` +最终获得一个可以使用的权重,它的文件格式如下: +```commandline +vicuna_weights +├── config.json +├── generation_config.json +├── pytorch_model.bin.index.json +├── pytorch_model-00001-of-00003.bin +... 
+``` +将该权重文件的路径添加到配置文件minigpt4/configs/models/minigpt4.yaml的第16行 +* minigpt4预训练权重下载 + +[13B的checkpoint](https://drive.google.com/file/d/1a4zLvaiDBr-36pasffmgpvH5P7CKmpze/view?usp=share_link) +[7B的checkpoint](https://drive.google.com/file/d/1RY9jV0dyqLX-o38LrumkKRh6Jtaop58R/view?usp=sharing) + +将下好的权重路径加到配置文件eval_configs/minigpt4_eval.yaml的第11行 + +**2.运行demo.py** +```commandline +python demo.py --cfg-path eval_configs/minigpt4_eval.yaml --gpu-id 0 +``` diff --git a/README_official.md b/README_official.md new file mode 100644 index 0000000..7aa29f2 --- /dev/null +++ b/README_official.md @@ -0,0 +1,170 @@ +# MiniGPT-4: Enhancing Vision-language Understanding with Advanced Large Language Models +[Deyao Zhu](https://tsutikgiau.github.io/)* (On Job Market!), [Jun Chen](https://junchen14.github.io/)* (On Job Market!), [Xiaoqian Shen](https://xiaoqian-shen.github.io), [Xiang Li](https://xiangli.ac.cn), and [Mohamed Elhoseiny](https://www.mohamed-elhoseiny.com/). *Equal Contribution + +**King Abdullah University of Science and Technology** + + [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1OK4kYsZphwt5DXchKkzMBjYF6jnkqh4R?usp=sharing) [![YouTube](https://badges.aleen42.com/src/youtube.svg)](https://www.youtube.com/watch?v=__tftoxpBAw&feature=youtu.be) + + +## News +We now provide a pretrained MiniGPT-4 aligned with Vicuna-7B! The demo GPU memory consumption now can be as low as 12GB. + + +## Online Demo + +Click the image to chat with MiniGPT-4 around your images +[![demo](figs/online_demo.png)](https://minigpt-4.github.io) + + +## Examples + | | | +:-------------------------:|:-------------------------: +![find wild](figs/examples/wop_2.png) | ![write story](figs/examples/ad_2.png) +![solve problem](figs/examples/fix_1.png) | ![write Poem](figs/examples/rhyme_1.png) + +More examples can be found in the [project page](https://minigpt-4.github.io). + + + +## Introduction +- MiniGPT-4 aligns a frozen visual encoder from BLIP-2 with a frozen LLM, Vicuna, using just one projection layer. +- We train MiniGPT-4 with two stages. The first traditional pretraining stage is trained using roughly 5 million aligned image-text pairs in 10 hours using 4 A100s. After the first stage, Vicuna is able to understand the image. But the generation ability of Vicuna is heavilly impacted. +- To address this issue and improve usability, we propose a novel way to create high-quality image-text pairs by the model itself and ChatGPT together. Based on this, we then create a small (3500 pairs in total) yet high-quality dataset. +- The second finetuning stage is trained on this dataset in a conversation template to significantly improve its generation reliability and overall usability. To our surprise, this stage is computationally efficient and takes only around 7 minutes with a single A100. +- MiniGPT-4 yields many emerging vision-language capabilities similar to those demonstrated in GPT-4. + + +![overview](figs/overview.png) + + +## Getting Started +### Installation + +**1. Prepare the code and the environment** + +Git clone our repository, creating a python environment and ativate it via the following command + +```bash +git clone https://github.com/Vision-CAIR/MiniGPT-4.git +cd MiniGPT-4 +conda env create -f environment.yml +conda activate minigpt4 +``` + + +**2. Prepare the pretrained Vicuna weights** + +The current version of MiniGPT-4 is built on the v0 versoin of Vicuna-13B. 
+Please refer to our instruction [here](PrepareVicuna.md) +to prepare the Vicuna weights. +The final weights would be in a single folder in a structure similar to the following: + +``` +vicuna_weights +├── config.json +├── generation_config.json +├── pytorch_model.bin.index.json +├── pytorch_model-00001-of-00003.bin +... +``` + +Then, set the path to the vicuna weight in the model config file +[here](minigpt4/configs/models/minigpt4.yaml#L16) at Line 16. + +**3. Prepare the pretrained MiniGPT-4 checkpoint** + +Download the pretrained checkpoints according to the Vicuna model you prepare. + +| Checkpoint Aligned with Vicuna 13B | Checkpoint Aligned with Vicuna 7B | +:------------------------------------------------------------------------------------------------:|:----------------------------------------------------------------------------------------------: + [Downlad](https://drive.google.com/file/d/1a4zLvaiDBr-36pasffmgpvH5P7CKmpze/view?usp=share_link) | [Download](https://drive.google.com/file/d/1RY9jV0dyqLX-o38LrumkKRh6Jtaop58R/view?usp=sharing) + + +Then, set the path to the pretrained checkpoint in the evaluation config file +in [eval_configs/minigpt4_eval.yaml](eval_configs/minigpt4_eval.yaml#L10) at Line 11. + + + +### Launching Demo Locally + +Try out our demo [demo.py](demo.py) on your local machine by running + +``` +python demo.py --cfg-path eval_configs/minigpt4_eval.yaml --gpu-id 0 +``` + +To save GPU memory, Vicuna loads as 8 bit by default, with a beam search width of 1. +This configuration requires about 23G GPU memory for Vicuna 13B and 11.5G GPU memory for Vicuna 7B. +For more powerful GPUs, you can run the model +in 16 bit by setting low_resource to False in the config file +[minigpt4_eval.yaml](eval_configs/minigpt4_eval.yaml) and use a larger beam search width. + +Thanks [@WangRongsheng](https://github.com/WangRongsheng), you can also run our code on [Colab](https://colab.research.google.com/drive/1OK4kYsZphwt5DXchKkzMBjYF6jnkqh4R?usp=sharing) + + +### Training +The training of MiniGPT-4 contains two alignment stages. + +**1. First pretraining stage** + +In the first pretrained stage, the model is trained using image-text pairs from Laion and CC datasets +to align the vision and language model. To download and prepare the datasets, please check +our [first stage dataset preparation instruction](dataset/README_1_STAGE.md). +After the first stage, the visual features are mapped and can be understood by the language +model. +To launch the first stage training, run the following command. In our experiments, we use 4 A100. +You can change the save path in the config file +[train_configs/minigpt4_stage1_pretrain.yaml](train_configs/minigpt4_stage1_pretrain.yaml) + +```bash +torchrun --nproc-per-node NUM_GPU train.py --cfg-path train_configs/minigpt4_stage1_pretrain.yaml +``` + +A MiniGPT-4 checkpoint with only stage one training can be downloaded +[here (13B)](https://drive.google.com/file/d/1u9FRRBB3VovP1HxCAlpD9Lw4t4P6-Yq8/view?usp=share_link) or [here (7B)](https://drive.google.com/file/d/1HihQtCEXUyBM1i9DQbaK934wW3TZi-h5/view?usp=share_link). +Compared to the model after stage two, this checkpoint generate incomplete and repeated sentences frequently. + + +**2. Second finetuning stage** + +In the second stage, we use a small high quality image-text pair dataset created by ourselves +and convert it to a conversation format to further align MiniGPT-4. 
+To download and prepare our second stage dataset, please check our +[second stage dataset preparation instruction](dataset/README_2_STAGE.md). +To launch the second stage alignment, +first specify the path to the checkpoint file trained in stage 1 in +[train_configs/minigpt4_stage1_pretrain.yaml](train_configs/minigpt4_stage2_finetune.yaml). +You can also specify the output path there. +Then, run the following command. In our experiments, we use 1 A100. + +```bash +torchrun --nproc-per-node NUM_GPU train.py --cfg-path train_configs/minigpt4_stage2_finetune.yaml +``` + +After the second stage alignment, MiniGPT-4 is able to talk about the image coherently and user-friendly. + + + + +## Acknowledgement + ++ [BLIP2](https://huggingface.co/docs/transformers/main/model_doc/blip-2) The model architecture of MiniGPT-4 follows BLIP-2. Don't forget to check this great open-source work if you don't know it before! ++ [Lavis](https://github.com/salesforce/LAVIS) This repository is built upon Lavis! ++ [Vicuna](https://github.com/lm-sys/FastChat) The fantastic language ability of Vicuna with only 13B parameters is just amazing. And it is open-source! + + +If you're using MiniGPT-4 in your research or applications, please cite using this BibTeX: +```bibtex +@article{zhu2023minigpt, + title={MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large Language Models}, + author={Zhu, Deyao and Chen, Jun and Shen, Xiaoqian and Li, Xiang and Elhoseiny, Mohamed}, + journal={arXiv preprint arXiv:2304.10592}, + year={2023} +} +``` + + +## License +This repository is under [BSD 3-Clause License](LICENSE.md). +Many codes are based on [Lavis](https://github.com/salesforce/LAVIS) with +BSD 3-Clause License [here](LICENSE_Lavis.md). diff --git a/demo.py b/demo.py index b3659f1..2c5927d 100644 --- a/demo.py +++ b/demo.py @@ -28,8 +28,8 @@ def parse_args(): "--options", nargs="+", help="override some settings in the used config, the key-value pair " - "in xxx=yyy format will be merged into config file (deprecate), " - "change to --cfg-options instead.", + "in xxx=yyy format will be merged into config file (deprecate), " + "change to --cfg-options instead.", ) args = parser.parse_args() return args @@ -64,6 +64,7 @@ vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config chat = Chat(model, vis_processor, device='cuda:{}'.format(args.gpu_id)) print('Initialization Finished') + # ======================================== # Gradio Setting # ======================================== @@ -73,7 +74,10 @@ def gradio_reset(chat_state, img_list): chat_state.messages = [] if img_list is not None: img_list = [] - return None, gr.update(value=None, interactive=True), gr.update(placeholder='Please upload your image first', interactive=False),gr.update(value="Upload & Start Chat", interactive=True), chat_state, img_list + return None, gr.update(value=None, interactive=True), gr.update(placeholder="chat with me", + interactive=True), gr.update( + value="Upload & Start Chat", interactive=True), chat_state, img_list + def upload_img(gr_img, text_input, chat_state): if gr_img is None: @@ -81,11 +85,16 @@ def upload_img(gr_img, text_input, chat_state): chat_state = CONV_VISION.copy() img_list = [] llm_message = chat.upload_img(gr_img, chat_state, img_list) - return gr.update(interactive=False), gr.update(interactive=True, placeholder='Type and press Enter'), gr.update(value="Start Chatting", interactive=False), chat_state, img_list + return gr.update(interactive=True), gr.update(interactive=True, 
placeholder='Type and press Enter'), gr.update( + value="Upload img", interactive=True), chat_state, img_list + def gradio_ask(user_message, chatbot, chat_state): if len(user_message) == 0: return gr.update(interactive=True, placeholder='Input should not be empty!'), chatbot, chat_state + # chat_state = CONV_VISION.copy() + if chat_state == None: + chat_state = CONV_VISION.copy() chat.ask(user_message, chat_state) chatbot = chatbot + [[user_message, None]] return '', chatbot, chat_state @@ -101,12 +110,13 @@ def gradio_answer(chatbot, chat_state, img_list, num_beams, temperature): chatbot[-1][1] = llm_message return chatbot, chat_state, img_list + title = """

Demo of MiniGPT-4

""" description = """

This is the demo of MiniGPT-4. Upload your images and start chatting!

""" article = """

""" -#TODO show examples below +# TODO show examples below with gr.Blocks() as demo: gr.Markdown(title) @@ -118,7 +128,7 @@ with gr.Blocks() as demo: image = gr.Image(type="pil") upload_button = gr.Button(value="Upload & Start Chat", interactive=True, variant="primary") clear = gr.Button("Restart") - + num_beams = gr.Slider( minimum=1, maximum=10, @@ -127,7 +137,7 @@ with gr.Blocks() as demo: interactive=True, label="beam search numbers)", ) - + temperature = gr.Slider( minimum=0.1, maximum=2.0, @@ -141,13 +151,15 @@ with gr.Blocks() as demo: chat_state = gr.State() img_list = gr.State() chatbot = gr.Chatbot(label='MiniGPT-4') - text_input = gr.Textbox(label='User', placeholder='Please upload your image first', interactive=False) - - upload_button.click(upload_img, [image, text_input, chat_state], [image, text_input, upload_button, chat_state, img_list]) - + text_input = gr.Textbox(label='User', placeholder='chat with me', interactive=True) + + upload_button.click(upload_img, [image, text_input, chat_state], + [image, text_input, upload_button, chat_state, img_list]) + # print(img_list) text_input.submit(gradio_ask, [text_input, chatbot, chat_state], [text_input, chatbot, chat_state]).then( gradio_answer, [chatbot, chat_state, img_list, num_beams, temperature], [chatbot, chat_state, img_list] ) - clear.click(gradio_reset, [chat_state, img_list], [chatbot, image, text_input, upload_button, chat_state, img_list], queue=False) + clear.click(gradio_reset, [chat_state, img_list], [chatbot, image, text_input, upload_button, chat_state, img_list], + queue=False) -demo.launch(share=True, enable_queue=True) +demo.launch(server_name="0.0.0.0", server_port=7778, share=True, enable_queue=True) diff --git a/examples/e5b0d467fa14e2aa9b77a46b828a4e0.png b/examples/e5b0d467fa14e2aa9b77a46b828a4e0.png new file mode 100644 index 0000000..a27f769 Binary files /dev/null and b/examples/e5b0d467fa14e2aa9b77a46b828a4e0.png differ diff --git a/minigpt4/conversation/conversation.py b/minigpt4/conversation/conversation.py index 676d89f..518d29d 100644 --- a/minigpt4/conversation/conversation.py +++ b/minigpt4/conversation/conversation.py @@ -117,7 +117,6 @@ CONV_VISION = Conversation( ) - class Chat: def __init__(self, model, vis_processor, device='cuda:0'): self.device = device @@ -131,6 +130,8 @@ class Chat: if len(conv.messages) > 0 and conv.messages[-1][0] == conv.roles[0] \ and conv.messages[-1][1][-6:] == '': # last message is image. conv.messages[-1][1] = ' '.join([conv.messages[-1][1], text]) + elif len(conv.messages) == 0: + conv.append_message(conv.roles[0], " " + text) else: conv.append_message(conv.roles[0], text) @@ -181,7 +182,7 @@ class Chat: if len(image.shape) == 3: image = image.unsqueeze(0) image = image.to(self.device) - + print(image.shape) image_emb, _ = self.model.encode_img(image) img_list.append(image_emb) conv.append_message(conv.roles[0], "") @@ -189,9 +190,16 @@ class Chat: # self.conv.append_message(self.conv.roles[1], msg) return msg - def get_context_emb(self, conv, img_list): + def get_context_emb(self, conv, img_list=None): + if img_list is None: + img = torch.zeros((1, 3, 224, 224)).to(self.device) + image_emb, _ = self.model.encode_img(img) + img_list = [image_emb] + prompt = conv.get_prompt() + print(prompt) prompt_segs = prompt.split('') + assert len(prompt_segs) == len(img_list) + 1, "Unmatched numbers of image placeholders and images." 
seg_tokens = [ self.model.llama_tokenizer( @@ -200,6 +208,7 @@ class Chat: for i, seg in enumerate(prompt_segs) ] seg_embs = [self.model.llama_model.model.embed_tokens(seg_t) for seg_t in seg_tokens] + mixed_embs = [emb for pair in zip(seg_embs[:-1], img_list) for emb in pair] + [seg_embs[-1]] mixed_embs = torch.cat(mixed_embs, dim=1) return mixed_embs
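For reference, the conversation.py change above is what enables image-free chat: when `img_list` is `None`, `get_context_emb` substitutes a zero-image embedding so the prompt/image interleaving still lines up. A rough usage sketch follows; it assumes `chat` and `CONV_VISION` are initialized as at the top of demo.py, and the argument names mirror the call made in `gradio_answer`:

```python
# Hypothetical text-only session with the modified Chat class.
# Assumes `chat` (Chat instance) and `CONV_VISION` were built as in demo.py.
chat_state = CONV_VISION.copy()                  # fresh conversation, no image uploaded
chat.ask("Tell me a short joke about GPUs.", chat_state)

# img_list=None triggers the zero-image fallback added in get_context_emb above.
answer = chat.answer(conv=chat_state, img_list=None,
                     num_beams=1, temperature=1.0,
                     max_new_tokens=300, max_length=2000)[0]
print(answer)
```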