diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..359bb53
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,3 @@
+# Default ignored files
+/shelf/
+/workspace.xml
diff --git a/README.md b/README.md
index 7aa29f2..5525986 100644
--- a/README.md
+++ b/README.md
@@ -1,65 +1,91 @@
-# MiniGPT-4: Enhancing Vision-language Understanding with Advanced Large Language Models
-[Deyao Zhu](https://tsutikgiau.github.io/)* (On Job Market!), [Jun Chen](https://junchen14.github.io/)* (On Job Market!), [Xiaoqian Shen](https://xiaoqian-shen.github.io), [Xiang Li](https://xiangli.ac.cn), and [Mohamed Elhoseiny](https://www.mohamed-elhoseiny.com/). *Equal Contribution
+# miniGPT4
+[Chinese version](README_cn.md)
🚀🚀
+This project has modified demo.py and conversation.py to support direct text conversation without the need to upload an image first.
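+Under the hood, when no image has been uploaded the conversation falls back to a blank image embedding so the prompt format stays unchanged (see the conversation.py diff at the bottom of this PR). Below is a minimal sketch of a text-only turn; the `Chat`/`CONV_VISION` imports come from this repo, while the exact `Chat.answer` keyword arguments follow the upstream signature and are assumptions here:
+```python
+from minigpt4.conversation.conversation import Chat, CONV_VISION
+
+
+def text_only_turn(chat: Chat, question: str) -> str:
+    # Fresh conversation state with no image uploaded.
+    conv = CONV_VISION.copy()
+    # ask() prepends the image placeholder itself when the conversation is empty.
+    chat.ask(question, conv)
+    # With img_list=None, a blank image embedding is substituted internally.
+    answer, _ = chat.answer(conv, img_list=None, num_beams=1, temperature=1.0)
+    return answer
+```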
-**King Abdullah University of Science and Technology**
+**TODO: Support uploading multiple images.**
- [](https://colab.research.google.com/drive/1OK4kYsZphwt5DXchKkzMBjYF6jnkqh4R?usp=sharing) [](https://www.youtube.com/watch?v=__tftoxpBAw&feature=youtu.be)
+Example:
+
+The following describes how to configure the project environment. If you have already done this, skip the environment setup and run demo.py directly.
-## News
-We now provide a pretrained MiniGPT-4 aligned with Vicuna-7B! The demo GPU memory consumption now can be as low as 12GB.
+The [official repo](https://github.com/Vision-CAIR/MiniGPT-4)
+provides two pretrained MiniGPT-4 checkpoints, [13B](https://drive.google.com/file/d/1a4zLvaiDBr-36pasffmgpvH5P7CKmpze/view) and [7B](https://drive.google.com/file/d/1RY9jV0dyqLX-o38LrumkKRh6Jtaop58R/view), which can be used for finetuning.
+**GPU Memory needed**
-## Online Demo
+* 13B: 23G
+* 7B: 11.5G
-Click the image to chat with MiniGPT-4 around your images
-[](https://minigpt-4.github.io)
+### Getting started
+**0. Environment setup**
+Make sure Docker is installed first.
-## Examples
- | | |
-:-------------------------:|:-------------------------:
- | 
- | 
+Pull the [docker image](https://hub.docker.com/r/bewithmeallmylife/mini-gpt4-runtime-cuda-10.2):
+```commandline
+docker pull bewithmeallmylife/mini-gpt4-runtime-cuda-10.2:1.0.0
+```
+Create the container and expose the corresponding port so the frontend UI can be launched and accessed locally.
+```commandline
+nvidia-docker run -v /data:/projects -v /data2:/data2 -p 1118:7778 --shm-size 8G --name minigpt4 -d bewithmeallmylife/mini-gpt4-runtime-cuda-10.2:1.0.0 tail -f /dev/null
+```
+Enter the container:
+```commandline
+docker exec -it minigpt4 bash
+```
+The conda environment **mini-gpt4** is already set up in the image; activate it:
+```commandline
+conda activate mini-gpt4
+```
+Note: make sure torch is compatible with your CUDA version. In this image, the torch version is 1.12.1+cu10.2.
+If your CUDA version is 11 or above, perform the following step.
-More examples can be found in the [project page](https://minigpt-4.github.io).
-
-
-
-## Introduction
-- MiniGPT-4 aligns a frozen visual encoder from BLIP-2 with a frozen LLM, Vicuna, using just one projection layer.
-- We train MiniGPT-4 with two stages. The first traditional pretraining stage is trained using roughly 5 million aligned image-text pairs in 10 hours using 4 A100s. After the first stage, Vicuna is able to understand the image. But the generation ability of Vicuna is heavilly impacted.
-- To address this issue and improve usability, we propose a novel way to create high-quality image-text pairs by the model itself and ChatGPT together. Based on this, we then create a small (3500 pairs in total) yet high-quality dataset.
-- The second finetuning stage is trained on this dataset in a conversation template to significantly improve its generation reliability and overall usability. To our surprise, this stage is computationally efficient and takes only around 7 minutes with a single A100.
-- MiniGPT-4 yields many emerging vision-language capabilities similar to those demonstrated in GPT-4.
-
-
-
-
-
-## Getting Started
-### Installation
-
-**1. Prepare the code and the environment**
-
-Git clone our repository, creating a python environment and ativate it via the following command
-
-```bash
-git clone https://github.com/Vision-CAIR/MiniGPT-4.git
-cd MiniGPT-4
-conda env create -f environment.yml
-conda activate minigpt4
+The inference environment required for miniGPT4 is already included in the image, with PyTorch 1.12.1+cu10.2, which does not support the sm_86 architecture. If your GPU is an RTX A6000 (compute capability 8.6), reinstall a build that does support it, such as torch 1.12.1+cu11.3.
+```commandline
+conda install pytorch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1 cudatoolkit=11.3 -c pytorch
```
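+To confirm which build is active after reinstalling, a quick check inside the container (a sketch using only standard torch APIs) is:
+```python
+import torch
+
+print(torch.__version__)                  # e.g. 1.12.1+cu102 or 1.12.1+cu113
+print(torch.version.cuda)                 # CUDA toolkit the wheel was built against
+if torch.cuda.is_available():
+    print(torch.cuda.get_device_capability(0))  # (8, 6) for an RTX A6000 -> needs a cu11+ build
+```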
+**1. Prepare the pretrained model weights**
-**2. Prepare the pretrained Vicuna weights**
-
-The current version of MiniGPT-4 is built on the v0 versoin of Vicuna-13B.
-Please refer to our instruction [here](PrepareVicuna.md)
-to prepare the Vicuna weights.
-The final weights would be in a single folder in a structure similar to the following:
+A total of three pretrained model weights need to be prepared: **vicuna** (7B/14G), **llama** (7B/12.5G), and **miniGPT4** (7B).
+* vicuna
+Pretrained weights in two sizes, 13B and 7B, are available for download.
+```commandline
+git lfs install
+git clone https://huggingface.co/lmsys/vicuna-13b-delta-v0 # more powerful, need at least 24G gpu memory
+# or
+git clone https://huggingface.co/lmsys/vicuna-7b-delta-v0 # smaller, need 12G gpu memory
```
+These delta weights are not usable on their own; the llama pretrained weights are also required.
+* llama
+
+The llama weights are not officially released for download, so a third-party mirror is used. Choose either 7B or 13B.
+```commandline
+wget https://agi.gpt4.org/llama/LLaMA/tokenizer.model -O ./tokenizer.model
+wget https://agi.gpt4.org/llama/LLaMA/tokenizer_checklist.chk -O ./tokenizer_checklist.chk
+wget https://agi.gpt4.org/llama/LLaMA/7B/consolidated.00.pth -O ./7B/consolidated.00.pth
+wget https://agi.gpt4.org/llama/LLaMA/7B/params.json -O ./7B/params.json
+wget https://agi.gpt4.org/llama/LLaMA/7B/checklist.chk -O ./7B/checklist.chk
+wget https://agi.gpt4.org/llama/LLaMA/13B/consolidated.00.pth -O ./13B/consolidated.00.pth
+wget https://agi.gpt4.org/llama/LLaMA/13B/consolidated.01.pth -O ./13B/consolidated.01.pth
+wget https://agi.gpt4.org/llama/LLaMA/13B/params.json -O ./13B/params.json
+wget https://agi.gpt4.org/llama/LLaMA/13B/checklist.chk -O ./13B/checklist.chk
+```
+After downloading llama, convert it to the Hugging Face format:
+```commandline
+git clone https://github.com/huggingface/transformers.git
+python transformers/src/transformers/models/llama/convert_llama_weights_to_hf.py \
+ --input_dir /path/to/downloaded/llama/weights --model_size 7B --output_dir /output/path/to/llama-13bOR7b-hf/
+```
+Once both the vicuna delta and llama weights are prepared, combine them to obtain the working vicuna weights.
+```commandline
+pip install git+https://github.com/lm-sys/FastChat.git@v0.1.10
+python -m fastchat.model.apply_delta --base /path/to/llama-13bOR7b-hf/ --target /path/to/save/working/vicuna/weight/ --delta /path/to/vicuna-13bOR7b-delta-v0/
+```
+This yields a working weight; the weight folder looks like the following:
+```commandline
vicuna_weights
├── config.json
├── generation_config.json
@@ -67,104 +93,16 @@ vicuna_weights
├── pytorch_model-00001-of-00003.bin
...
```
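+Optionally, verify that the merged folder is a loadable Hugging Face checkpoint before wiring it into the configs. A small sketch, assuming a transformers version with LLaMA support (4.28+); the path below is a placeholder:
+```python
+from transformers import AutoConfig, AutoTokenizer
+
+path = "/path/to/save/working/vicuna/weight/"  # same output path as used above
+config = AutoConfig.from_pretrained(path)
+tokenizer = AutoTokenizer.from_pretrained(path, use_fast=False)
+print(config.model_type, config.num_hidden_layers)  # expect "llama"; 32 layers for 7B, 40 for 13B
+print(tokenizer("Hello from MiniGPT-4").input_ids[:8])
+```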
+Then, set the path to the vicuna weight in the model config file [here](minigpt4/configs/models/minigpt4.yaml) at Line 16.
+* minigpt4 checkpoints
-Then, set the path to the vicuna weight in the model config file
-[here](minigpt4/configs/models/minigpt4.yaml#L16) at Line 16.
+[13B checkpoint](https://drive.google.com/file/d/1a4zLvaiDBr-36pasffmgpvH5P7CKmpze/view?usp=share_link)
+[7B checkpoint](https://drive.google.com/file/d/1RY9jV0dyqLX-o38LrumkKRh6Jtaop58R/view?usp=sharing)
-**3. Prepare the pretrained MiniGPT-4 checkpoint**
-
-Download the pretrained checkpoints according to the Vicuna model you prepare.
-
-| Checkpoint Aligned with Vicuna 13B | Checkpoint Aligned with Vicuna 7B |
-:------------------------------------------------------------------------------------------------:|:----------------------------------------------------------------------------------------------:
- [Downlad](https://drive.google.com/file/d/1a4zLvaiDBr-36pasffmgpvH5P7CKmpze/view?usp=share_link) | [Download](https://drive.google.com/file/d/1RY9jV0dyqLX-o38LrumkKRh6Jtaop58R/view?usp=sharing)
+Then, set the path to the pretrained checkpoint in the evaluation config file in [eval_configs/minigpt4_eval.yaml](eval_configs/minigpt4_eval.yaml) at Line 11.
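+A quick way to catch path typos before launching is to load both YAML files and check that the configured locations exist. The key names below (`llama_model` in the model config, `ckpt` in the eval config) follow the upstream config layout and should be adjusted if they differ; PyYAML is used for simplicity:
+```python
+import os
+
+import yaml
+
+
+def find_key(node, key):
+    """Recursively look up `key` anywhere in a nested YAML mapping."""
+    if isinstance(node, dict):
+        if key in node:
+            return node[key]
+        for value in node.values():
+            found = find_key(value, key)
+            if found is not None:
+                return found
+    return None
+
+
+for cfg_path, key in [("minigpt4/configs/models/minigpt4.yaml", "llama_model"),
+                      ("eval_configs/minigpt4_eval.yaml", "ckpt")]:
+    with open(cfg_path) as f:
+        value = find_key(yaml.safe_load(f), key)
+    print(f"{cfg_path}: {key} = {value} (exists: {value is not None and os.path.exists(value)})")
+```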
-Then, set the path to the pretrained checkpoint in the evaluation config file
-in [eval_configs/minigpt4_eval.yaml](eval_configs/minigpt4_eval.yaml#L10) at Line 11.
-
-
-
-### Launching Demo Locally
-
-Try out our demo [demo.py](demo.py) on your local machine by running
-
-```
+**2. Run demo.py**
+```commandline
python demo.py --cfg-path eval_configs/minigpt4_eval.yaml --gpu-id 0
```
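+With the port mapping used when creating the container (`-p 1118:7778`) and demo.py binding to `0.0.0.0:7778` (see the demo.py diff below), the Gradio UI should be reachable from the host at `http://<host-ip>:1118`.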
-
-To save GPU memory, Vicuna loads as 8 bit by default, with a beam search width of 1.
-This configuration requires about 23G GPU memory for Vicuna 13B and 11.5G GPU memory for Vicuna 7B.
-For more powerful GPUs, you can run the model
-in 16 bit by setting low_resource to False in the config file
-[minigpt4_eval.yaml](eval_configs/minigpt4_eval.yaml) and use a larger beam search width.
-
-Thanks [@WangRongsheng](https://github.com/WangRongsheng), you can also run our code on [Colab](https://colab.research.google.com/drive/1OK4kYsZphwt5DXchKkzMBjYF6jnkqh4R?usp=sharing)
-
-
-### Training
-The training of MiniGPT-4 contains two alignment stages.
-
-**1. First pretraining stage**
-
-In the first pretrained stage, the model is trained using image-text pairs from Laion and CC datasets
-to align the vision and language model. To download and prepare the datasets, please check
-our [first stage dataset preparation instruction](dataset/README_1_STAGE.md).
-After the first stage, the visual features are mapped and can be understood by the language
-model.
-To launch the first stage training, run the following command. In our experiments, we use 4 A100.
-You can change the save path in the config file
-[train_configs/minigpt4_stage1_pretrain.yaml](train_configs/minigpt4_stage1_pretrain.yaml)
-
-```bash
-torchrun --nproc-per-node NUM_GPU train.py --cfg-path train_configs/minigpt4_stage1_pretrain.yaml
-```
-
-A MiniGPT-4 checkpoint with only stage one training can be downloaded
-[here (13B)](https://drive.google.com/file/d/1u9FRRBB3VovP1HxCAlpD9Lw4t4P6-Yq8/view?usp=share_link) or [here (7B)](https://drive.google.com/file/d/1HihQtCEXUyBM1i9DQbaK934wW3TZi-h5/view?usp=share_link).
-Compared to the model after stage two, this checkpoint generate incomplete and repeated sentences frequently.
-
-
-**2. Second finetuning stage**
-
-In the second stage, we use a small high quality image-text pair dataset created by ourselves
-and convert it to a conversation format to further align MiniGPT-4.
-To download and prepare our second stage dataset, please check our
-[second stage dataset preparation instruction](dataset/README_2_STAGE.md).
-To launch the second stage alignment,
-first specify the path to the checkpoint file trained in stage 1 in
-[train_configs/minigpt4_stage1_pretrain.yaml](train_configs/minigpt4_stage2_finetune.yaml).
-You can also specify the output path there.
-Then, run the following command. In our experiments, we use 1 A100.
-
-```bash
-torchrun --nproc-per-node NUM_GPU train.py --cfg-path train_configs/minigpt4_stage2_finetune.yaml
-```
-
-After the second stage alignment, MiniGPT-4 is able to talk about the image coherently and user-friendly.
-
-
-
-
-## Acknowledgement
-
-+ [BLIP2](https://huggingface.co/docs/transformers/main/model_doc/blip-2) The model architecture of MiniGPT-4 follows BLIP-2. Don't forget to check this great open-source work if you don't know it before!
-+ [Lavis](https://github.com/salesforce/LAVIS) This repository is built upon Lavis!
-+ [Vicuna](https://github.com/lm-sys/FastChat) The fantastic language ability of Vicuna with only 13B parameters is just amazing. And it is open-source!
-
-
-If you're using MiniGPT-4 in your research or applications, please cite using this BibTeX:
-```bibtex
-@article{zhu2023minigpt,
- title={MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large Language Models},
- author={Zhu, Deyao and Chen, Jun and Shen, Xiaoqian and Li, Xiang and Elhoseiny, Mohamed},
- journal={arXiv preprint arXiv:2304.10592},
- year={2023}
-}
-```
-
-
-## License
-This repository is under [BSD 3-Clause License](LICENSE.md).
-Many codes are based on [Lavis](https://github.com/salesforce/LAVIS) with
-BSD 3-Clause License [here](LICENSE_Lavis.md).
diff --git a/README_cn.md b/README_cn.md
new file mode 100644
index 0000000..65c9ada
--- /dev/null
+++ b/README_cn.md
@@ -0,0 +1,100 @@
+
+## miniGPT4
🚀🚀
+本项目修改了demo.py和conversation.py,能够支持直接文本对话,而无需先上传图片。
+
+**TODO: 支持多图上传回答**
+
+演示:
+
+
+以下是项目的环境配置过程,如果你已经配好了,跳过环境配置的阶段,直接运行demo.py即可
+
+[官方](https://github.com/Vision-CAIR/MiniGPT-4)
+提供参数量为[13B](https://drive.google.com/file/d/1a4zLvaiDBr-36pasffmgpvH5P7CKmpze/view)和[7B](https://drive.google.com/file/d/1RY9jV0dyqLX-o38LrumkKRh6Jtaop58R/view)的checkpoint可供微调
+
+**所需配置**
+
+官方使用A100显卡
+* 13B: 23G显存
+* 7B: 11.5G显存
+
+### 步骤
+**0.环境准备**
+
+拉取已有docker[镜像](https://hub.docker.com/r/bewithmeallmylife/mini-gpt4-runtime-cuda-10.2)
+```commandline
+docker pull bewithmeallmylife/mini-gpt4-runtime-cuda-10.2:1.0.0
+```
+构建容器, 暴露对应端口,以便启动前端ui在本地使用
+```commandline
+nvidia-docker run -v /data:/projects -v /data2:/data2 -p 1118:7778 --shm-size 8G --name minigpt4 -d bewithmeallmylife/mini-gpt4-runtime-cuda-10.2:1.0.0 tail -f /dev/null
+```
+进入容器
+```commandline
+docker exec -it minigpt4 bash
+```
+启动conda虚拟环境**mini-gpt4**
+```commandline
+conda activate mini-gpt4
+```
+该镜像中miniGPT4所需的推理环境已有,pytorch版本为1.12.1+cu10.2,并不支持sm86的算力,如果显卡型号为RTX A6000,算力为8.6,需重新安装支持该算力的版本如torch1.12.1+cu11.3
+```commandline
+conda install pytorch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1 cudatoolkit=11.3 -c pytorch
+```
+
+**1.准备预训练的模型权重**
+
+一共需要准备3个预训练的模型权重 vicuna(7B/14G),llama(7B/12.5G),和miniGPT4(7B)
+* vicuna
+可下载13B和7B两种大小的预训练权重
+```commandline
+git lfs install
+git clone https://huggingface.co/lmsys/vicuna-13b-delta-v0 # more powerful, need at least 24G gpu memory
+# or
+git clone https://huggingface.co/lmsys/vicuna-7b-delta-v0 # smaller, need 12G gpu memory
+```
+这两个权重还并非可用的,需搭配llama权重激活使用
+* 下载llama权重如下,官方并未开放下载,采用第三方下载形式, 选择7B或13B
+```commandline
+wget https://agi.gpt4.org/llama/LLaMA/tokenizer.model -O ./tokenizer.model
+wget https://agi.gpt4.org/llama/LLaMA/tokenizer_checklist.chk -O ./tokenizer_checklist.chk
+wget https://agi.gpt4.org/llama/LLaMA/7B/consolidated.00.pth -O ./7B/consolidated.00.pth
+wget https://agi.gpt4.org/llama/LLaMA/7B/params.json -O ./7B/params.json
+wget https://agi.gpt4.org/llama/LLaMA/7B/checklist.chk -O ./7B/checklist.chk
+wget https://agi.gpt4.org/llama/LLaMA/13B/consolidated.00.pth -O ./13B/consolidated.00.pth
+wget https://agi.gpt4.org/llama/LLaMA/13B/consolidated.01.pth -O ./13B/consolidated.01.pth
+wget https://agi.gpt4.org/llama/LLaMA/13B/params.json -O ./13B/params.json
+wget https://agi.gpt4.org/llama/LLaMA/13B/checklist.chk -O ./13B/checklist.chk
+```
+下载完llama权重之后,还需要转换成huggingface的模型格式
+```commandline
+git clone https://github.com/huggingface/transformers.git
+python transformers/src/transformers/models/llama/convert_llama_weights_to_hf.py \
+ --input_dir /path/to/downloaded/llama/weights --model_size 7B --output_dir /output/path/to/llama-13bOR7b-hf/
+```
+当vicuna和llama两个权重都准备好了之后,还需要将它们组合在一起得到能够使用的vicuna权重
+```commandline
+pip install git+https://github.com/lm-sys/FastChat.git@v0.1.10
+python -m fastchat.model.apply_delta --base /path/to/llama-13bOR7b-hf/ --target /path/to/save/working/vicuna/weight/ --delta /path/to/vicuna-13bOR7b-delta-v0/
+```
+最终获得一个可以使用的权重,它的文件格式如下:
+```commandline
+vicuna_weights
+├── config.json
+├── generation_config.json
+├── pytorch_model.bin.index.json
+├── pytorch_model-00001-of-00003.bin
+...
+```
+将该权重文件的路径添加到配置文件minigpt4/configs/models/minigpt4.yaml的第16行
+* minigpt4预训练权重下载
+
+[13B的checkpoint](https://drive.google.com/file/d/1a4zLvaiDBr-36pasffmgpvH5P7CKmpze/view?usp=share_link)
+[7B的checkpoint](https://drive.google.com/file/d/1RY9jV0dyqLX-o38LrumkKRh6Jtaop58R/view?usp=sharing)
+
+将下好的权重路径加到配置文件eval_configs/minigpt4_eval.yaml的第11行
+
+**2.运行demo.py**
+```commandline
+python demo.py --cfg-path eval_configs/minigpt4_eval.yaml --gpu-id 0
+```
diff --git a/README_official.md b/README_official.md
new file mode 100644
index 0000000..7aa29f2
--- /dev/null
+++ b/README_official.md
@@ -0,0 +1,170 @@
+# MiniGPT-4: Enhancing Vision-language Understanding with Advanced Large Language Models
+[Deyao Zhu](https://tsutikgiau.github.io/)* (On Job Market!), [Jun Chen](https://junchen14.github.io/)* (On Job Market!), [Xiaoqian Shen](https://xiaoqian-shen.github.io), [Xiang Li](https://xiangli.ac.cn), and [Mohamed Elhoseiny](https://www.mohamed-elhoseiny.com/). *Equal Contribution
+
+**King Abdullah University of Science and Technology**
+
+ [](https://colab.research.google.com/drive/1OK4kYsZphwt5DXchKkzMBjYF6jnkqh4R?usp=sharing) [](https://www.youtube.com/watch?v=__tftoxpBAw&feature=youtu.be)
+
+
+## News
+We now provide a pretrained MiniGPT-4 aligned with Vicuna-7B! The demo GPU memory consumption now can be as low as 12GB.
+
+
+## Online Demo
+
+Click the image to chat with MiniGPT-4 around your images
+[](https://minigpt-4.github.io)
+
+
+## Examples
+ | | |
+:-------------------------:|:-------------------------:
+ | 
+ | 
+
+More examples can be found in the [project page](https://minigpt-4.github.io).
+
+
+
+## Introduction
+- MiniGPT-4 aligns a frozen visual encoder from BLIP-2 with a frozen LLM, Vicuna, using just one projection layer.
+- We train MiniGPT-4 with two stages. The first traditional pretraining stage is trained using roughly 5 million aligned image-text pairs in 10 hours using 4 A100s. After the first stage, Vicuna is able to understand the image. But the generation ability of Vicuna is heavily impacted.
+- To address this issue and improve usability, we propose a novel way to create high-quality image-text pairs by the model itself and ChatGPT together. Based on this, we then create a small (3500 pairs in total) yet high-quality dataset.
+- The second finetuning stage is trained on this dataset in a conversation template to significantly improve its generation reliability and overall usability. To our surprise, this stage is computationally efficient and takes only around 7 minutes with a single A100.
+- MiniGPT-4 yields many emerging vision-language capabilities similar to those demonstrated in GPT-4.
+
+
+
+
+
+## Getting Started
+### Installation
+
+**1. Prepare the code and the environment**
+
+Git clone our repository, create a python environment and activate it via the following command
+
+```bash
+git clone https://github.com/Vision-CAIR/MiniGPT-4.git
+cd MiniGPT-4
+conda env create -f environment.yml
+conda activate minigpt4
+```
+
+
+**2. Prepare the pretrained Vicuna weights**
+
+The current version of MiniGPT-4 is built on the v0 version of Vicuna-13B.
+Please refer to our instruction [here](PrepareVicuna.md)
+to prepare the Vicuna weights.
+The final weights would be in a single folder in a structure similar to the following:
+
+```
+vicuna_weights
+├── config.json
+├── generation_config.json
+├── pytorch_model.bin.index.json
+├── pytorch_model-00001-of-00003.bin
+...
+```
+
+Then, set the path to the vicuna weight in the model config file
+[here](minigpt4/configs/models/minigpt4.yaml#L16) at Line 16.
+
+**3. Prepare the pretrained MiniGPT-4 checkpoint**
+
+Download the pretrained checkpoints according to the Vicuna model you prepare.
+
+| Checkpoint Aligned with Vicuna 13B | Checkpoint Aligned with Vicuna 7B |
+:------------------------------------------------------------------------------------------------:|:----------------------------------------------------------------------------------------------:
+ [Download](https://drive.google.com/file/d/1a4zLvaiDBr-36pasffmgpvH5P7CKmpze/view?usp=share_link) | [Download](https://drive.google.com/file/d/1RY9jV0dyqLX-o38LrumkKRh6Jtaop58R/view?usp=sharing)
+
+
+Then, set the path to the pretrained checkpoint in the evaluation config file
+in [eval_configs/minigpt4_eval.yaml](eval_configs/minigpt4_eval.yaml#L10) at Line 11.
+
+
+
+### Launching Demo Locally
+
+Try out our demo [demo.py](demo.py) on your local machine by running
+
+```
+python demo.py --cfg-path eval_configs/minigpt4_eval.yaml --gpu-id 0
+```
+
+To save GPU memory, Vicuna loads as 8 bit by default, with a beam search width of 1.
+This configuration requires about 23G GPU memory for Vicuna 13B and 11.5G GPU memory for Vicuna 7B.
+For more powerful GPUs, you can run the model
+in 16 bit by setting low_resource to False in the config file
+[minigpt4_eval.yaml](eval_configs/minigpt4_eval.yaml) and use a larger beam search width.
+
+Thanks [@WangRongsheng](https://github.com/WangRongsheng), you can also run our code on [Colab](https://colab.research.google.com/drive/1OK4kYsZphwt5DXchKkzMBjYF6jnkqh4R?usp=sharing)
+
+
+### Training
+The training of MiniGPT-4 contains two alignment stages.
+
+**1. First pretraining stage**
+
+In the first pretraining stage, the model is trained using image-text pairs from Laion and CC datasets
+to align the vision and language model. To download and prepare the datasets, please check
+our [first stage dataset preparation instruction](dataset/README_1_STAGE.md).
+After the first stage, the visual features are mapped and can be understood by the language
+model.
+To launch the first stage training, run the following command. In our experiments, we use 4 A100.
+You can change the save path in the config file
+[train_configs/minigpt4_stage1_pretrain.yaml](train_configs/minigpt4_stage1_pretrain.yaml)
+
+```bash
+torchrun --nproc-per-node NUM_GPU train.py --cfg-path train_configs/minigpt4_stage1_pretrain.yaml
+```
+
+A MiniGPT-4 checkpoint with only stage one training can be downloaded
+[here (13B)](https://drive.google.com/file/d/1u9FRRBB3VovP1HxCAlpD9Lw4t4P6-Yq8/view?usp=share_link) or [here (7B)](https://drive.google.com/file/d/1HihQtCEXUyBM1i9DQbaK934wW3TZi-h5/view?usp=share_link).
+Compared to the model after stage two, this checkpoint generates incomplete and repeated sentences frequently.
+
+
+**2. Second finetuning stage**
+
+In the second stage, we use a small high quality image-text pair dataset created by ourselves
+and convert it to a conversation format to further align MiniGPT-4.
+To download and prepare our second stage dataset, please check our
+[second stage dataset preparation instruction](dataset/README_2_STAGE.md).
+To launch the second stage alignment,
+first specify the path to the checkpoint file trained in stage 1 in
+[train_configs/minigpt4_stage1_pretrain.yaml](train_configs/minigpt4_stage2_finetune.yaml).
+You can also specify the output path there.
+Then, run the following command. In our experiments, we use 1 A100.
+
+```bash
+torchrun --nproc-per-node NUM_GPU train.py --cfg-path train_configs/minigpt4_stage2_finetune.yaml
+```
+
+After the second stage alignment, MiniGPT-4 is able to talk about the image coherently and in a user-friendly way.
+
+
+
+
+## Acknowledgement
+
++ [BLIP2](https://huggingface.co/docs/transformers/main/model_doc/blip-2) The model architecture of MiniGPT-4 follows BLIP-2. Don't forget to check this great open-source work if you don't know it before!
++ [Lavis](https://github.com/salesforce/LAVIS) This repository is built upon Lavis!
++ [Vicuna](https://github.com/lm-sys/FastChat) The fantastic language ability of Vicuna with only 13B parameters is just amazing. And it is open-source!
+
+
+If you're using MiniGPT-4 in your research or applications, please cite using this BibTeX:
+```bibtex
+@article{zhu2023minigpt,
+ title={MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large Language Models},
+ author={Zhu, Deyao and Chen, Jun and Shen, Xiaoqian and Li, Xiang and Elhoseiny, Mohamed},
+ journal={arXiv preprint arXiv:2304.10592},
+ year={2023}
+}
+```
+
+
+## License
+This repository is under [BSD 3-Clause License](LICENSE.md).
+Many codes are based on [Lavis](https://github.com/salesforce/LAVIS) with
+BSD 3-Clause License [here](LICENSE_Lavis.md).
diff --git a/demo.py b/demo.py
index b3659f1..2c5927d 100644
--- a/demo.py
+++ b/demo.py
@@ -28,8 +28,8 @@ def parse_args():
"--options",
nargs="+",
help="override some settings in the used config, the key-value pair "
- "in xxx=yyy format will be merged into config file (deprecate), "
- "change to --cfg-options instead.",
+ "in xxx=yyy format will be merged into config file (deprecate), "
+ "change to --cfg-options instead.",
)
args = parser.parse_args()
return args
@@ -64,6 +64,7 @@ vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config
chat = Chat(model, vis_processor, device='cuda:{}'.format(args.gpu_id))
print('Initialization Finished')
+
# ========================================
# Gradio Setting
# ========================================
@@ -73,7 +74,10 @@ def gradio_reset(chat_state, img_list):
chat_state.messages = []
if img_list is not None:
img_list = []
- return None, gr.update(value=None, interactive=True), gr.update(placeholder='Please upload your image first', interactive=False),gr.update(value="Upload & Start Chat", interactive=True), chat_state, img_list
+ return None, gr.update(value=None, interactive=True), gr.update(placeholder="chat with me",
+ interactive=True), gr.update(
+ value="Upload & Start Chat", interactive=True), chat_state, img_list
+
def upload_img(gr_img, text_input, chat_state):
if gr_img is None:
@@ -81,11 +85,16 @@ def upload_img(gr_img, text_input, chat_state):
chat_state = CONV_VISION.copy()
img_list = []
llm_message = chat.upload_img(gr_img, chat_state, img_list)
- return gr.update(interactive=False), gr.update(interactive=True, placeholder='Type and press Enter'), gr.update(value="Start Chatting", interactive=False), chat_state, img_list
+ return gr.update(interactive=True), gr.update(interactive=True, placeholder='Type and press Enter'), gr.update(
+ value="Upload img", interactive=True), chat_state, img_list
+
def gradio_ask(user_message, chatbot, chat_state):
if len(user_message) == 0:
return gr.update(interactive=True, placeholder='Input should not be empty!'), chatbot, chat_state
+ # chat_state = CONV_VISION.copy()
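+    # Text-only chat: no image has been uploaded yet, so start from a fresh conversation state.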
+    if chat_state is None:
+ chat_state = CONV_VISION.copy()
chat.ask(user_message, chat_state)
chatbot = chatbot + [[user_message, None]]
return '', chatbot, chat_state
@@ -101,12 +110,13 @@ def gradio_answer(chatbot, chat_state, img_list, num_beams, temperature):
chatbot[-1][1] = llm_message
return chatbot, chat_state, img_list
+
title = """
Demo of MiniGPT-4
"""
description = """
This is the demo of MiniGPT-4. Upload your images and start chatting!
"""
article = """
"""
-#TODO show examples below
+# TODO show examples below
with gr.Blocks() as demo:
gr.Markdown(title)
@@ -118,7 +128,7 @@ with gr.Blocks() as demo:
image = gr.Image(type="pil")
upload_button = gr.Button(value="Upload & Start Chat", interactive=True, variant="primary")
clear = gr.Button("Restart")
-
+
num_beams = gr.Slider(
minimum=1,
maximum=10,
@@ -127,7 +137,7 @@ with gr.Blocks() as demo:
interactive=True,
label="beam search numbers)",
)
-
+
temperature = gr.Slider(
minimum=0.1,
maximum=2.0,
@@ -141,13 +151,15 @@ with gr.Blocks() as demo:
chat_state = gr.State()
img_list = gr.State()
chatbot = gr.Chatbot(label='MiniGPT-4')
- text_input = gr.Textbox(label='User', placeholder='Please upload your image first', interactive=False)
-
- upload_button.click(upload_img, [image, text_input, chat_state], [image, text_input, upload_button, chat_state, img_list])
-
+ text_input = gr.Textbox(label='User', placeholder='chat with me', interactive=True)
+
+ upload_button.click(upload_img, [image, text_input, chat_state],
+ [image, text_input, upload_button, chat_state, img_list])
+ # print(img_list)
text_input.submit(gradio_ask, [text_input, chatbot, chat_state], [text_input, chatbot, chat_state]).then(
gradio_answer, [chatbot, chat_state, img_list, num_beams, temperature], [chatbot, chat_state, img_list]
)
- clear.click(gradio_reset, [chat_state, img_list], [chatbot, image, text_input, upload_button, chat_state, img_list], queue=False)
+ clear.click(gradio_reset, [chat_state, img_list], [chatbot, image, text_input, upload_button, chat_state, img_list],
+ queue=False)
-demo.launch(share=True, enable_queue=True)
+demo.launch(server_name="0.0.0.0", server_port=7778, share=True, enable_queue=True)
diff --git a/examples/e5b0d467fa14e2aa9b77a46b828a4e0.png b/examples/e5b0d467fa14e2aa9b77a46b828a4e0.png
new file mode 100644
index 0000000..a27f769
Binary files /dev/null and b/examples/e5b0d467fa14e2aa9b77a46b828a4e0.png differ
diff --git a/minigpt4/conversation/conversation.py b/minigpt4/conversation/conversation.py
index 676d89f..518d29d 100644
--- a/minigpt4/conversation/conversation.py
+++ b/minigpt4/conversation/conversation.py
@@ -117,7 +117,6 @@ CONV_VISION = Conversation(
)
-
class Chat:
def __init__(self, model, vis_processor, device='cuda:0'):
self.device = device
@@ -131,6 +130,8 @@ class Chat:
if len(conv.messages) > 0 and conv.messages[-1][0] == conv.roles[0] \
                 and conv.messages[-1][1][-6:] == '</Img>':  # last message is image.
conv.messages[-1][1] = ' '.join([conv.messages[-1][1], text])
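+        # First user turn without an uploaded image: include the image placeholder so a
+        # blank image embedding can be paired with it later (text-only chat).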
+ elif len(conv.messages) == 0:
+            conv.append_message(conv.roles[0], "<Img><ImageHere></Img> " + text)
else:
conv.append_message(conv.roles[0], text)
@@ -181,7 +182,7 @@ class Chat:
if len(image.shape) == 3:
image = image.unsqueeze(0)
image = image.to(self.device)
-
+ print(image.shape)
image_emb, _ = self.model.encode_img(image)
img_list.append(image_emb)
         conv.append_message(conv.roles[0], "<Img><ImageHere></Img>")
@@ -189,9 +190,16 @@ class Chat:
# self.conv.append_message(self.conv.roles[1], msg)
return msg
- def get_context_emb(self, conv, img_list):
+ def get_context_emb(self, conv, img_list=None):
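+        # Text-only chat: no image was uploaded, so encode a blank 224x224 image to
+        # pair with the prompt's <ImageHere> placeholder.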
+ if img_list is None:
+ img = torch.zeros((1, 3, 224, 224)).to(self.device)
+ image_emb, _ = self.model.encode_img(img)
+ img_list = [image_emb]
+
prompt = conv.get_prompt()
+ print(prompt)
         prompt_segs = prompt.split('<ImageHere>')
+
assert len(prompt_segs) == len(img_list) + 1, "Unmatched numbers of image placeholders and images."
seg_tokens = [
self.model.llama_tokenizer(
@@ -200,6 +208,7 @@ class Chat:
for i, seg in enumerate(prompt_segs)
]
seg_embs = [self.model.llama_model.model.embed_tokens(seg_t) for seg_t in seg_tokens]
+
mixed_embs = [emb for pair in zip(seg_embs[:-1], img_list) for emb in pair] + [seg_embs[-1]]
mixed_embs = torch.cat(mixed_embs, dim=1)
return mixed_embs