mirror of https://github.com/Vision-CAIR/MiniGPT-4.git (synced 2025-04-05 02:20:47 +00:00)

Commit 53da7fa871: "update text direct conversation" (parent: 22d8888ca2)
.idea/.gitignore (vendored, new file, 3 lines)
@@ -0,0 +1,3 @@
# Default ignored files
/shelf/
/workspace.xml
.idea/MiniGPT-4.iml (new file, 12 lines)
@@ -0,0 +1,12 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$" />
    <orderEntry type="inheritedJdk" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
  <component name="PyDocumentationSettings">
    <option name="format" value="PLAIN" />
    <option name="myDocStringFormat" value="Plain" />
  </component>
</module>
.idea/inspectionProfiles/Project_Default.xml (new file, 78 lines)
@@ -0,0 +1,78 @@
<component name="InspectionProjectProfileManager">
  <profile version="1.0">
    <option name="myName" value="Project Default" />
    <inspection_tool class="PyCompatibilityInspection" enabled="true" level="WARNING" enabled_by_default="true">
      <option name="ourVersions">
        <value>
          <list size="4">
            <item index="0" class="java.lang.String" itemvalue="3.6" />
            <item index="1" class="java.lang.String" itemvalue="3.7" />
            <item index="2" class="java.lang.String" itemvalue="3.8" />
            <item index="3" class="java.lang.String" itemvalue="3.11" />
          </list>
        </value>
      </option>
    </inspection_tool>
    <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
      <option name="ignoredPackages">
        <value>
          <list size="29">
            <item index="0" class="java.lang.String" itemvalue="object_detection" />
            <item index="1" class="java.lang.String" itemvalue="tensorflow_gpu" />
            <item index="2" class="java.lang.String" itemvalue="keras_nightly" />
            <item index="3" class="java.lang.String" itemvalue="torch" />
            <item index="4" class="java.lang.String" itemvalue="torchvision" />
            <item index="5" class="java.lang.String" itemvalue="opencv_python" />
            <item index="6" class="java.lang.String" itemvalue="scipy" />
            <item index="7" class="java.lang.String" itemvalue="thop" />
            <item index="8" class="java.lang.String" itemvalue="opencv-python" />
            <item index="9" class="java.lang.String" itemvalue="PyYAML" />
            <item index="10" class="java.lang.String" itemvalue="ipython" />
            <item index="11" class="java.lang.String" itemvalue="numpy" />
            <item index="12" class="java.lang.String" itemvalue="requests" />
            <item index="13" class="java.lang.String" itemvalue="psutil" />
            <item index="14" class="java.lang.String" itemvalue="tqdm" />
            <item index="15" class="java.lang.String" itemvalue="pandas" />
            <item index="16" class="java.lang.String" itemvalue="tensorboard" />
            <item index="17" class="java.lang.String" itemvalue="seaborn" />
            <item index="18" class="java.lang.String" itemvalue="matplotlib" />
            <item index="19" class="java.lang.String" itemvalue="Pillow" />
            <item index="20" class="java.lang.String" itemvalue="shapely" />
            <item index="21" class="java.lang.String" itemvalue="motmetrics" />
            <item index="22" class="java.lang.String" itemvalue="sklearn" />
            <item index="23" class="java.lang.String" itemvalue="setuptools" />
            <item index="24" class="java.lang.String" itemvalue="lap" />
            <item index="25" class="java.lang.String" itemvalue="Cython" />
            <item index="26" class="java.lang.String" itemvalue="pycocotools" />
            <item index="27" class="java.lang.String" itemvalue="terminaltables" />
            <item index="28" class="java.lang.String" itemvalue="openpyxl" />
          </list>
        </value>
      </option>
    </inspection_tool>
    <inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
      <option name="ignoredErrors">
        <list>
          <option value="N803" />
        </list>
      </option>
    </inspection_tool>
    <inspection_tool class="PyStubPackagesAdvertiser" enabled="true" level="WARNING" enabled_by_default="true">
      <option name="ignoredPackages">
        <list>
          <option value="PyQt5-stubs==5.15.6.0" />
        </list>
      </option>
    </inspection_tool>
    <inspection_tool class="PyUnresolvedReferencesInspection" enabled="true" level="WARNING" enabled_by_default="true">
      <option name="ignoredIdentifiers">
        <list>
          <option value="torch.cuda.amp" />
          <option value="torch.utils.tensorboard" />
          <option value="请求网页.*" />
          <option value="tools.export_model.*" />
        </list>
      </option>
    </inspection_tool>
  </profile>
</component>
.idea/inspectionProfiles/profiles_settings.xml (new file, 6 lines)
@@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
  <settings>
    <option name="USE_PROJECT_PROFILE" value="false" />
    <version value="1.0" />
  </settings>
</component>
.idea/modules.xml (new file, 8 lines)
@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/MiniGPT-4.iml" filepath="$PROJECT_DIR$/.idea/MiniGPT-4.iml" />
    </modules>
  </component>
</project>
.idea/vcs.xml (new file, 6 lines)
@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="$PROJECT_DIR$" vcs="Git" />
  </component>
</project>
README.md (222 lines changed)
@@ -1,65 +1,91 @@
# MiniGPT-4: Enhancing Vision-language Understanding with Advanced Large Language Models
[Deyao Zhu](https://tsutikgiau.github.io/)* (On Job Market!), [Jun Chen](https://junchen14.github.io/)* (On Job Market!), [Xiaoqian Shen](https://xiaoqian-shen.github.io), [Xiang Li](https://xiangli.ac.cn), and [Mohamed Elhoseiny](https://www.mohamed-elhoseiny.com/). *Equal Contribution
# miniGPT4
[Chinese version](README_cn.md) <p>🚀🚀</p>
This project modifies demo.py and conversation.py to support direct text conversation, without the need to upload an image first.

**King Abdullah University of Science and Technology**
**TODO: Support uploading multiple images.**

<a href='https://minigpt-4.github.io'><img src='https://img.shields.io/badge/Project-Page-Green'></a> <a href='https://arxiv.org/abs/2304.10592'><img src='https://img.shields.io/badge/Paper-Arxiv-red'></a> <a href='https://huggingface.co/spaces/Vision-CAIR/minigpt4'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue'></a> <a href='https://huggingface.co/Vision-CAIR/MiniGPT-4'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Model-blue'></a> [](https://colab.research.google.com/drive/1OK4kYsZphwt5DXchKkzMBjYF6jnkqh4R?usp=sharing) [](https://www.youtube.com/watch?v=__tftoxpBAw&feature=youtu.be)
Example:


The following is the process for configuring the project environment. If you have already completed this step, you can skip the environment configuration and run demo.py directly.

## News
We now provide a pretrained MiniGPT-4 aligned with Vicuna-7B! The demo GPU memory consumption can now be as low as 12GB.
The [official repo](https://github.com/Vision-CAIR/MiniGPT-4)
provides two pretrained checkpoints, [13B](https://drive.google.com/file/d/1a4zLvaiDBr-36pasffmgpvH5P7CKmpze/view) and [7B](https://drive.google.com/file/d/1a4zLvaiDBr-36pasffmgpvH5P7CKmpze/view), which can be used for finetuning.

**GPU memory needed**

## Online Demo
* 13B: 23G
* 7B: 11.5G

Click the image to chat with MiniGPT-4 around your images
[](https://minigpt-4.github.io)
### Getting started
**0. Environment setup**

Make sure you have installed Docker first.

## Examples
|   |   |
:-------------------------:|:-------------------------:
 | 
 | 
Pull the [docker image](https://hub.docker.com/r/bewithmeallmylife/mini-gpt4-runtime-cuda-10.2)
```commandline
docker pull bewithmeallmylife/mini-gpt4-runtime-cuda-10.2:1.0.0
```
Create the container and expose the corresponding port so the frontend UI can be launched for local use (the demo later binds port 7778 inside the container, which this mapping exposes on host port 1118).
```commandline
nvidia-docker run -v /data:/projects -v /data2:/data2 -p 1118:7778 --shm-size 8G --name minigpt4 -d bewithmeallmylife/mini-gpt4-runtime-cuda-10.2:1.0.0 tail -f /dev/null
```
Enter the container
```commandline
docker exec -it minigpt4 bash
```
The conda environment **mini-gpt4** already exists in the image; activate it:
```commandline
conda activate mini-gpt4
```
Note: make sure torch is compatible with your CUDA version. In this image, the torch version is 1.12.1+cu10.2.
If your CUDA version is 11 or above, execute this step.

More examples can be found in the [project page](https://minigpt-4.github.io).

## Introduction
- MiniGPT-4 aligns a frozen visual encoder from BLIP-2 with a frozen LLM, Vicuna, using just one projection layer.
- We train MiniGPT-4 with two stages. The first traditional pretraining stage is trained using roughly 5 million aligned image-text pairs in 10 hours using 4 A100s. After the first stage, Vicuna is able to understand the image. But the generation ability of Vicuna is heavily impacted.
- To address this issue and improve usability, we propose a novel way to create high-quality image-text pairs by the model itself and ChatGPT together. Based on this, we then create a small (3500 pairs in total) yet high-quality dataset.
- The second finetuning stage is trained on this dataset in a conversation template to significantly improve its generation reliability and overall usability. To our surprise, this stage is computationally efficient and takes only around 7 minutes with a single A100.
- MiniGPT-4 yields many emerging vision-language capabilities similar to those demonstrated in GPT-4.



## Getting Started
### Installation

**1. Prepare the code and the environment**

Git clone our repository, create a python environment, and activate it via the following command

```bash
git clone https://github.com/Vision-CAIR/MiniGPT-4.git
cd MiniGPT-4
conda env create -f environment.yml
conda activate minigpt4
```
The inference environment required by miniGPT4 is already included in the image, with PyTorch 1.12.1+cu10.2, which does not support the sm86 architecture. If the GPU is an RTX A6000 (compute capability 8.6), reinstall a version that supports this architecture, such as torch 1.12.1+cu11.3:
```commandline
conda install pytorch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1 cudatoolkit=11.3 -c pytorch
```
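
If you are unsure whether the preinstalled torch build matches your GPU, the short check below (a sketch; run it inside the container) prints the detected compute capability and the CUDA version torch was built against:

```python
# Quick compatibility check (sketch): compare the GPU's compute capability with
# the CUDA version the installed torch build was compiled against.
import torch

print("torch:", torch.__version__, "built with CUDA", torch.version.cuda)
if torch.cuda.is_available():
    major, minor = torch.cuda.get_device_capability(0)
    print("GPU:", torch.cuda.get_device_name(0), f"compute capability sm_{major}{minor}")
    # cu10.2 builds do not ship sm_86 kernels; an RTX A6000 (sm_86) needs a cu11.x build.
    if (major, minor) >= (8, 6) and torch.version.cuda.startswith("10."):
        print("Reinstall a CUDA 11.x build, e.g. torch 1.12.1 with cudatoolkit=11.3 (see the command above).")
```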

**1. Prepare the pretrained model weights**

**2. Prepare the pretrained Vicuna weights**

The current version of MiniGPT-4 is built on the v0 version of Vicuna-13B.
Please refer to our instruction [here](PrepareVicuna.md)
to prepare the Vicuna weights.
The final weights would be in a single folder in a structure similar to the following:
A total of three pretrained model weights need to be prepared: **vicuna** (7B/14G), **llama** (7B/12.5G), and **miniGPT4** (7B).
* vicuna

Pretrained weights in two sizes, 13B and 7B, are available for download.
```commandline
git lfs install
git clone https://huggingface.co/lmsys/vicuna-13b-delta-v0 # more powerful, need at least 24G gpu memory
# or
git clone https://huggingface.co/lmsys/vicuna-7b-delta-v0 # smaller, need 12G gpu memory
```
These delta weights are not usable on their own; we still need the llama pretrained weight.
* llama

The llama weights are not officially available for download, so a third-party source is used. Choose between 7B and 13B.
```commandline
wget https://agi.gpt4.org/llama/LLaMA/tokenizer.model -O ./tokenizer.model
wget https://agi.gpt4.org/llama/LLaMA/tokenizer_checklist.chk -O ./tokenizer_checklist.chk
wget https://agi.gpt4.org/llama/LLaMA/7B/consolidated.00.pth -O ./7B/consolidated.00.pth
wget https://agi.gpt4.org/llama/LLaMA/7B/params.json -O ./7B/params.json
wget https://agi.gpt4.org/llama/LLaMA/7B/checklist.chk -O ./7B/checklist.chk
wget https://agi.gpt4.org/llama/LLaMA/13B/consolidated.00.pth -O ./13B/consolidated.00.pth
wget https://agi.gpt4.org/llama/LLaMA/13B/consolidated.01.pth -O ./13B/consolidated.01.pth
wget https://agi.gpt4.org/llama/LLaMA/13B/params.json -O ./13B/params.json
wget https://agi.gpt4.org/llama/LLaMA/13B/checklist.chk -O ./13B/checklist.chk
```
After downloading llama, we need to convert it to the Hugging Face format:
```commandline
git clone https://github.com/huggingface/transformers.git
python transformers/src/transformers/models/llama/convert_llama_weights_to_hf.py \
    --input_dir /path/to/downloaded/llama/weights --model_size 7B --output_dir /output/path/to/llama-13bOR7b-hf/
```
After both the vicuna delta and llama weights are prepared, they need to be combined to obtain the working vicuna weights.
```commandline
pip install git+https://github.com/lm-sys/FastChat.git@v0.1.10
python -m fastchat.model.apply_delta --base /path/to/llama-13bOR7b-hf/ --target /path/to/save/working/vicuna/weight/ --delta /path/to/vicuna-13bOR7b-delta-v0/
```
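
As a quick sanity check (a sketch, assuming the merge was saved to the `--target` path above), the resulting folder should load as a regular Hugging Face model. Loading only the config and tokenizer is enough to catch a broken merge without pulling the full weights into memory:

```python
# Sanity check (sketch): the merged folder produced by apply_delta should be a
# valid Hugging Face model directory. The path below is the --target path above.
from transformers import AutoConfig, AutoTokenizer

vicuna_path = "/path/to/save/working/vicuna/weight/"
config = AutoConfig.from_pretrained(vicuna_path)
tokenizer = AutoTokenizer.from_pretrained(vicuna_path, use_fast=False)
print(config.model_type, config.num_hidden_layers, len(tokenizer))
```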
Finally, you get a working weight; the weight folder looks like below:
```commandline
vicuna_weights
├── config.json
├── generation_config.json
@@ -67,104 +93,16 @@ vicuna_weights
├── pytorch_model-00001-of-00003.bin
...
```
Then, set the path to the vicuna weight in the model config file [here](minigpt4/configs/models/minigpt4.yaml) at Line 16.
* minigpt4 checkpoints

Then, set the path to the vicuna weight in the model config file
[here](minigpt4/configs/models/minigpt4.yaml#L16) at Line 16.
[13B checkpoint](https://drive.google.com/file/d/1a4zLvaiDBr-36pasffmgpvH5P7CKmpze/view?usp=share_link)
[7B checkpoint](https://drive.google.com/file/d/1RY9jV0dyqLX-o38LrumkKRh6Jtaop58R/view?usp=sharing)

**3. Prepare the pretrained MiniGPT-4 checkpoint**

Download the pretrained checkpoints according to the Vicuna model you prepare.

| Checkpoint Aligned with Vicuna 13B | Checkpoint Aligned with Vicuna 7B |
:------------------------------------------------------------------------------------------------:|:----------------------------------------------------------------------------------------------:
[Download](https://drive.google.com/file/d/1a4zLvaiDBr-36pasffmgpvH5P7CKmpze/view?usp=share_link) | [Download](https://drive.google.com/file/d/1RY9jV0dyqLX-o38LrumkKRh6Jtaop58R/view?usp=sharing)
Then, set the path to the pretrained checkpoint in the evaluation config file in [eval_configs/minigpt4_eval.yaml](eval_configs/minigpt4_eval.yaml) at Line 11.

Then, set the path to the pretrained checkpoint in the evaluation config file
in [eval_configs/minigpt4_eval.yaml](eval_configs/minigpt4_eval.yaml#L10) at Line 11.
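
Before launching the demo, it can be useful to confirm that both edited config entries point at existing paths. The snippet below is only a sketch: the key names `llama_model` and `ckpt` are assumptions about the two config files and may differ in your checkout.

```python
# Optional sanity check (sketch): verify the two edited config entries resolve
# to existing paths. Key names are assumptions and may differ in your version.
import os
import yaml

model_cfg = yaml.safe_load(open("minigpt4/configs/models/minigpt4.yaml"))
eval_cfg = yaml.safe_load(open("eval_configs/minigpt4_eval.yaml"))

llama_path = model_cfg["model"]["llama_model"]   # set at Line 16 (assumed key name)
ckpt_path = eval_cfg["model"]["ckpt"]            # set at Line 11 (assumed key name)
for p in (llama_path, ckpt_path):
    print(p, "exists" if os.path.exists(p) else "MISSING")
```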

### Launching Demo Locally

Try out our demo [demo.py](demo.py) on your local machine by running

**2. Run demo.py**
```commandline
python demo.py --cfg-path eval_configs/minigpt4_eval.yaml --gpu-id 0
```

To save GPU memory, Vicuna loads as 8 bit by default, with a beam search width of 1.
This configuration requires about 23G GPU memory for Vicuna 13B and 11.5G GPU memory for Vicuna 7B.
For more powerful GPUs, you can run the model
in 16 bit by setting low_resource to False in the config file
[minigpt4_eval.yaml](eval_configs/minigpt4_eval.yaml) and use a larger beam search width.
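
The `low_resource` switch corresponds to 8-bit loading of the language model. The sketch below illustrates the idea only; it is not the repository's actual loading code and assumes bitsandbytes is installed:

```python
# Rough sketch of the low_resource idea (not MiniGPT-4's exact loading code):
# low_resource=True loads the LLM weights in 8-bit, roughly halving GPU memory
# compared to fp16 at the cost of some speed; low_resource=False keeps fp16.
import torch
from transformers import AutoModelForCausalLM

def load_llm(vicuna_path: str, low_resource: bool):
    if low_resource:
        # 8-bit weights via bitsandbytes, sharded automatically across devices
        return AutoModelForCausalLM.from_pretrained(vicuna_path, load_in_8bit=True, device_map="auto")
    return AutoModelForCausalLM.from_pretrained(vicuna_path, torch_dtype=torch.float16).to("cuda:0")
```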

Thanks to [@WangRongsheng](https://github.com/WangRongsheng), you can also run our code on [Colab](https://colab.research.google.com/drive/1OK4kYsZphwt5DXchKkzMBjYF6jnkqh4R?usp=sharing)

### Training
The training of MiniGPT-4 contains two alignment stages.

**1. First pretraining stage**

In the first pretraining stage, the model is trained using image-text pairs from the Laion and CC datasets
to align the vision and language model. To download and prepare the datasets, please check
our [first stage dataset preparation instruction](dataset/README_1_STAGE.md).
After the first stage, the visual features are mapped and can be understood by the language
model.
To launch the first stage training, run the following command. In our experiments, we use 4 A100s.
You can change the save path in the config file
[train_configs/minigpt4_stage1_pretrain.yaml](train_configs/minigpt4_stage1_pretrain.yaml)

```bash
torchrun --nproc-per-node NUM_GPU train.py --cfg-path train_configs/minigpt4_stage1_pretrain.yaml
```

A MiniGPT-4 checkpoint with only stage one training can be downloaded
[here (13B)](https://drive.google.com/file/d/1u9FRRBB3VovP1HxCAlpD9Lw4t4P6-Yq8/view?usp=share_link) or [here (7B)](https://drive.google.com/file/d/1HihQtCEXUyBM1i9DQbaK934wW3TZi-h5/view?usp=share_link).
Compared to the model after stage two, this checkpoint generates incomplete and repeated sentences frequently.

**2. Second finetuning stage**

In the second stage, we use a small, high-quality image-text pair dataset created by ourselves
and convert it to a conversation format to further align MiniGPT-4.
To download and prepare our second stage dataset, please check our
[second stage dataset preparation instruction](dataset/README_2_STAGE.md).
To launch the second stage alignment,
first specify the path to the checkpoint file trained in stage 1 in
[train_configs/minigpt4_stage2_finetune.yaml](train_configs/minigpt4_stage2_finetune.yaml).
You can also specify the output path there.
Then, run the following command. In our experiments, we use 1 A100.

```bash
torchrun --nproc-per-node NUM_GPU train.py --cfg-path train_configs/minigpt4_stage2_finetune.yaml
```

After the second stage alignment, MiniGPT-4 is able to talk about the image coherently and in a user-friendly way.

## Acknowledgement

+ [BLIP2](https://huggingface.co/docs/transformers/main/model_doc/blip-2) The model architecture of MiniGPT-4 follows BLIP-2. Don't forget to check this great open-source work if you didn't know it before!
+ [Lavis](https://github.com/salesforce/LAVIS) This repository is built upon Lavis!
+ [Vicuna](https://github.com/lm-sys/FastChat) The fantastic language ability of Vicuna with only 13B parameters is just amazing. And it is open-source!

If you're using MiniGPT-4 in your research or applications, please cite using this BibTeX:
```bibtex
@article{zhu2023minigpt,
  title={MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large Language Models},
  author={Zhu, Deyao and Chen, Jun and Shen, Xiaoqian and Li, Xiang and Elhoseiny, Mohamed},
  journal={arXiv preprint arXiv:2304.10592},
  year={2023}
}
```

## License
This repository is under [BSD 3-Clause License](LICENSE.md).
Many codes are based on [Lavis](https://github.com/salesforce/LAVIS) with
BSD 3-Clause License [here](LICENSE_Lavis.md).

README_cn.md (new file, 100 lines)
@@ -0,0 +1,100 @@

## miniGPT4 <p>🚀🚀</p>
This project modifies demo.py and conversation.py to support direct text conversation without uploading an image first.

**TODO: support answering over multiple uploaded images**

Demo:


The following is the environment setup process for this project. If you have already set it up, skip the environment configuration and run demo.py directly.

The [official repo](https://github.com/Vision-CAIR/MiniGPT-4)
provides [13B](https://drive.google.com/file/d/1a4zLvaiDBr-36pasffmgpvH5P7CKmpze/view) and [7B](https://drive.google.com/file/d/1a4zLvaiDBr-36pasffmgpvH5P7CKmpze/view) checkpoints that can be used for finetuning.

**Required resources**

The official setup uses A100 GPUs
* 13B: 23G GPU memory
* 7B: 11.5G GPU memory

### Steps
**0. Environment setup**

Pull the existing docker [image](https://hub.docker.com/r/bewithmeallmylife/mini-gpt4-runtime-cuda-10.2)
```commandline
docker pull bewithmeallmylife/mini-gpt4-runtime-cuda-10.2:1.0.0
```
Create the container and expose the corresponding port so that the frontend UI can be launched for local use
```commandline
nvidia-docker run -v /data:/projects -v /data2:/data2 -p 1118:7778 --shm-size 8G --name minigpt4 -d bewithmeallmylife/mini-gpt4-runtime-cuda-10.2:1.0.0 tail -f /dev/null
```
Enter the container
```commandline
docker exec -it minigpt4 bash
```
Activate the conda environment **mini-gpt4**
```commandline
conda activate mini-gpt4
```
The inference environment required by miniGPT4 is already included in this image, with PyTorch 1.12.1+cu10.2, which does not support the sm86 compute capability. If the GPU is an RTX A6000 (compute capability 8.6), reinstall a version that supports it, such as torch 1.12.1+cu11.3
```commandline
conda install pytorch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1 cudatoolkit=11.3 -c pytorch
```

**1. Prepare the pretrained model weights**

A total of three pretrained model weights need to be prepared: vicuna (7B/14G), llama (7B/12.5G), and miniGPT4 (7B)
* vicuna
Pretrained weights in two sizes, 13B and 7B, are available for download
```commandline
git lfs install
git clone https://huggingface.co/lmsys/vicuna-13b-delta-v0 # more powerful, need at least 24G gpu memory
# or
git clone https://huggingface.co/lmsys/vicuna-7b-delta-v0 # smaller, need 12G gpu memory
```
These two delta weights are not directly usable; they must be combined with the llama weights.
* Download the llama weights as follows. They are not officially released, so a third-party source is used; choose 7B or 13B
```commandline
wget https://agi.gpt4.org/llama/LLaMA/tokenizer.model -O ./tokenizer.model
wget https://agi.gpt4.org/llama/LLaMA/tokenizer_checklist.chk -O ./tokenizer_checklist.chk
wget https://agi.gpt4.org/llama/LLaMA/7B/consolidated.00.pth -O ./7B/consolidated.00.pth
wget https://agi.gpt4.org/llama/LLaMA/7B/params.json -O ./7B/params.json
wget https://agi.gpt4.org/llama/LLaMA/7B/checklist.chk -O ./7B/checklist.chk
wget https://agi.gpt4.org/llama/LLaMA/13B/consolidated.00.pth -O ./13B/consolidated.00.pth
wget https://agi.gpt4.org/llama/LLaMA/13B/consolidated.01.pth -O ./13B/consolidated.01.pth
wget https://agi.gpt4.org/llama/LLaMA/13B/params.json -O ./13B/params.json
wget https://agi.gpt4.org/llama/LLaMA/13B/checklist.chk -O ./13B/checklist.chk
```
After downloading the llama weights, they still need to be converted to the Hugging Face model format
```commandline
git clone https://github.com/huggingface/transformers.git
python transformers/src/transformers/models/llama/convert_llama_weights_to_hf.py \
    --input_dir /path/to/downloaded/llama/weights --model_size 7B --output_dir /output/path/to/llama-13bOR7b-hf/
```
Once both the vicuna and llama weights are ready, they need to be combined to obtain a usable vicuna weight
```commandline
pip install git+https://github.com/lm-sys/FastChat.git@v0.1.10
python -m fastchat.model.apply_delta --base /path/to/llama-13bOR7b-hf/ --target /path/to/save/working/vicuna/weight/ --delta /path/to/vicuna-13bOR7b-delta-v0/
```
You finally obtain a usable weight; its folder layout is as follows:
```commandline
vicuna_weights
├── config.json
├── generation_config.json
├── pytorch_model.bin.index.json
├── pytorch_model-00001-of-00003.bin
...
```
Add the path of this weight folder to line 16 of the config file minigpt4/configs/models/minigpt4.yaml
* Download the minigpt4 pretrained weights

[13B checkpoint](https://drive.google.com/file/d/1a4zLvaiDBr-36pasffmgpvH5P7CKmpze/view?usp=share_link)
[7B checkpoint](https://drive.google.com/file/d/1RY9jV0dyqLX-o38LrumkKRh6Jtaop58R/view?usp=sharing)

Add the path of the downloaded checkpoint to line 11 of the config file eval_configs/minigpt4_eval.yaml

**2. Run demo.py**
```commandline
python demo.py --cfg-path eval_configs/minigpt4_eval.yaml --gpu-id 0
```
README_official.md (new file, 170 lines)
@@ -0,0 +1,170 @@
# MiniGPT-4: Enhancing Vision-language Understanding with Advanced Large Language Models
[Deyao Zhu](https://tsutikgiau.github.io/)* (On Job Market!), [Jun Chen](https://junchen14.github.io/)* (On Job Market!), [Xiaoqian Shen](https://xiaoqian-shen.github.io), [Xiang Li](https://xiangli.ac.cn), and [Mohamed Elhoseiny](https://www.mohamed-elhoseiny.com/). *Equal Contribution

**King Abdullah University of Science and Technology**

<a href='https://minigpt-4.github.io'><img src='https://img.shields.io/badge/Project-Page-Green'></a> <a href='https://arxiv.org/abs/2304.10592'><img src='https://img.shields.io/badge/Paper-Arxiv-red'></a> <a href='https://huggingface.co/spaces/Vision-CAIR/minigpt4'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue'></a> <a href='https://huggingface.co/Vision-CAIR/MiniGPT-4'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Model-blue'></a> [](https://colab.research.google.com/drive/1OK4kYsZphwt5DXchKkzMBjYF6jnkqh4R?usp=sharing) [](https://www.youtube.com/watch?v=__tftoxpBAw&feature=youtu.be)

## News
We now provide a pretrained MiniGPT-4 aligned with Vicuna-7B! The demo GPU memory consumption can now be as low as 12GB.

## Online Demo

Click the image to chat with MiniGPT-4 around your images
[](https://minigpt-4.github.io)

## Examples
|   |   |
:-------------------------:|:-------------------------:
 | 
 | 

More examples can be found in the [project page](https://minigpt-4.github.io).

## Introduction
- MiniGPT-4 aligns a frozen visual encoder from BLIP-2 with a frozen LLM, Vicuna, using just one projection layer.
- We train MiniGPT-4 with two stages. The first traditional pretraining stage is trained using roughly 5 million aligned image-text pairs in 10 hours using 4 A100s. After the first stage, Vicuna is able to understand the image. But the generation ability of Vicuna is heavily impacted.
- To address this issue and improve usability, we propose a novel way to create high-quality image-text pairs by the model itself and ChatGPT together. Based on this, we then create a small (3500 pairs in total) yet high-quality dataset.
- The second finetuning stage is trained on this dataset in a conversation template to significantly improve its generation reliability and overall usability. To our surprise, this stage is computationally efficient and takes only around 7 minutes with a single A100.
- MiniGPT-4 yields many emerging vision-language capabilities similar to those demonstrated in GPT-4.



## Getting Started
### Installation

**1. Prepare the code and the environment**

Git clone our repository, create a python environment, and activate it via the following command

```bash
git clone https://github.com/Vision-CAIR/MiniGPT-4.git
cd MiniGPT-4
conda env create -f environment.yml
conda activate minigpt4
```

**2. Prepare the pretrained Vicuna weights**

The current version of MiniGPT-4 is built on the v0 version of Vicuna-13B.
Please refer to our instruction [here](PrepareVicuna.md)
to prepare the Vicuna weights.
The final weights would be in a single folder in a structure similar to the following:

```
vicuna_weights
├── config.json
├── generation_config.json
├── pytorch_model.bin.index.json
├── pytorch_model-00001-of-00003.bin
...
```

Then, set the path to the vicuna weight in the model config file
[here](minigpt4/configs/models/minigpt4.yaml#L16) at Line 16.

**3. Prepare the pretrained MiniGPT-4 checkpoint**

Download the pretrained checkpoints according to the Vicuna model you prepare.

| Checkpoint Aligned with Vicuna 13B | Checkpoint Aligned with Vicuna 7B |
:------------------------------------------------------------------------------------------------:|:----------------------------------------------------------------------------------------------:
[Download](https://drive.google.com/file/d/1a4zLvaiDBr-36pasffmgpvH5P7CKmpze/view?usp=share_link) | [Download](https://drive.google.com/file/d/1RY9jV0dyqLX-o38LrumkKRh6Jtaop58R/view?usp=sharing)

Then, set the path to the pretrained checkpoint in the evaluation config file
in [eval_configs/minigpt4_eval.yaml](eval_configs/minigpt4_eval.yaml#L10) at Line 11.

### Launching Demo Locally

Try out our demo [demo.py](demo.py) on your local machine by running

```
python demo.py --cfg-path eval_configs/minigpt4_eval.yaml --gpu-id 0
```

To save GPU memory, Vicuna loads as 8 bit by default, with a beam search width of 1.
This configuration requires about 23G GPU memory for Vicuna 13B and 11.5G GPU memory for Vicuna 7B.
For more powerful GPUs, you can run the model
in 16 bit by setting low_resource to False in the config file
[minigpt4_eval.yaml](eval_configs/minigpt4_eval.yaml) and use a larger beam search width.

Thanks to [@WangRongsheng](https://github.com/WangRongsheng), you can also run our code on [Colab](https://colab.research.google.com/drive/1OK4kYsZphwt5DXchKkzMBjYF6jnkqh4R?usp=sharing)

### Training
The training of MiniGPT-4 contains two alignment stages.

**1. First pretraining stage**

In the first pretraining stage, the model is trained using image-text pairs from the Laion and CC datasets
to align the vision and language model. To download and prepare the datasets, please check
our [first stage dataset preparation instruction](dataset/README_1_STAGE.md).
After the first stage, the visual features are mapped and can be understood by the language
model.
To launch the first stage training, run the following command. In our experiments, we use 4 A100s.
You can change the save path in the config file
[train_configs/minigpt4_stage1_pretrain.yaml](train_configs/minigpt4_stage1_pretrain.yaml)

```bash
torchrun --nproc-per-node NUM_GPU train.py --cfg-path train_configs/minigpt4_stage1_pretrain.yaml
```

A MiniGPT-4 checkpoint with only stage one training can be downloaded
[here (13B)](https://drive.google.com/file/d/1u9FRRBB3VovP1HxCAlpD9Lw4t4P6-Yq8/view?usp=share_link) or [here (7B)](https://drive.google.com/file/d/1HihQtCEXUyBM1i9DQbaK934wW3TZi-h5/view?usp=share_link).
Compared to the model after stage two, this checkpoint generates incomplete and repeated sentences frequently.

**2. Second finetuning stage**

In the second stage, we use a small, high-quality image-text pair dataset created by ourselves
and convert it to a conversation format to further align MiniGPT-4.
To download and prepare our second stage dataset, please check our
[second stage dataset preparation instruction](dataset/README_2_STAGE.md).
To launch the second stage alignment,
first specify the path to the checkpoint file trained in stage 1 in
[train_configs/minigpt4_stage2_finetune.yaml](train_configs/minigpt4_stage2_finetune.yaml).
You can also specify the output path there.
Then, run the following command. In our experiments, we use 1 A100.

```bash
torchrun --nproc-per-node NUM_GPU train.py --cfg-path train_configs/minigpt4_stage2_finetune.yaml
```

After the second stage alignment, MiniGPT-4 is able to talk about the image coherently and in a user-friendly way.

## Acknowledgement

+ [BLIP2](https://huggingface.co/docs/transformers/main/model_doc/blip-2) The model architecture of MiniGPT-4 follows BLIP-2. Don't forget to check this great open-source work if you didn't know it before!
+ [Lavis](https://github.com/salesforce/LAVIS) This repository is built upon Lavis!
+ [Vicuna](https://github.com/lm-sys/FastChat) The fantastic language ability of Vicuna with only 13B parameters is just amazing. And it is open-source!

If you're using MiniGPT-4 in your research or applications, please cite using this BibTeX:
```bibtex
@article{zhu2023minigpt,
  title={MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large Language Models},
  author={Zhu, Deyao and Chen, Jun and Shen, Xiaoqian and Li, Xiang and Elhoseiny, Mohamed},
  journal={arXiv preprint arXiv:2304.10592},
  year={2023}
}
```

## License
This repository is under [BSD 3-Clause License](LICENSE.md).
Many codes are based on [Lavis](https://github.com/salesforce/LAVIS) with
BSD 3-Clause License [here](LICENSE_Lavis.md).
demo.py (38 lines changed)
@@ -28,8 +28,8 @@ def parse_args():
        "--options",
        nargs="+",
        help="override some settings in the used config, the key-value pair "
        "in xxx=yyy format will be merged into config file (deprecate), "
        "change to --cfg-options instead.",
        "in xxx=yyy format will be merged into config file (deprecate), "
        "change to --cfg-options instead.",
    )
    args = parser.parse_args()
    return args
@@ -64,6 +64,7 @@ vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config
chat = Chat(model, vis_processor, device='cuda:{}'.format(args.gpu_id))
print('Initialization Finished')

# ========================================
# Gradio Setting
# ========================================
@@ -73,7 +74,10 @@ def gradio_reset(chat_state, img_list):
        chat_state.messages = []
    if img_list is not None:
        img_list = []
    return None, gr.update(value=None, interactive=True), gr.update(placeholder='Please upload your image first', interactive=False), gr.update(value="Upload & Start Chat", interactive=True), chat_state, img_list
    return None, gr.update(value=None, interactive=True), gr.update(placeholder="chat with me",
                                                                    interactive=True), gr.update(
        value="Upload & Start Chat", interactive=True), chat_state, img_list


def upload_img(gr_img, text_input, chat_state):
    if gr_img is None:
@@ -81,11 +85,16 @@ def upload_img(gr_img, text_input, chat_state):
    chat_state = CONV_VISION.copy()
    img_list = []
    llm_message = chat.upload_img(gr_img, chat_state, img_list)
    return gr.update(interactive=False), gr.update(interactive=True, placeholder='Type and press Enter'), gr.update(value="Start Chatting", interactive=False), chat_state, img_list
    return gr.update(interactive=True), gr.update(interactive=True, placeholder='Type and press Enter'), gr.update(
        value="Upload img", interactive=True), chat_state, img_list


def gradio_ask(user_message, chatbot, chat_state):
    if len(user_message) == 0:
        return gr.update(interactive=True, placeholder='Input should not be empty!'), chatbot, chat_state
    # chat_state = CONV_VISION.copy()
    if chat_state == None:
        chat_state = CONV_VISION.copy()
    chat.ask(user_message, chat_state)
    chatbot = chatbot + [[user_message, None]]
    return '', chatbot, chat_state
@@ -101,12 +110,13 @@ def gradio_answer(chatbot, chat_state, img_list, num_beams, temperature):
    chatbot[-1][1] = llm_message
    return chatbot, chat_state, img_list


title = """<h1 align="center">Demo of MiniGPT-4</h1>"""
description = """<h3>This is the demo of MiniGPT-4. Upload your images and start chatting!</h3>"""
article = """<p><a href='https://minigpt-4.github.io'><img src='https://img.shields.io/badge/Project-Page-Green'></a></p><p><a href='https://github.com/Vision-CAIR/MiniGPT-4'><img src='https://img.shields.io/badge/Github-Code-blue'></a></p><p><a href='https://raw.githubusercontent.com/Vision-CAIR/MiniGPT-4/main/MiniGPT_4.pdf'><img src='https://img.shields.io/badge/Paper-PDF-red'></a></p>
"""

#TODO show examples below
# TODO show examples below

with gr.Blocks() as demo:
    gr.Markdown(title)
@@ -118,7 +128,7 @@ with gr.Blocks() as demo:
            image = gr.Image(type="pil")
            upload_button = gr.Button(value="Upload & Start Chat", interactive=True, variant="primary")
            clear = gr.Button("Restart")

            num_beams = gr.Slider(
                minimum=1,
                maximum=10,
@@ -127,7 +137,7 @@ with gr.Blocks() as demo:
                interactive=True,
                label="beam search numbers)",
            )

            temperature = gr.Slider(
                minimum=0.1,
                maximum=2.0,
@@ -141,13 +151,15 @@ with gr.Blocks() as demo:
    chat_state = gr.State()
    img_list = gr.State()
    chatbot = gr.Chatbot(label='MiniGPT-4')
    text_input = gr.Textbox(label='User', placeholder='Please upload your image first', interactive=False)

    upload_button.click(upload_img, [image, text_input, chat_state], [image, text_input, upload_button, chat_state, img_list])

    text_input = gr.Textbox(label='User', placeholder='chat with me', interactive=True)

    upload_button.click(upload_img, [image, text_input, chat_state],
                        [image, text_input, upload_button, chat_state, img_list])
    # print(img_list)
    text_input.submit(gradio_ask, [text_input, chatbot, chat_state], [text_input, chatbot, chat_state]).then(
        gradio_answer, [chatbot, chat_state, img_list, num_beams, temperature], [chatbot, chat_state, img_list]
    )
    clear.click(gradio_reset, [chat_state, img_list], [chatbot, image, text_input, upload_button, chat_state, img_list], queue=False)
    clear.click(gradio_reset, [chat_state, img_list], [chatbot, image, text_input, upload_button, chat_state, img_list],
                queue=False)

demo.launch(share=True, enable_queue=True)
demo.launch(server_name="0.0.0.0", server_port=7778, share=True, enable_queue=True)

BIN examples/e5b0d467fa14e2aa9b77a46b828a4e0.png (new binary file, not shown; Size: 63 KiB)
minigpt4/conversation/conversation.py
@@ -117,7 +117,6 @@ CONV_VISION = Conversation(
)


class Chat:
    def __init__(self, model, vis_processor, device='cuda:0'):
        self.device = device
@@ -131,6 +130,8 @@ class Chat:
        if len(conv.messages) > 0 and conv.messages[-1][0] == conv.roles[0] \
                and conv.messages[-1][1][-6:] == '</Img>':  # last message is image.
            conv.messages[-1][1] = ' '.join([conv.messages[-1][1], text])
        elif len(conv.messages) == 0:
            conv.append_message(conv.roles[0], " <Img><ImageHere></Img> " + text)
        else:
            conv.append_message(conv.roles[0], text)

@@ -181,7 +182,7 @@ class Chat:
        if len(image.shape) == 3:
            image = image.unsqueeze(0)
        image = image.to(self.device)

        print(image.shape)
        image_emb, _ = self.model.encode_img(image)
        img_list.append(image_emb)
        conv.append_message(conv.roles[0], "<Img><ImageHere></Img>")
@@ -189,9 +190,16 @@ class Chat:
        # self.conv.append_message(self.conv.roles[1], msg)
        return msg

    def get_context_emb(self, conv, img_list):
    def get_context_emb(self, conv, img_list=None):
        if img_list is None:
            img = torch.zeros((1, 3, 224, 224)).to(self.device)
            image_emb, _ = self.model.encode_img(img)
            img_list = [image_emb]

        prompt = conv.get_prompt()
        print(prompt)
        prompt_segs = prompt.split('<ImageHere>')

        assert len(prompt_segs) == len(img_list) + 1, "Unmatched numbers of image placeholders and images."
        seg_tokens = [
            self.model.llama_tokenizer(
@@ -200,6 +208,7 @@ class Chat:
            for i, seg in enumerate(prompt_segs)
        ]
        seg_embs = [self.model.llama_model.model.embed_tokens(seg_t) for seg_t in seg_tokens]

        mixed_embs = [emb for pair in zip(seg_embs[:-1], img_list) for emb in pair] + [seg_embs[-1]]
        mixed_embs = torch.cat(mixed_embs, dim=1)
        return mixed_embs

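With this fallback in place, a text-only turn works end to end. The snippet below is a minimal sketch of such a turn; it assumes `model` and `vis_processor` are built exactly as in demo.py, and that `answer()` returns the reply text as its first element.

```python
# Minimal text-only turn (sketch). `model` and `vis_processor` are assumed to be
# built as in demo.py; argument names follow the demo code and may differ slightly.
from minigpt4.conversation.conversation import Chat, CONV_VISION

chat = Chat(model, vis_processor, device='cuda:0')

chat_state = CONV_VISION.copy()            # start a conversation without uploading an image
chat.ask("Hello, who are you?", chat_state)

# With img_list=None, get_context_emb() above encodes a blank 1x3x224x224 image,
# so the <Img><ImageHere></Img> placeholder inserted by ask() still has an embedding.
llm_message = chat.answer(conv=chat_state, img_list=None, num_beams=1, temperature=1.0)[0]
print(llm_message)
```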