update dataset readme

This commit is contained in:
junchen14 2023-10-25 07:52:44 +03:00
parent f976c7800f
commit 89878d661e
28 changed files with 114 additions and 185 deletions

2
.gitignore vendored
View File

@ -179,3 +179,5 @@ jobs/
*.slurm
slurm*
sbatch_generate*
eval_data/
dataset/Evaluation.md

22
MiniGPTv2_Train .md Normal file
View File

@ -0,0 +1,22 @@
## Finetuning of MiniGPT-4
The training of MiniGPT-4 consists of two alignment stages.
**1. First pretraining stage**
In the first pretraining stage, the model is trained on image-text pairs from the Laion and CC datasets
to align the vision and language models. To download and prepare the datasets, please check
our [first stage dataset preparation instruction](dataset/README_1_STAGE.md).
After the first stage, the visual features are mapped so that they can be understood by the language
model.
To launch the first stage training, run the following command. In our experiments, we use 4 A100 GPUs.
You can change the save path in the config file
[train_configs/minigpt4_stage1_pretrain.yaml](train_configs/minigpt4_stage1_pretrain.yaml).
```bash
torchrun --nproc-per-node NUM_GPU train.py --cfg-path train_configs/minigpt4_stage1_pretrain.yaml
```
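For example, on the single node with the 4 A100 GPUs mentioned above, the launch command becomes:
```bash
# Concrete example: stage-one pretraining on one node with 4 GPUs
torchrun --nproc-per-node 4 train.py --cfg-path train_configs/minigpt4_stage1_pretrain.yaml
```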
A MiniGPT-4 checkpoint with only stage one training can be downloaded
[here (13B)](https://drive.google.com/file/d/1u9FRRBB3VovP1HxCAlpD9Lw4t4P6-Yq8/view?usp=share_link) or [here (7B)](https://drive.google.com/file/d/1HihQtCEXUyBM1i9DQbaK934wW3TZi-h5/view?usp=share_link).
Compared to the model after stage two, this checkpoint frequently generates incomplete and repeated sentences.

View File

@ -93,9 +93,10 @@ Then, set the variable *llama_model* in the model config file to the LLM weight
Download the pretrained model checkpoints
| MiniGPT-v2 (LLaMA-2 Chat 7B) |
|------------------------------|
| [Download](https://drive.google.com/file/d/1aVbfW7nkCSYx99_vCRyP1sOlQiWVSnAl/view?usp=sharing) |
| MiniGPT-v2 (developing model (online demo)) | MiniGPT-v2 (after stage-2) | MiniGPT-v2 (after stage-3) |
|------------------------------|------------------------------|------------------------------|
| [Download](https://drive.google.com/file/d/1aVbfW7nkCSYx99_vCRyP1sOlQiWVSnAl/view?usp=sharing) |[Download](https://drive.google.com/file/d/1Vi_E7ZtZXRAQcyz4f8E6LtLh2UXABCmu/view?usp=sharing) |[Download](https://drive.google.com/file/d/1jAbxUiyl04SFJMN4sF1vvUU69Etuz4qa/view?usp=sharing) |
For **MiniGPT-v2**, set the path to the pretrained checkpoint in the evaluation config file
in [eval_configs/minigptv2_eval.yaml](eval_configs/minigptv2_eval.yaml#L10).
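If you prefer to set the path from the shell, a minimal sketch is shown below; it assumes the checkpoint field in that config file is named `ckpt`, and the checkpoint path shown is a placeholder for your downloaded file:
```bash
# Point the evaluation config at the downloaded MiniGPT-v2 checkpoint
# (assumes the field is named "ckpt"; replace the path with your actual checkpoint)
sed -i 's#ckpt: .*#ckpt: "/path/to/minigptv2_checkpoint.pth"#' eval_configs/minigptv2_eval.yaml
```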
@ -146,6 +147,7 @@ Thanks [@WangRongsheng](https://github.com/WangRongsheng), you can also run Mini
### Training
For training details of MiniGPT-4, check [here](MiniGPT4_Train.md).
For finetuning details of MiniGPT-v2, check [here](MiniGPTv2_Train.md).

View File

@ -26,11 +26,22 @@ LLaVA | <a href="https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/
### COCO captions
Download the COCO 2014 images and captions.
COCO 2014 images path:
```
${MINIGPTv2_DATASET}
├── coco
│ ├── images
│ ...
...
```
COCO caption annotation path:
```
${MINIGPTv2_DATASET}
├── coco_captions
│ ├── coco_images
│ └── annotations
│ ├── coco_karpathy_train.json
│   ...
```
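A minimal sketch of preparing this layout from the shell is shown below. The COCO download URL and the value of `${MINIGPTv2_DATASET}` are assumptions about your environment; the caption JSON URL is the same one used by the coco_caption dataset config further down.
```bash
# Sketch: lay out the COCO caption data as shown above (download locations are assumptions)
export MINIGPTv2_DATASET=/path/to/minigptv2_dataset
mkdir -p ${MINIGPTv2_DATASET}/coco ${MINIGPTv2_DATASET}/coco_captions/annotations

# COCO 2014 train images from the official COCO server
wget http://images.cocodataset.org/zips/train2014.zip
unzip train2014.zip -d ${MINIGPTv2_DATASET}/coco/images

# Karpathy-split caption annotations (same URL as in the coco_caption config)
wget -P ${MINIGPTv2_DATASET}/coco_captions/annotations \
    https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json
```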

View File

@ -10,20 +10,11 @@ datasets:
build_info:
# Be careful not to append minus sign (-) before split to avoid itemizing
# annotations:
# train:
# url:
# - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_train.json
# storage:
# - /path/to/aokvqa/annotations/aokvqa_v1p0_train.json
# images:
# storage: /path/to/coco/images/
annotations:
train:
url:
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_train.json
storage:
- /ibex/project/c2090/minigptv2_dataset/aokvqa/aokvqa_v1p0_train.json
- /path/to/aokvqa_v1p0_train.json
images:
storage: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/
storage: /path/to/coco/images

View File

@ -9,30 +9,13 @@ datasets:
# data_dir: ${env.data_dir}/datasets
data_type: images # [images|videos|features]
# build_info:
# # Be careful not to append minus sign (-) before split to avoid itemizing
# annotations:
# train:
# url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json
# md5: aa31ac474cf6250ebb81d18348a07ed8
# storage: /path/to/coco_caption/annotations/coco_karpathy_train.json
# images:
# storage: /path/to/coco/images/
build_info:
# Be careful not to append minus sign (-) before split to avoid itemizing
annotations:
train:
url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json
md5: aa31ac474cf6250ebb81d18348a07ed8
storage: /ibex/project/c2090/minigptv2_dataset/coco_captions/coco_karpathy_train.json
# val:
# url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json
# md5: b273847456ef5580e33713b1f7de52a0
# storage: /ibex/project/c2133/minigpt4_v2_dataset/coco_caption/annotations/coco_karpathy_val.json
# test:
# url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json
# md5: 3ff34b0ef2db02d01c37399f6a2a6cd1
# storage: /ibex/project/c2133/minigpt4_v2_dataset/coco_caption/annotations/coco_karpathy_test.json
storage: /path/to/coco_caption/coco_karpathy_train.json
images:
storage: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg
storage: /path/to/coco/images
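Once the placeholders in a config like this are filled in, a quick shell check can catch path mistakes before training starts; a minimal sketch using the placeholder paths above:
```bash
# Sanity-check the paths configured for the COCO caption dataset
# (replace the placeholders with the values you actually put in the config)
test -f /path/to/coco_caption/coco_karpathy_train.json && echo "caption annotations found"
test -d /path/to/coco/images && echo "COCO images found"
```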

View File

@ -10,24 +10,15 @@ datasets:
build_info:
# annotations:
# train:
# url:
# - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_train.json
# - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_val.json
# storage:
# - /path/to/vqav2/annotations/vqa_train.json
# - /path/to/vqav2/coco/annotations/vqa_val.json
# images:
# storage: /path/to/coco/images/
annotations:
train:
url:
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_train.json
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_val.json
storage:
- /ibex/project/c2090/minigptv2_dataset/vqav2/vqa_train.json
- /ibex/project/c2090/minigptv2_dataset/vqav2/vqa_val.json
- /path/to/vqav2/vqa_train.json
- /path/to/vqav2/vqa_val.json
images:
storage: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg
storage: /path/to/coco/images

View File

@ -2,7 +2,7 @@ datasets:
invrefcoco:
data_type: images
build_info:
image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
ann_path: /ibex/project/c2090/minigptv2_dataset/refcoco_annotations
image_path: /path/to/coco/images
ann_path: /path/to/refcoco_annotations
dataset: invrefcoco
splitBy: unc

View File

@ -2,7 +2,7 @@ datasets:
invrefcocog:
data_type: images
build_info:
image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
ann_path: /ibex/project/c2090/minigptv2_dataset/refcoco_annotations
image_path: /path/to/coco/images
ann_path: /path/to/refcoco_annotations
dataset: invrefcocog
splitBy: umd

View File

@ -2,7 +2,7 @@ datasets:
invrefcocop:
data_type: images
build_info:
image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
ann_path: /ibex/project/c2090/minigptv2_dataset/refcoco_annotations
image_path: /path/to/coco/images
ann_path: /path/to/refcoco_annotations
dataset: invrefcoco+
splitBy: unc

View File

@ -2,7 +2,7 @@ datasets:
refcoco:
data_type: images
build_info:
image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
ann_path: /ibex/project/c2090/minigptv2_dataset/refcoco_annotations
image_path: /path/to/coco/images
ann_path: /path/to/refcoco_annotations
dataset: refcoco
splitBy: unc

View File

@ -2,7 +2,7 @@ datasets:
refcocog:
data_type: images
build_info:
image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
ann_path: /ibex/project/c2090/minigptv2_dataset/refcoco_annotations
image_path: /path/to/coco/images
ann_path: /path/to/refcoco_annotations
dataset: refcocog
splitBy: umd

View File

@ -2,7 +2,7 @@ datasets:
refcocop:
data_type: images
build_info:
image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
ann_path: /ibex/project/c2090/minigptv2_dataset/refcoco_annotations
image_path: /path/to/coco/images
ann_path: /path/to/refcoco_annotations
dataset: refcoco+
splitBy: unc

View File

@ -2,5 +2,5 @@ datasets:
flickr_CaptionToPhrase:
data_type: images
build_info:
image_path: /ibex/project/c2133/minigpt4_v2_dataset/flickr/images
ann_path: /ibex/project/c2090/minigptv2_dataset/filtered_flickr/captiontobbox.json
image_path: /path/to/filtered_flickr/images
ann_path: /path/to/filtered_flickr/captiontobbox.json

View File

@ -2,5 +2,5 @@ datasets:
flickr_grounded_caption:
data_type: images
build_info:
image_path: /ibex/project/c2133/minigpt4_v2_dataset/flickr/images
ann_path: /ibex/project/c2090/minigptv2_dataset/filtered_flickr/groundedcaption.json
image_path: /path/to/filtered_flickr/images
ann_path: /path/to/filtered_flickr/groundedcaption.json

View File

@ -2,5 +2,5 @@ datasets:
flickr_ObjectToPhrase:
data_type: images
build_info:
image_path: /ibex/project/c2133/minigpt4_v2_dataset/flickr/images
ann_path: /ibex/project/c2090/minigptv2_dataset/filtered_flickr/phrasetobbox.json
image_path: /path/to/filtered_flickr/images
ann_path: /path/to/filtered_flickr/phrasetobbox.json

View File

@ -8,19 +8,6 @@ datasets:
# data_dir: ${env.data_dir}/datasets
data_type: images # [images|videos|features]
# build_info:
# # Be careful not to append minus sign (-) before split to avoid itemizing
# annotations:
# train:
# url:
# - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json
# storage:
# - /path/to/gqa/annotations/train_balanced_questions.json
# images:
# storage: /path/to/gqa/images/
build_info:
# Be careful not to append minus sign (-) before split to avoid itemizing
annotations:
@ -28,6 +15,7 @@ datasets:
url:
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json
storage:
- /ibex/project/c2090/minigptv2_dataset/gqa/train_balanced_questions.json
- /path/to/gqa/train_balanced_questions.json
images:
storage: /ibex/project/c2090/minigptv2_dataset/gqa/images
storage: /path/to/gqa/images

View File

@ -1,12 +1,7 @@
datasets:
# llava_conversation:
# data_type: images
# build_info:
# image_path: /path/to/coco/images
# ann_path: /path/to/llava/conversation_58k.json
llava_conversation:
data_type: images
build_info:
image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
ann_path: /ibex/project/c2090/minigptv2_dataset/llava/conversation_58k.json
image_path: /path/to/coco/images
ann_path: /path/to/llava/conversation_58k.json

View File

@ -1,12 +1,6 @@
datasets:
# llava_detail:
# data_type: images
# build_info:
# image_path: /path/to/coco/images
# ann_path: /path/to/llava/detail_23k.json
llava_detail:
data_type: images
build_info:
image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
ann_path: /ibex/project/c2090/minigptv2_dataset/llava/detail_23k.json
image_path: /path/to/coco/images
ann_path: /path/to/llava/detail_23k.json

View File

@ -1,12 +1,7 @@
datasets:
# llava_reason:
# data_type: images
# build_info:
# image_path: /path/to/coco/images
# ann_path: /path/to/llava/complex_reasoning_77k.json
llava_reason:
data_type: images
build_info:
image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
ann_path: /ibex/project/c2090/minigptv2_dataset/llava/complex_reasoning_77k.json
image_path: /path/to/coco/images
ann_path: /path/to/llava/complex_reasoning_77k.json

View File

@ -1,14 +1,7 @@
datasets:
# multitask_conversation:
# data_type: images
# build_info:
# image_path: /path/to/coco/images
# ann_path: /path/to/multitask_conversation/multi_task_conversation.json
multitask_conversation:
data_type: images
build_info:
image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
ann_path: /ibex/project/c2090/minigptv2_dataset/multitask_conversation/multi_task_conversation.json
image_path: /path/to/coco/images
ann_path: /path/to/multitask_conversation/multi_task_conversation.json

View File

@ -1,10 +1,5 @@
datasets:
# unnatural_instruction:
# data_type: text
# build_info:
# ann_path: /path/to/unnatural-instructions/data/unnatural_instruction_filer.json
unnatural_instruction:
data_type: text
build_info:
ann_path: /ibex/project/c2090/minigptv2_dataset/unnatural_instructions/unnatural_instruction_filer.json
ann_path: /path/to/unnatural_instructions/filtered_unnatural_instruction.json

View File

@ -1,12 +1,6 @@
datasets:
# ocrvqa:
# data_type: images
# build_info:
# image_path: /ibex/project/c2133/minigpt4_v2_dataset/ocrvqa/images
# ann_path: /ibex/project/c2133/minigpt4_v2_dataset/ocrvqa/dataset.json
ocrvqa:
data_type: images
build_info:
image_path: /ibex/project/c2090/minigptv2_dataset/ocrvqa/images
ann_path: /ibex/project/c2090/minigptv2_dataset/ocrvqa/dataset.json
image_path: /path/to/ocrvqa/images
ann_path: /path/to/ocrvqa/dataset.json

View File

@ -8,19 +8,6 @@ datasets:
# data_dir: ${env.data_dir}/datasets
data_type: images # [images|videos|features]
# build_info:
# # Be careful not to append minus sign (-) before split to avoid itemizing
# annotations:
# train:
# url:
# # TODO make this order insensitive
# - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_train.json
# storage:
# - /path/to/okvqa/annotations/okvqa_train.json
# images:
# storage: /path/to/okvqa/images
build_info:
# Be careful not to append minus sign (-) before split to avoid itemizing
annotations:
@ -28,9 +15,7 @@ datasets:
url:
# TODO make this order insensitive
- https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_train.json
# - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_train2014_questions.json
# - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_train2014_annotations.json
storage:
- /ibex/project/c2090/minigptv2_dataset/okvqa/okvqa_train.json
- /path/to/okvqa/okvqa_train.json
images:
storage: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg
storage: /path/to/coco/images

View File

@ -1,16 +1,9 @@
datasets:
# textcaps_caption:
# data_type: images
# build_info:
# image_path: /path/to/TextCaps/train_images
# ann_path: /path/to/TextCaps/TextCaps_0.1_train.json
textcaps_caption:
data_type: images
build_info:
image_path: /ibex/project/c2090/minigptv2_dataset/textcaps/train_images
ann_path: /ibex/project/c2090/minigptv2_dataset/textcaps/TextCaps_0.1_train.json
image_path: /path/to/textcaps/train_images
ann_path: /path/to/textcaps/TextCaps_0.1_train.json

View File

@ -1,10 +1,5 @@
datasets:
# refvg:
# data_type: images
# build_info:
# data_dir: /path/to/visual_genome
refvg:
data_type: images
build_info:
data_dir: /ibex/project/c2090/minigptv2_dataset/visual_genome
data_dir: /path/to/visual_genome
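All of the dataset configs above now ship with `/path/to` placeholders, so before training it helps to list any that have not been updated yet; a minimal sketch, assuming the configs live under `minigpt4/configs/datasets` in the checkout:
```bash
# List dataset config files that still contain unreplaced /path/to placeholders
# (minigpt4/configs/datasets is an assumed location in the repository)
grep -rl "/path/to" minigpt4/configs/datasets
```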

View File

@ -101,6 +101,9 @@ class CaptionToObjectDataset(Dataset):
instruction = "<Img><ImageHere></Img> {} ".format(instruction)
print("CaptionToObject instruction", instruction)
print("CaptionToObject answer", answer)
return {
"image": image,
"instruction_input": instruction,
@ -145,6 +148,9 @@ class PhraseToObjectDataset(Dataset):
instruction = "<Img><ImageHere></Img> {} ".format(instruction)
print("PhraseToObject instruction", instruction)
print("PhraseToObject answer", answer)
return {
"image": image,
"instruction_input": instruction,

View File

@ -1,22 +1,17 @@
model:
arch: minigpt_v2
model_type: pretrain
freeze_vit: True
freeze_qformer: True
max_txt_len: 1024
low_resource: False
image_size: 448
end_sym: "</s>"
llama_model: "/ibex/project/c2133/llama_v2/llama-2-7b-chat-pytorch_update"
ckpt: "/ibex/project/c2090/minigpt4_ckpt/448_perforamnce_correct_v10_vg/20230925064/checkpoint_32.pth"
llama_model: "/path/to/llama_checkpoint"
ckpt: "/path/to/pretrained_checkpoint"
use_grad_checkpoint: True
chat_template: True
lora_r: 64
lora_alpha: 16
datasets:
multitask_conversation:
batch_size: 2
vis_processor:
@ -26,9 +21,9 @@ datasets:
text_processor:
train:
name: "blip_caption"
sample_ratio: 40
sample_ratio: 50
llava_conversation: # 77k
llava_conversation:
batch_size: 2
vis_processor:
train:
@ -37,8 +32,7 @@ datasets:
text_processor:
train:
name: "blip_caption"
sample_ratio: 10
sample_ratio: 30
unnatural_instruction:
batch_size: 1
@ -49,7 +43,7 @@ datasets:
text_processor:
train:
name: "blip_caption"
sample_ratio: 15
sample_ratio: 5
refvg:
@ -63,7 +57,7 @@ datasets:
name: "blip_caption"
sample_ratio: 40
llava_detail: #23K
llava_detail:
batch_size: 4
vis_processor:
train:
@ -74,7 +68,7 @@ datasets:
name: "blip_caption"
sample_ratio: 20
llava_reason: # 77k
llava_reason:
batch_size: 4
vis_processor:
train:
@ -142,7 +136,7 @@ datasets:
name: "blip_caption"
sample_ratio: 10
refcoco: # 142k
refcoco:
batch_size: 6
vis_processor:
train:
@ -151,7 +145,7 @@ datasets:
text_processor:
train:
name: "blip_caption"
sample_ratio: 15
sample_ratio: 20
refcocop:
@ -163,7 +157,7 @@ datasets:
text_processor:
train:
name: "blip_caption"
sample_ratio: 15
sample_ratio: 20
refcocog:
batch_size: 6
@ -174,7 +168,7 @@ datasets:
text_processor:
train:
name: "blip_caption"
sample_ratio: 15
sample_ratio: 20
@ -212,7 +206,7 @@ datasets:
sample_ratio: 10
coco_vqa: # 82K
coco_vqa:
batch_size: 6
vis_processor:
train:
@ -223,7 +217,7 @@ datasets:
name: "blip_caption"
sample_ratio: 15
ok_vqa: # 9k
ok_vqa:
batch_size: 6
vis_processor:
train:
@ -234,7 +228,7 @@ datasets:
name: "blip_caption"
sample_ratio: 8
aok_vqa: # 17k
aok_vqa:
batch_size: 6
vis_processor:
train:
@ -245,7 +239,7 @@ datasets:
name: "blip_caption"
sample_ratio: 12
gqa: # 82K
gqa:
batch_size: 6
vis_processor:
train:
@ -254,9 +248,9 @@ datasets:
text_processor:
train:
name: "blip_caption"
sample_ratio: 40
sample_ratio: 50
ocrvqa: # 800K
ocrvqa:
batch_size: 6
vis_processor:
train:
@ -283,7 +277,7 @@ run:
iters_per_epoch: 1000
seed: 42
output_dir: "/ibex/project/c2090/minigpt4_ckpt/448_finetune_test_online"
output_dir: "/path/to/save_checkpoint"
amp: True
resume_ckpt_path: null
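With llama_model, ckpt, the dataset paths, and output_dir replaced by real locations, the finetuning run can be launched the same way as the stage-one command earlier; the config filename below is an assumption about where this training config is saved:
```bash
# Launch MiniGPT-v2 finetuning with the edited config
# (train_configs/minigptv2_finetune.yaml is an assumed path; point --cfg-path at your actual file)
torchrun --nproc-per-node NUM_GPU train.py --cfg-path train_configs/minigptv2_finetune.yaml
```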