Mirror of https://github.com/Vision-CAIR/MiniGPT-4.git

Commit 89878d661e (parent f976c7800f): update dataset readme

.gitignore (vendored): 2 lines changed
@@ -179,3 +179,5 @@ jobs/
 *.slurm
 slurm*
 sbatch_generate*
+eval_data/
+dataset/Evaluation.md
MiniGPTv2_Train.md (new file, 22 lines)
@@ -0,0 +1,22 @@
## Finetune of MiniGPT-4

The training of MiniGPT-4 contains two alignment stages.

**1. First pretraining stage**

In the first pretraining stage, the model is trained on image-text pairs from the Laion and CC datasets
to align the vision and language models. To download and prepare the datasets, please check
our [first stage dataset preparation instruction](dataset/README_1_STAGE.md).
After the first stage, the visual features are mapped into a representation the language
model can understand.

To launch the first stage training, run the following command. In our experiments, we use 4 A100 GPUs.
You can change the save path in the config file
[train_configs/minigpt4_stage1_pretrain.yaml](train_configs/minigpt4_stage1_pretrain.yaml).

```bash
torchrun --nproc-per-node NUM_GPU train.py --cfg-path train_configs/minigpt4_stage1_pretrain.yaml
```
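For reference, the save path mentioned above lives under the `run:` section of the training config. A minimal sketch of that part of the file, with the key name taken from the finetune config shown later in this diff and the value being a placeholder:

```yaml
# Sketch of the save-path setting in train_configs/minigpt4_stage1_pretrain.yaml
# (other run options such as learning rate and epochs are omitted).
run:
  output_dir: "/path/to/save_checkpoint"   # checkpoints and logs are written here
```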
A MiniGPT-4 checkpoint with only stage-one training can be downloaded
[here (13B)](https://drive.google.com/file/d/1u9FRRBB3VovP1HxCAlpD9Lw4t4P6-Yq8/view?usp=share_link) or [here (7B)](https://drive.google.com/file/d/1HihQtCEXUyBM1i9DQbaK934wW3TZi-h5/view?usp=share_link).
Compared to the model after stage two, this checkpoint frequently generates incomplete and repeated sentences.
@@ -93,9 +93,10 @@ Then, set the variable *llama_model* in the model config file to the LLM weight
 
 Download the pretrained model checkpoints
 
-| MiniGPT-v2 (LLaMA-2 Chat 7B) |
-|------------------------------|
-| [Download](https://drive.google.com/file/d/1aVbfW7nkCSYx99_vCRyP1sOlQiWVSnAl/view?usp=sharing) |
+| MiniGPT-v2 (developing model (online demo)) | MiniGPT-v2 (after stage-2) | MiniGPT-v2 (after stage-3) |
+|------------------------------|------------------------------|------------------------------|
+| [Download](https://drive.google.com/file/d/1aVbfW7nkCSYx99_vCRyP1sOlQiWVSnAl/view?usp=sharing) | [Download](https://drive.google.com/file/d/1Vi_E7ZtZXRAQcyz4f8E6LtLh2UXABCmu/view?usp=sharing) | [Download](https://drive.google.com/file/d/1jAbxUiyl04SFJMN4sF1vvUU69Etuz4qa/view?usp=sharing) |
 
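If you prefer fetching these Google Drive checkpoints from the command line, here is a sketch using the third-party `gdown` tool (not part of this repo; the output file name is arbitrary):

```bash
# Sketch: download a checkpoint from Google Drive with gdown (pip install gdown).
# The --fuzzy flag lets gdown accept the full sharing URL from the table above.
pip install gdown
gdown --fuzzy "https://drive.google.com/file/d/1aVbfW7nkCSYx99_vCRyP1sOlQiWVSnAl/view?usp=sharing" -O minigptv2_checkpoint.pth
```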
 
 For **MiniGPT-v2**, set the path to the pretrained checkpoint in the evaluation config file
 in [eval_configs/minigptv2_eval.yaml](eval_configs/minigptv2_eval.yaml#L10) at Line 8.
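For orientation, the checkpoint path referred to above is the `ckpt` entry under `model:` in the eval config; a minimal sketch, with the key name matching the finetune config later in this diff and the surrounding structure assumed:

```yaml
# Sketch of the relevant part of eval_configs/minigptv2_eval.yaml
model:
  ckpt: "/path/to/pretrained_checkpoint"   # set this to the downloaded MiniGPT-v2 checkpoint
```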
@@ -146,6 +147,7 @@ Thanks [@WangRongsheng](https://github.com/WangRongsheng), you can also run Mini
 
 ### Training
 For training details of MiniGPT-4, check [here](MiniGPT4_Train.md).
 
+For finetuning details of MiniGPT-v2, check [here](MiniGPTv2_Train.md).
 
 
@@ -26,11 +26,22 @@ LLaVA | <a href="https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/
### COCO captions
Download the COCO 2014 images and captions

coco 2014 images path

```
${MINIGPTv2_DATASET}
├── coco
│   ├── images
│   ...
...
```

coco caption annotation path

```
${MINIGPTv2_DATASET}
├── coco_captions
│   ├── coco_images
│   └── annotations
│       ├── coco_karpathy_train.json
│       ...
```
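A sketch of how one might fetch and arrange these files: the image archive URL is the standard COCO 2014 release, the annotation URL is the one that appears in the coco caption config further down in this diff, and the ${MINIGPTv2_DATASET} layout follows the trees above.

```bash
# Sketch: download COCO 2014 training images and the Karpathy caption split,
# then arrange them under ${MINIGPTv2_DATASET} as shown above. Paths are placeholders.
export MINIGPTv2_DATASET=/path/to/minigptv2_dataset

mkdir -p ${MINIGPTv2_DATASET}/coco/images ${MINIGPTv2_DATASET}/coco_captions/annotations

# COCO 2014 training images (large download)
wget http://images.cocodataset.org/zips/train2014.zip
unzip train2014.zip -d ${MINIGPTv2_DATASET}/coco/images

# Karpathy-split caption annotations (same URL as in the coco caption config below)
wget -P ${MINIGPTv2_DATASET}/coco_captions/annotations \
  https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json
```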
@@ -10,20 +10,11 @@ datasets:
 
     build_info:
       # Be careful not to append minus sign (-) before split to avoid itemizing
-      # annotations:
-      #   train:
-      #     url:
-      #       - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_train.json
-      #     storage:
-      #       - /path/to/aokvqa/annotations/aokvqa_v1p0_train.json
-      #   images:
-      #     storage: /path/to/coco/images/
-
       annotations:
         train:
           url:
             - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_train.json
           storage:
-            - /ibex/project/c2090/minigptv2_dataset/aokvqa/aokvqa_v1p0_train.json
+            - /path/to/aokvqa_v1p0_train.json
       images:
-        storage: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/
+        storage: /path/to/coco/images
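The same substitution, cluster-specific /ibex paths replaced by /path/to placeholders and the old commented-out templates dropped, repeats in the dataset config hunks below. After pointing the placeholders at real data, a quick way to spot anything missed; the directory is an assumption about the repo layout:

```bash
# Sketch: list any dataset config lines that still contain a placeholder
# or a cluster-specific path. Adjust the directory to where the configs live.
grep -rn -e "/path/to" -e "/ibex/" minigpt4/configs/datasets
```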
@@ -9,30 +9,13 @@ datasets:
     # data_dir: ${env.data_dir}/datasets
     data_type: images # [images|videos|features]
 
-    # build_info:
-    #   # Be careful not to append minus sign (-) before split to avoid itemizing
-    #   annotations:
-    #     train:
-    #       url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json
-    #       md5: aa31ac474cf6250ebb81d18348a07ed8
-    #       storage: /path/to/coco_caption/annotations/coco_karpathy_train.json
-    #   images:
-    #     storage: /path/to/coco/images/
-
     build_info:
       # Be careful not to append minus sign (-) before split to avoid itemizing
       annotations:
         train:
           url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json
           md5: aa31ac474cf6250ebb81d18348a07ed8
-          storage: /ibex/project/c2090/minigptv2_dataset/coco_captions/coco_karpathy_train.json
-          # val:
-          #   url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json
-          #   md5: b273847456ef5580e33713b1f7de52a0
-          #   storage: /ibex/project/c2133/minigpt4_v2_dataset/coco_caption/annotations/coco_karpathy_val.json
-          # test:
-          #   url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json
-          #   md5: 3ff34b0ef2db02d01c37399f6a2a6cd1
-          #   storage: /ibex/project/c2133/minigpt4_v2_dataset/coco_caption/annotations/coco_karpathy_test.json
+          storage: /path/to/coco_caption/coco_karpathy_train.json
       images:
-        storage: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg
+        storage: /path/to/coco/images
 
@@ -10,24 +10,15 @@ datasets:
 
     build_info:
 
-      # annotations:
-      #   train:
-      #     url:
-      #       - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_train.json
-      #       - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_val.json
-      #     storage:
-      #       - /path/to/vqav2/annotations/vqa_train.json
-      #       - /path/to/vqav2/coco/annotations/vqa_val.json
-      #   images:
-      #     storage: /path/to/coco/images/
-
       annotations:
         train:
           url:
             - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_train.json
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_val.json
           storage:
-            - /ibex/project/c2090/minigptv2_dataset/vqav2/vqa_train.json
-            - /ibex/project/c2090/minigptv2_dataset/vqav2/vqa_val.json
+            - /path/to/vqav2/vqa_train.json
+            - /path/to/vqav2/vqa_val.json
       images:
-        storage: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg
+        storage: /path/to/coco/images
 
@@ -2,7 +2,7 @@ datasets:
   invrefcoco:
     data_type: images
     build_info:
-      image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
-      ann_path: /ibex/project/c2090/minigptv2_dataset/refcoco_annotations
+      image_path: /path/to/coco/images
+      ann_path: /path/to/refcoco_annotations
       dataset: invrefcoco
       splitBy: unc
@@ -2,7 +2,7 @@ datasets:
   invrefcocog:
     data_type: images
     build_info:
-      image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
-      ann_path: /ibex/project/c2090/minigptv2_dataset/refcoco_annotations
+      image_path: /path/to/coco/images
+      ann_path: /path/to/refcoco_annotations
       dataset: invrefcocog
       splitBy: umd
@@ -2,7 +2,7 @@ datasets:
   invrefcocop:
     data_type: images
     build_info:
-      image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
-      ann_path: /ibex/project/c2090/minigptv2_dataset/refcoco_annotations
+      image_path: /path/to/coco/images
+      ann_path: /path/to/refcoco_annotations
       dataset: invrefcoco+
       splitBy: unc
@@ -2,7 +2,7 @@ datasets:
   refcoco:
     data_type: images
     build_info:
-      image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
-      ann_path: /ibex/project/c2090/minigptv2_dataset/refcoco_annotations
+      image_path: /path/to/coco/images
+      ann_path: /path/to/refcoco_annotations
       dataset: refcoco
       splitBy: unc
@@ -2,7 +2,7 @@ datasets:
   refcocog:
     data_type: images
     build_info:
-      image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
-      ann_path: /ibex/project/c2090/minigptv2_dataset/refcoco_annotations
+      image_path: /path/to/coco/images
+      ann_path: /path/to/refcoco_annotations
       dataset: refcocog
       splitBy: umd
@@ -2,7 +2,7 @@ datasets:
   refcocop:
     data_type: images
     build_info:
-      image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
-      ann_path: /ibex/project/c2090/minigptv2_dataset/refcoco_annotations
+      image_path: /path/to/coco/images
+      ann_path: /path/to/refcoco_annotations
       dataset: refcoco+
       splitBy: unc
@@ -2,5 +2,5 @@ datasets:
   flickr_CaptionToPhrase:
     data_type: images
     build_info:
-      image_path: /ibex/project/c2133/minigpt4_v2_dataset/flickr/images
-      ann_path: /ibex/project/c2090/minigptv2_dataset/filtered_flickr/captiontobbox.json
+      image_path: /path/to/filtered_flickr/images
+      ann_path: /path/to/filtered_flickr/captiontobbox.json
@@ -2,5 +2,5 @@ datasets:
   flickr_grounded_caption:
     data_type: images
     build_info:
-      image_path: /ibex/project/c2133/minigpt4_v2_dataset/flickr/images
-      ann_path: /ibex/project/c2090/minigptv2_dataset/filtered_flickr/groundedcaption.json
+      image_path: /path/to/filtered_flickr/images
+      ann_path: /path/to/filtered_flickr/groundedcaption.json
@@ -2,5 +2,5 @@ datasets:
   flickr_ObjectToPhrase:
     data_type: images
     build_info:
-      image_path: /ibex/project/c2133/minigpt4_v2_dataset/flickr/images
-      ann_path: /ibex/project/c2090/minigptv2_dataset/filtered_flickr/phrasetobbox.json
+      image_path: /path/to/filtered_flickr/images
+      ann_path: /path/to/filtered_flickr/phrasetobbox.json
@@ -8,19 +8,6 @@ datasets:
     # data_dir: ${env.data_dir}/datasets
     data_type: images # [images|videos|features]
 
-    # build_info:
-    #   # Be careful not to append minus sign (-) before split to avoid itemizing
-    #   annotations:
-    #     train:
-    #       url:
-    #         - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json
-    #       storage:
-    #         - /path/to/gqa/annotations/train_balanced_questions.json
-
-    #   images:
-    #     storage: /path/to/gqa/images/
-
     build_info:
       # Be careful not to append minus sign (-) before split to avoid itemizing
       annotations:
@@ -28,6 +15,7 @@ datasets:
           url:
             - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json
           storage:
-            - /ibex/project/c2090/minigptv2_dataset/gqa/train_balanced_questions.json
+            - /path/to/gqa/train_balanced_questions.json
 
       images:
-        storage: /ibex/project/c2090/minigptv2_dataset/gqa/images
+        storage: /path/to/gqa/images
 
@@ -1,12 +1,7 @@
 datasets:
-  # llava_conversation:
-  #   data_type: images
-  #   build_info:
-  #     image_path: /path/to/coco/images
-  #     ann_path: /path/to/llava/conversation_58k.json
-
   llava_conversation:
     data_type: images
     build_info:
-      image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
-      ann_path: /ibex/project/c2090/minigptv2_dataset/llava/conversation_58k.json
+      image_path: /path/to/coco/images
+      ann_path: /path/to/llava/conversation_58k.json
@@ -1,12 +1,6 @@
 datasets:
-  # llava_detail:
-  #   data_type: images
-  #   build_info:
-  #     image_path: /path/to/coco/images
-  #     ann_path: /path/to/llava/detail_23k.json
-
   llava_detail:
     data_type: images
     build_info:
-      image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
-      ann_path: /ibex/project/c2090/minigptv2_dataset/llava/detail_23k.json
+      image_path: /path/to/coco/images
+      ann_path: /path/to/llava/detail_23k.json
@@ -1,12 +1,7 @@
 datasets:
-  # llava_reason:
-  #   data_type: images
-  #   build_info:
-  #     image_path: /path/to/coco/images
-  #     ann_path: /path/to/llava/complex_reasoning_77k.json
-
   llava_reason:
     data_type: images
     build_info:
-      image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
-      ann_path: /ibex/project/c2090/minigptv2_dataset/llava/complex_reasoning_77k.json
+      image_path: /path/to/coco/images
+      ann_path: /path/to/llava/complex_reasoning_77k.json
@@ -1,14 +1,7 @@
 datasets:
-  # multitask_conversation:
-  #   data_type: images
-  #   build_info:
-
-  #     image_path: /path/to/coco/images
-  #     ann_path: /path/to/multitask_conversation/multi_task_conversation.json
-
   multitask_conversation:
     data_type: images
     build_info:
 
-      image_path: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg/train
-      ann_path: /ibex/project/c2090/minigptv2_dataset/multitask_conversation/multi_task_conversation.json
+      image_path: /path/to/coco/images
+      ann_path: /path/to/multitask_conversation/multi_task_conversation.json
@@ -1,10 +1,5 @@
 datasets:
-  # unnatural_instruction:
-  #   data_type: text
-  #   build_info:
-  #     ann_path: /path/to/unnatural-instructions/data/unnatural_instruction_filer.json
-
   unnatural_instruction:
     data_type: text
     build_info:
-      ann_path: /ibex/project/c2090/minigptv2_dataset/unnatural_instructions/unnatural_instruction_filer.json
+      ann_path: /path/to/unnatural_instructions/filtered_unnatural_instruction.json
@@ -1,12 +1,6 @@
 datasets:
-  # ocrvqa:
-  #   data_type: images
-  #   build_info:
-  #     image_path: /ibex/project/c2133/minigpt4_v2_dataset/ocrvqa/images
-  #     ann_path: /ibex/project/c2133/minigpt4_v2_dataset/ocrvqa/dataset.json
-
   ocrvqa:
     data_type: images
     build_info:
-      image_path: /ibex/project/c2090/minigptv2_dataset/ocrvqa/images
-      ann_path: /ibex/project/c2090/minigptv2_dataset/ocrvqa/dataset.json
+      image_path: /path/to/ocrvqa/images
+      ann_path: /path/to/ocrvqa/dataset.json
@@ -8,19 +8,6 @@ datasets:
     # data_dir: ${env.data_dir}/datasets
     data_type: images # [images|videos|features]
 
-    # build_info:
-    #   # Be careful not to append minus sign (-) before split to avoid itemizing
-    #   annotations:
-    #     train:
-    #       url:
-    #         # TODO make this order insensitive
-    #         - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_train.json
-    #       storage:
-    #         - /path/to/okvqa/annotations/okvqa_train.json
-    #   images:
-    #     storage: /path/to/okvqa/images
-
-
     build_info:
       # Be careful not to append minus sign (-) before split to avoid itemizing
      annotations:
@@ -28,9 +15,7 @@ datasets:
          url:
            # TODO make this order insensitive
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_train.json
-           # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_train2014_questions.json
-           # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_train2014_annotations.json
          storage:
-            - /ibex/project/c2090/minigptv2_dataset/okvqa/okvqa_train.json
+            - /path/to/okvqa/okvqa_train.json
       images:
-        storage: /ibex/reference/CV/COCO/cocoapi/data/2014/images/jpeg
+        storage: /path/to/coco/images
@@ -1,16 +1,9 @@
 datasets:
-  # textcaps_caption:
-  #   data_type: images
-
-  #   build_info:
-  #     image_path: /path/to/TextCaps/train_images
-  #     ann_path: /path/to/TextCaps/TextCaps_0.1_train.json
-
   textcaps_caption:
     data_type: images
 
     build_info:
-      image_path: /ibex/project/c2090/minigptv2_dataset/textcaps/train_images
-      ann_path: /ibex/project/c2090/minigptv2_dataset/textcaps/TextCaps_0.1_train.json
+      image_path: /path/to/textcaps/train_images
+      ann_path: /path/to/textcaps/TextCaps_0.1_train.json
 
 
@@ -1,10 +1,5 @@
 datasets:
-  # refvg:
-  #   data_type: images
-  #   build_info:
-  #     data_dir: /path/to/visual_genome
-
   refvg:
     data_type: images
     build_info:
-      data_dir: /ibex/project/c2090/minigptv2_dataset/visual_genome
+      data_dir: /path/to/visual_genome
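Once the configs above point at real data, a small check like the following can confirm that a YAML still parses and that its local paths exist. OmegaConf is the library these LAVIS-style configs are typically loaded with; the example file name and the set of keys checked are assumptions based on the hunks above.

```python
# Sketch (not part of the repo): load a dataset YAML and flag build_info paths
# that do not exist locally. Requires `pip install omegaconf`.
import os
from omegaconf import OmegaConf

cfg = OmegaConf.load("minigpt4/configs/datasets/some_dataset.yaml")  # assumed, illustrative path
for name, ds in cfg.datasets.items():
    build_info = ds.get("build_info", {})
    for key in ("image_path", "ann_path", "data_dir"):
        path = build_info.get(key)
        if path and not os.path.exists(path):
            print(f"{name}: {key} not found at {path}")
```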
@@ -101,6 +101,9 @@ class CaptionToObjectDataset(Dataset):
 
         instruction = "<Img><ImageHere></Img> {} ".format(instruction)
 
+        print("CaptionToObject instruction", instruction)
+        print("CaptionToObject answer", answer)
+
         return {
             "image": image,
             "instruction_input": instruction,
@@ -145,6 +148,9 @@ class PhraseToObjectDataset(Dataset):
 
         instruction = "<Img><ImageHere></Img> {} ".format(instruction)
 
+        print("PhraseToObject instruction", instruction)
+        print("PhraseToObject answer", answer)
+
         return {
             "image": image,
             "instruction_input": instruction,
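These two hunks add debug prints inside the datasets' item construction. For orientation only, here is a minimal, hypothetical sketch of the pattern being instrumented; apart from the prompt template, the print calls, and the "image"/"instruction_input" keys visible above, every name in it is an assumed placeholder, not the repo's actual implementation.

```python
# Hypothetical sketch of the Dataset pattern the prints above instrument.
from torch.utils.data import Dataset


class SketchGroundingDataset(Dataset):              # placeholder name
    def __init__(self, annotations, vis_processor):
        self.annotations = annotations              # list of dicts (assumed structure)
        self.vis_processor = vis_processor          # image transform (assumed)

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, index):
        info = self.annotations[index]
        image = self.vis_processor(info["image"])   # assumed field
        instruction = "<Img><ImageHere></Img> {} ".format(info["instruction"])
        answer = info["answer"]

        print("CaptionToObject instruction", instruction)  # debug prints as in the diff
        print("CaptionToObject answer", answer)

        return {
            "image": image,
            "instruction_input": instruction,
            "answer": answer,                       # assumed key name
        }
```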
@@ -1,22 +1,17 @@
 model:
   arch: minigpt_v2
   model_type: pretrain
   freeze_vit: True
   freeze_qformer: True
   max_txt_len: 1024
   low_resource: False
   image_size: 448
   end_sym: "</s>"
-  llama_model: "/ibex/project/c2133/llama_v2/llama-2-7b-chat-pytorch_update"
-  ckpt: "/ibex/project/c2090/minigpt4_ckpt/448_perforamnce_correct_v10_vg/20230925064/checkpoint_32.pth"
+  llama_model: "/path/to/llama_checkpoint"
+  ckpt: "/path/to/pretrained_checkpoint"
   use_grad_checkpoint: True
   chat_template: True
   lora_r: 64
   lora_alpha: 16
 
 
 datasets:
 
   multitask_conversation:
     batch_size: 2
     vis_processor:
@@ -26,9 +21,9 @@ datasets:
     text_processor:
       train:
         name: "blip_caption"
-    sample_ratio: 40
+    sample_ratio: 50
 
-  llava_conversation: # 77k
+  llava_conversation:
     batch_size: 2
     vis_processor:
       train:
@@ -37,8 +32,7 @@ datasets:
     text_processor:
       train:
         name: "blip_caption"
-    sample_ratio: 10
-
+    sample_ratio: 30
 
   unnatural_instruction:
     batch_size: 1
@@ -49,7 +43,7 @@ datasets:
     text_processor:
       train:
         name: "blip_caption"
-    sample_ratio: 15
+    sample_ratio: 5
 
 
   refvg:
@@ -63,7 +57,7 @@ datasets:
         name: "blip_caption"
     sample_ratio: 40
 
-  llava_detail: #23K
+  llava_detail:
     batch_size: 4
     vis_processor:
       train:
@@ -74,7 +68,7 @@ datasets:
         name: "blip_caption"
     sample_ratio: 20
 
-  llava_reason: # 77k
+  llava_reason:
     batch_size: 4
     vis_processor:
       train:
@@ -142,7 +136,7 @@ datasets:
         name: "blip_caption"
     sample_ratio: 10
 
-  refcoco: # 142k
+  refcoco:
     batch_size: 6
     vis_processor:
       train:
@@ -151,7 +145,7 @@ datasets:
     text_processor:
       train:
         name: "blip_caption"
-    sample_ratio: 15
+    sample_ratio: 20
 
 
   refcocop:
@@ -163,7 +157,7 @@ datasets:
     text_processor:
       train:
         name: "blip_caption"
-    sample_ratio: 15
+    sample_ratio: 20
 
   refcocog:
     batch_size: 6
@@ -174,7 +168,7 @@ datasets:
     text_processor:
       train:
         name: "blip_caption"
-    sample_ratio: 15
+    sample_ratio: 20
 
 
 
@@ -212,7 +206,7 @@ datasets:
     sample_ratio: 10
 
 
-  coco_vqa: # 82K
+  coco_vqa:
     batch_size: 6
     vis_processor:
       train:
@@ -223,7 +217,7 @@ datasets:
         name: "blip_caption"
     sample_ratio: 15
 
-  ok_vqa: # 9k
+  ok_vqa:
     batch_size: 6
     vis_processor:
       train:
@@ -234,7 +228,7 @@ datasets:
         name: "blip_caption"
     sample_ratio: 8
 
-  aok_vqa: # 17k
+  aok_vqa:
     batch_size: 6
     vis_processor:
       train:
@@ -245,7 +239,7 @@ datasets:
         name: "blip_caption"
     sample_ratio: 12
 
-  gqa: # 82K
+  gqa:
     batch_size: 6
     vis_processor:
       train:
@@ -254,9 +248,9 @@ datasets:
     text_processor:
       train:
         name: "blip_caption"
-    sample_ratio: 40
+    sample_ratio: 50
 
-  ocrvqa: # 800K
+  ocrvqa:
     batch_size: 6
     vis_processor:
       train:
@@ -283,7 +277,7 @@ run:
   iters_per_epoch: 1000
 
   seed: 42
-  output_dir: "/ibex/project/c2090/minigpt4_ckpt/448_finetune_test_online"
+  output_dir: "/path/to/save_checkpoint"
 
   amp: True
   resume_ckpt_path: null
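Putting the edited config above to use: finetuning is launched the same way as the stage-1 command earlier in this diff, pointed at the finetune config. The config file name below is an assumption about where this YAML lives under train_configs/.

```bash
# Sketch: launch MiniGPT-v2 finetuning with the edited config.
# NUM_GPU is a placeholder; the config path is assumed.
torchrun --nproc-per-node NUM_GPU train.py --cfg-path train_configs/minigptv2_finetune.yaml
```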