# MiniGPT-4/minigpt4/projects/minigpt/train/minigptv2_finetune.yaml
# 2023-12-01 23:17:44 +08:00
#
# 293 lines
# 5.9 KiB
# YAML

# Model section: architecture, base LLM / stage checkpoint locations, and
# LoRA hyper-parameters. All keys below belong to the `model` mapping.
model:
  arch: minigpt_v2
  model_type: pretrain
  max_txt_len: 1024
  image_size: 448
  end_sym: "</s>"
  # Absolute, machine-specific paths; adjust for the local environment.
  llama_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/llama_2_7b_chat"
  ckpt: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/minigptv2/checkpoint_stage2.pth"
  use_grad_checkpoint: True
  chat_template: True
  lora_r: 64
  lora_alpha: 16

# Training datasets. Each enabled entry carries its own batch_size, the
# visual/text processors used for the train split, and a sample_ratio
# (presumably the relative sampling weight when datasets are mixed --
# confirm against the data loader). Entries that are commented out are
# disabled. Trailing comments on dataset names are the original author's
# notes (they look like sample counts -- TODO confirm).
datasets:
  # multitask_conversation:  # in-house data 12171
  #   batch_size: 2
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 50

  llava_conversation:  # 56681
    batch_size: 2
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 30

  unnatural_instruction:  # pure text 65852
    batch_size: 1
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 10

  # refvg:  # [refer] return the location
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 40

  llava_detail:  # 23240
    batch_size: 4
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 20

  llava_reason:  # 76643
    batch_size: 4
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 80

  # flickr_grounded_caption:  # [grounding] : TODO
  #   batch_size: 2
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 80

  # flickr_CaptionToPhrase:  # [detection]
  #   batch_size: 2
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 80

  # flickr_ObjectToPhrase:  # [detection]
  #   batch_size: 2
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 80

  coco_caption:  # 414113 train
    batch_size: 6
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 10

  textcaps_caption:  # 109765 train
    batch_size: 6
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 30

  # refcoco:
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 25

  # refcocop:
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 25

  # refcocog:
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 25

  # invrefcoco:
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 10

  # invrefcocop:
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 10

  # invrefcocog:
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 10

  coco_vqa:  # 658104
    batch_size: 6
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 15

  ok_vqa:  # train, valid (9009, 5046)
    batch_size: 6
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 8

  aok_vqa:  # (17056, 1145, 6702)
    batch_size: 6
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 12

  gqa:  # (943000, 12578, 12578)
    batch_size: 6
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 50

  ocrvqa:  # 207572
    batch_size: 6
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 30

# Run section: task, optimizer / LR schedule, loop sizing, output location,
# data splits, and device / distributed / logging switches. All keys below
# belong to the `run` mapping.
run:
  task: image_text_pretrain

  # optimizer
  lr_sched: "linear_warmup_cosine_lr"
  init_lr: 1e-5
  # NOTE(review): min_lr (8e-5) is larger than init_lr (1e-5); a cosine
  # schedule normally decays towards a smaller floor -- confirm intended.
  min_lr: 8e-5
  warmup_lr: 1e-6

  weight_decay: 0.05
  max_epoch: 1
  num_workers: 6
  # NOTE(review): warmup_steps equals max_epoch * iters_per_epoch (1000), so
  # warm-up presumably spans the whole run -- confirm intended.
  warmup_steps: 1000
  iters_per_epoch: 1000

  seed: 42
  # Absolute, machine-specific path; adjust for the local environment.
  output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/minigpt/v2/vqa_pretrain_3B_llama2_7b_chat_stage3_train_linear_lora_test_1030"

  amp: True
  resume_ckpt_path: null

  evaluate: False
  train_splits: ["train"]
  valid_splits: ["val"]
  test_splits: ["test"]

  device: "cuda"
  world_size: 1
  dist_url: "env://"
  distributed: True

  wandb_log: True
  job_name: minigptv2_finetune