# Mirror of https://github.com/Vision-CAIR/MiniGPT-4.git
# Synced 2025-04-05 02:20:47 +00:00
# 300 lines, 5.9 KiB, YAML
# Model settings for the minigpt_v2 architecture.
# NOTE(review): nesting was reconstructed from a flattened listing; every key
# below is assumed to belong directly under `model` -- confirm against the
# consumer's schema.
model:
  arch: minigpt_v2
  model_type: pretrain
  freeze_vit: true
  freeze_qformer: true
  max_txt_len: 1024
  low_resource: false
  image_size: 448
  end_sym: "</s>"
  # Absolute, site-specific paths: adjust both for the local environment.
  llama_model: "/ibex/project/c2133/llama_v2/llama-2-7b-chat-pytorch_update"
  ckpt: "/ibex/project/c2090/minigpt4_ckpt/448_perforamnce_correct_v10_vg/20230925064/checkpoint_32.pth"
  use_grad_checkpoint: true
  chat_template: true
  lora_r: 64
  lora_alpha: 16
|
|
|
|
|
|
# Active training datasets. Each entry names its image and text processors
# and carries a `sample_ratio` value.
# NOTE(review): nesting was reconstructed from a flattened listing
# (`image_size` placed under `vis_processor.train`, `sample_ratio` at the
# dataset level) -- confirm against the dataset builder.
datasets:
  multitask_conversation:
    batch_size_train: 2
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 40

  llava_conversation:  # 77k
    batch_size_train: 2
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 10
|
|
|
|
|
|
|
|
|
|
# unnatural_instruction:
|
|
# batch_size: 1
|
|
# vis_processor:
|
|
# train:
|
|
# name: "blip2_image_train"
|
|
# image_size: 448
|
|
# text_processor:
|
|
# train:
|
|
# name: "blip_caption"
|
|
# sample_ratio: 15
|
|
|
|
|
|
# refvg:
|
|
# batch_size: 6
|
|
# vis_processor:
|
|
# train:
|
|
# name: "blip2_image_train"
|
|
# image_size: 448
|
|
# text_processor:
|
|
# train:
|
|
# name: "blip_caption"
|
|
# sample_ratio: 40
|
|
|
|
# llava_detail: #23K
|
|
# batch_size: 4
|
|
# vis_processor:
|
|
# train:
|
|
# name: "blip2_image_train"
|
|
# image_size: 448
|
|
# text_processor:
|
|
# train:
|
|
# name: "blip_caption"
|
|
# sample_ratio: 20
|
|
|
|
# llava_reason: # 77k
|
|
# batch_size: 4
|
|
# vis_processor:
|
|
# train:
|
|
# name: "blip2_image_train"
|
|
# image_size: 448
|
|
# text_processor:
|
|
# train:
|
|
# name: "blip_caption"
|
|
# sample_ratio: 80
|
|
|
|
|
|
# grounded_detailed_image_caption:
|
|
# batch_size: 2
|
|
# vis_processor:
|
|
# train:
|
|
# name: "blip2_image_train"
|
|
# image_size: 448
|
|
# text_processor:
|
|
# train:
|
|
# name: "blip_caption"
|
|
# sample_ratio: 80
|
|
|
|
# CaptionToPhrase:
|
|
# batch_size: 2
|
|
# vis_processor:
|
|
# train:
|
|
# name: "blip2_image_train"
|
|
# image_size: 448
|
|
# text_processor:
|
|
# train:
|
|
# name: "blip_caption"
|
|
# sample_ratio: 80
|
|
|
|
# ObjectToPhrase:
|
|
# batch_size: 2
|
|
# vis_processor:
|
|
# train:
|
|
# name: "blip2_image_train"
|
|
# image_size: 448
|
|
# text_processor:
|
|
# train:
|
|
# name: "blip_caption"
|
|
# sample_ratio: 80
|
|
|
|
# coco_caption:
|
|
# batch_size: 6
|
|
# vis_processor:
|
|
# train:
|
|
# name: "blip2_image_train"
|
|
# image_size: 448
|
|
# text_processor:
|
|
# train:
|
|
# name: "blip_caption"
|
|
# sample_ratio: 10
|
|
|
|
|
|
# textcaps_caption: #
|
|
# batch_size: 6
|
|
# vis_processor:
|
|
# train:
|
|
# name: "blip2_image_train"
|
|
# image_size: 448
|
|
# text_processor:
|
|
# train:
|
|
# name: "blip_caption"
|
|
# sample_ratio: 10
|
|
|
|
# refcoco: # 142k
|
|
# batch_size: 6
|
|
# vis_processor:
|
|
# train:
|
|
# name: "blip2_image_train"
|
|
# image_size: 448
|
|
# text_processor:
|
|
# train:
|
|
# name: "blip_caption"
|
|
# sample_ratio: 15
|
|
|
|
|
|
# refcocop:
|
|
# batch_size: 6
|
|
# vis_processor:
|
|
# train:
|
|
# name: "blip2_image_train"
|
|
# image_size: 448
|
|
# text_processor:
|
|
# train:
|
|
# name: "blip_caption"
|
|
# sample_ratio: 15
|
|
|
|
# refcocog:
|
|
# batch_size: 6
|
|
# vis_processor:
|
|
# train:
|
|
# name: "blip2_image_train"
|
|
# image_size: 448
|
|
# text_processor:
|
|
# train:
|
|
# name: "blip_caption"
|
|
# sample_ratio: 15
|
|
|
|
|
|
|
|
# invrefcoco:
|
|
# batch_size: 6
|
|
# vis_processor:
|
|
# train:
|
|
# name: "blip2_image_train"
|
|
# image_size: 448
|
|
# text_processor:
|
|
# train:
|
|
# name: "blip_caption"
|
|
# sample_ratio: 10
|
|
|
|
# invrefcocop:
|
|
# batch_size: 6
|
|
# vis_processor:
|
|
# train:
|
|
# name: "blip2_image_train"
|
|
# image_size: 448
|
|
# text_processor:
|
|
# train:
|
|
# name: "blip_caption"
|
|
# sample_ratio: 10
|
|
|
|
# invrefcocog:
|
|
# batch_size: 6
|
|
# vis_processor:
|
|
# train:
|
|
# name: "blip2_image_train"
|
|
# image_size: 448
|
|
# text_processor:
|
|
# train:
|
|
# name: "blip_caption"
|
|
# sample_ratio: 10
|
|
|
|
|
|
# coco_vqa: # 82K
|
|
# batch_size: 6
|
|
# vis_processor:
|
|
# train:
|
|
# name: "blip2_image_train"
|
|
# image_size: 448
|
|
# text_processor:
|
|
# train:
|
|
# name: "blip_caption"
|
|
# sample_ratio: 15
|
|
|
|
# ok_vqa: # 9k
|
|
# batch_size: 6
|
|
# vis_processor:
|
|
# train:
|
|
# name: "blip2_image_train"
|
|
# image_size: 448
|
|
# text_processor:
|
|
# train:
|
|
# name: "blip_caption"
|
|
# sample_ratio: 8
|
|
|
|
# aok_vqa: # 17k
|
|
# batch_size: 6
|
|
# vis_processor:
|
|
# train:
|
|
# name: "blip2_image_train"
|
|
# image_size: 448
|
|
# text_processor:
|
|
# train:
|
|
# name: "blip_caption"
|
|
# sample_ratio: 12
|
|
|
|
# gqa: # 82K
|
|
# batch_size: 6
|
|
# vis_processor:
|
|
# train:
|
|
# name: "blip2_image_train"
|
|
# image_size: 448
|
|
# text_processor:
|
|
# train:
|
|
# name: "blip_caption"
|
|
# sample_ratio: 40
|
|
|
|
# ocrvqa: # 800K
|
|
# batch_size: 6
|
|
# vis_processor:
|
|
# train:
|
|
# name: "blip2_image_train"
|
|
# image_size: 448
|
|
# text_processor:
|
|
# train:
|
|
# name: "blip_caption"
|
|
# sample_ratio: 30
|
|
|
|
|
|
# Run-time settings: task, learning-rate schedule, epoch sizing, output
# location and device/distributed options.
# NOTE(review): nesting was reconstructed from a flattened listing; every key
# below is assumed to belong directly under `run` -- confirm against the
# consumer's schema.
run:
  task: image_text_pretrain

  # optimizer
  lr_sched: "linear_warmup_cosine_lr"
  # Floats are written with an explicit decimal point so that YAML 1.1
  # resolvers also load them as numbers rather than strings.
  init_lr: 1.0e-5
  # NOTE(review): min_lr (8e-5) is larger than init_lr (1e-5). If the
  # scheduler anneals from init_lr toward min_lr, the learning rate will rise
  # over training instead of decaying -- confirm this is intended.
  min_lr: 8.0e-5
  warmup_lr: 1.0e-6

  weight_decay: 0.05
  max_epoch: 50
  num_workers: 6
  warmup_steps: 1000
  iters_per_epoch: 1000

  seed: 42
  # Absolute, site-specific path: adjust for the local environment.
  output_dir: "/ibex/project/c2090/minigpt4_ckpt/448_conversation_correct_best_v7_ablation1_v5_v6_again_system_prompt"

  amp: true
  resume_ckpt_path: null

  evaluate: false
  train_splits: ["train"]

  device: "cuda"
  world_size: 1
  dist_url: "env://"
  distributed: true