# MiniGPT-4/train_configs/minigpt_v2_finetune.yaml
# 2023-10-22 21:37:45 +03:00
#
# 300 lines
# 5.9 KiB
# YAML

# Model section: architecture selection, checkpoint paths and LoRA
# hyperparameters. Every key below is a child of `model`.
model:
  arch: minigpt_v2
  model_type: pretrain
  freeze_vit: True
  freeze_qformer: True
  max_txt_len: 1024
  low_resource: False
  # Same value as the `image_size` given to each dataset's vis_processor.
  image_size: 448
  end_sym: "</s>"
  # Absolute, machine-specific paths: adjust both to your own environment
  # before launching a run.
  llama_model: "/ibex/project/c2133/llama_v2/llama-2-7b-chat-pytorch_update"
  ckpt: "/ibex/project/c2090/minigpt4_ckpt/448_perforamnce_correct_v10_vg/20230925064/checkpoint_32.pth"
  use_grad_checkpoint: True
  chat_template: True
  lora_r: 64
  lora_alpha: 16
# Datasets section: one mapping per dataset. Each entry carries a batch size,
# a train-time image processor, a train-time text processor and a
# `sample_ratio` (presumably the relative sampling weight between datasets;
# confirm against the data loader).
#
# NOTE(review): the two active entries use `batch_size_train`, while every
# commented-out entry uses `batch_size`. Confirm which key the loader reads
# before re-enabling any of the disabled datasets.
datasets:
  multitask_conversation:
    batch_size_train: 2
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 40

  llava_conversation:  # 77k
    batch_size_train: 2
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 10

  # ---- Disabled datasets (kept for reference; uncomment to re-enable) ----

  # unnatural_instruction:
  #   batch_size: 1
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 15

  # refvg:
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 40

  # llava_detail:  # 23K
  #   batch_size: 4
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 20

  # llava_reason:  # 77k
  #   batch_size: 4
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 80

  # grounded_detailed_image_caption:
  #   batch_size: 2
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 80

  # CaptionToPhrase:
  #   batch_size: 2
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 80

  # ObjectToPhrase:
  #   batch_size: 2
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 80

  # coco_caption:
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 10

  # textcaps_caption:
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 10

  # refcoco:  # 142k
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 15

  # refcocop:
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 15

  # refcocog:
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 15

  # invrefcoco:
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 10

  # invrefcocop:
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 10

  # invrefcocog:
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 10

  # coco_vqa:  # 82K
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 15

  # ok_vqa:  # 9k
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 8

  # aok_vqa:  # 17k
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 12

  # gqa:  # 82K
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 40

  # ocrvqa:  # 800K
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 30
# Run section: task name, learning-rate schedule, loop sizes, output location
# and device/distributed settings. Every key below is a child of `run`.
run:
  task: image_text_pretrain

  # optimizer
  lr_sched: "linear_warmup_cosine_lr"
  # Learning rates carry an explicit decimal point so that YAML 1.1 loaders,
  # which need one to recognise a float, do not read them as strings. The
  # numeric values are unchanged.
  init_lr: 1.0e-5
  # NOTE(review): min_lr (8e-5) is larger than init_lr (1e-5). With a cosine
  # schedule that moves from init_lr towards min_lr, this would make the rate
  # rise after warmup instead of decaying. Confirm this is intended.
  min_lr: 8.0e-5
  warmup_lr: 1.0e-6

  weight_decay: 0.05
  max_epoch: 50
  num_workers: 6
  warmup_steps: 1000
  iters_per_epoch: 1000

  seed: 42
  # Absolute, machine-specific path: adjust to your own environment.
  output_dir: "/ibex/project/c2090/minigpt4_ckpt/448_conversation_correct_best_v7_ablation1_v5_v6_again_system_prompt"

  amp: True
  resume_ckpt_path: null

  evaluate: False
  train_splits: ["train"]

  device: "cuda"
  world_size: 1
  dist_url: "env://"
  distributed: True