model:
  arch: minigpt_v2
  model_type: pretrain
  max_txt_len: 1024
  image_size: 448
  end_sym: "</s>"
  llama_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/llama_2_7b_chat"
  ckpt: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/minigptv2/checkpoint_stage2.pth"
  use_grad_checkpoint: True
  chat_template: True
  # LoRA hyperparameters for the LLaMA backbone
  lora_r: 64
  lora_alpha: 16

datasets:
  # llava_conversation:  # 56681 samples
  #   batch_size: 2
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 30

  # unnatural_instruction:  # pure text, 65852 samples
  #   batch_size: 1
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 10

  # llava_detail:  # 23240 samples
  #   batch_size: 4
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 20

  llava_reason:  # 76643 samples
    batch_size: 4
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 80

  # coco_caption:  # 414113 train samples
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 10

  # textcaps_caption:  # 109765 train samples
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 30

  # coco_vqa:  # 658104 samples
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 15

  ok_vqa:  # train/valid: 9009/5046 samples
    batch_size: 6
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 8

  # aok_vqa:  # train: 17056, val: 1145
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 12

  # gqa:  # train/val/test: 943000/12578/12578
  #   type: balanced_sft_raw
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 50

  # ocrvqa:  # train: 207572 samples
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 30

run:
  task: image_text_pretrain
  # optimizer
  lr_sched: "linear_warmup_cosine_lr"
  init_lr: 1e-5
  min_lr: 8e-5  # note: higher than init_lr; for a decaying cosine schedule min_lr is normally set below init_lr (e.g. 8e-6)
  warmup_lr: 1e-6
  weight_decay: 0.05
  max_epoch: 1
  num_workers: 6
  warmup_steps: 1000
  iters_per_epoch: 1000

  seed: 42
  output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/minigpt/v2/vqa_pretrain_3B_llama2_7b_chat_stage3_train_linear_lora_test_1030"

  amp: True
  resume_ckpt_path: null

  evaluate: False
  train_splits: ["train"]
  valid_splits: ["val"]
  test_splits: ["test"]

  device: "cuda"
  world_size: 1
  dist_url: "env://"
  distributed: True

  wandb_log: False
  job_name: minigptv2_finetune
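
# Usage sketch (an assumption based on the upstream MiniGPT-4/MiniGPT-v2 codebase,
# not verified against this fork): configs like this are consumed by train.py via
# its --cfg-path flag, launched under torchrun so that dist_url "env://" resolves.
# The config filename below is a hypothetical placeholder.
#
#   torchrun --nproc-per-node 1 train.py --cfg-path path/to/this_config.yaml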