model:
  arch: minigpt_v2
  model_type: pretrain
  max_txt_len: 1024
  image_size: 448
  end_sym: "</s>"
  llama_model: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/llama_2_7b_chat"
  ckpt: "/mnt/pfs-guan-ssai/nlu/wanghanzi/models/minigptv2/checkpoint_stage2.pth"
  use_grad_checkpoint: True
  chat_template: True
  lora_r: 64
  lora_alpha: 16
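# Note on the LoRA settings above: lora_r is the adapter rank and lora_alpha
# the scaling factor. In the upstream MiniGPT-v2 code the adapters are, to the
# best of our knowledge, attached to the LLaMA attention projections
# (q_proj/v_proj by default); which modules get wrapped is decided by the
# model code, not by this config.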
datasets:
  # Trailing numbers in the comments are dataset split sizes; sample_ratio is
  # each dataset's relative weight when the training loader mixes the datasets.
  # multitask_conversation: # in-house data, 12171
  #   batch_size: 2
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 50
  llava_conversation: # 56681
    batch_size: 2
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 30
  unnatural_instruction: # pure text, 65852
    batch_size: 1
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 10
  # refvg: # [refer] returns the location
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 40
  llava_detail: # 23240
    batch_size: 4
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 20
  llava_reason: # 76643
    batch_size: 4
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 80
  # flickr_grounded_caption: # [grounding]: TODO
  #   batch_size: 2
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 80
  # flickr_CaptionToPhrase: # [detection]
  #   batch_size: 2
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 80
  # flickr_ObjectToPhrase: # [detection]
  #   batch_size: 2
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 80
  coco_caption: # 414113 train
    batch_size: 6
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 10
  textcaps_caption: # 109765 train
    batch_size: 6
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 30
  # refcoco:
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 25
  # refcocop:
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 25
  # refcocog:
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 25
  # invrefcoco:
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 10
  # invrefcocop:
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 10
  # invrefcocog:
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 10
  coco_vqa: # 658104
    batch_size: 6
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 15
  ok_vqa: # train, valid (9009, 5046)
    batch_size: 6
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 8
  aok_vqa: # (17056, 1145, 6702)
    batch_size: 6
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 12
  gqa: # (943000, 12578, 12578)
    batch_size: 6
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 50
  ocrvqa: # 207572
    batch_size: 6
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 30

run:
  task: image_text_pretrain
  # optimizer
  lr_sched: "linear_warmup_cosine_lr"
  init_lr: 1e-5
  min_lr: 8e-5
  warmup_lr: 1e-6
  weight_decay: 0.05
  max_epoch: 1
  num_workers: 6
  warmup_steps: 1000
  iters_per_epoch: 1000 # iteration-based runner: 1 epoch x 1000 iters in total
  seed: 42
  output_dir: "/mnt/pfs-guan-ssai/nlu/wanghanzi/experiments/blip2/minigpt/v2/vqa_pretrain_3B_llama2_7b_chat_stage3_train_linear_lora_test_1030"
  amp: True
  resume_ckpt_path: null
  evaluate: False
  train_splits: ["train"]
  valid_splits: ["val"]
  test_splits: ["test"]
  device: "cuda"
  world_size: 1
  dist_url: "env://"
  distributed: True
  wandb_log: True
  job_name: minigptv2_finetune
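# Usage sketch (an assumption based on the upstream MiniGPT-4 repo layout, not
# something stored in this file): save this config under train_configs/ and
# launch with
#   torchrun --nproc_per_node NUM_GPU train.py --cfg-path train_configs/<this_file>.yaml
# With dist_url "env://", the rank and world size are read from the launcher's
# environment variables, so world_size above is only a single-process default.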