model:
  arch: minigpt_v2
  model_type: pretrain
  freeze_vit: True
  freeze_qformer: True
  max_txt_len: 1024
  low_resource: False
  image_size: 448
  end_sym: "</s>"
  llama_model: "/ibex/project/c2133/llama_v2/llama-2-7b-chat-pytorch_update"
  ckpt: "/ibex/project/c2090/minigpt4_ckpt/448_perforamnce_correct_v10_vg/20230925064/checkpoint_32.pth"
  use_grad_checkpoint: True
  chat_template: True
  wanda_logs: False
  lora_r: 64
  lora_alpha: 16

datasets:
  multitask_conversation:
    batch_size: 2
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 40

  # llava_conversation:  # 77k
  #   batch_size: 2
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 10

  # unnatural_instruction:
  #   batch_size: 1
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 15

  # refvg:
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 40

  # llava_detail:  # 23K
  #   batch_size: 4
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 20

  # llava_reason:  # 77k
  #   batch_size: 4
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 80

  # flickr_grounded_caption:
  #   batch_size: 2
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 80

  # flickr_CaptionToPhrase:
  #   batch_size: 2
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 80

  # flickr_ObjectToPhrase:
  #   batch_size: 2
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 80

  # coco_caption:
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 10

  # textcaps_caption:
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 10

  # refcoco:  # 142k
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 15

  # refcocop:
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 15

  # refcocog:
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 15

  # invrefcoco:
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 10

  # invrefcocop:
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 10

  # invrefcocog:
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 10

  # coco_vqa:  # 82K
  #   batch_size: 6
  #   vis_processor:
  #     train:
  #       name: "blip2_image_train"
  #       image_size: 448
  #   text_processor:
  #     train:
  #       name: "blip_caption"
  #   sample_ratio: 15
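# Note on sample_ratio: reading it as the relative sampling weight used when
# the enabled datasets are mixed during training (e.g. a dataset with
# sample_ratio 40 is drawn roughly four times as often as one with
# sample_ratio 10) is an assumption based on the MiniGPT-v2 training setup;
# the field's semantics are not defined in this file.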
"blip_caption" # sample_ratio: 15 # ok_vqa: # 9k # batch_size: 6 # vis_processor: # train: # name: "blip2_image_train" # image_size: 448 # text_processor: # train: # name: "blip_caption" # sample_ratio: 8 # aok_vqa: # 17k # batch_size: 6 # vis_processor: # train: # name: "blip2_image_train" # image_size: 448 # text_processor: # train: # name: "blip_caption" # sample_ratio: 12 # gqa: # 82K # batch_size: 6 # vis_processor: # train: # name: "blip2_image_train" # image_size: 448 # text_processor: # train: # name: "blip_caption" # sample_ratio: 40 # ocrvqa: # 800K # batch_size: 6 # vis_processor: # train: # name: "blip2_image_train" # image_size: 448 # text_processor: # train: # name: "blip_caption" # sample_ratio: 30 run: task: image_text_pretrain # optimizer lr_sched: "linear_warmup_cosine_lr" init_lr: 1e-5 min_lr: 8e-5 warmup_lr: 1e-6 weight_decay: 0.05 max_epoch: 50 num_workers: 6 warmup_steps: 1000 iters_per_epoch: 1000 seed: 42 output_dir: "/ibex/project/c2090/minigpt4_ckpt/448_conversation_correct_best_v7_ablation1_v5_v6_again_system_prompt" amp: True resume_ckpt_path: null evaluate: False train_splits: ["train"] device: "cuda" world_size: 1 dist_url: "env://" distributed: True